diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bdf310fb4b80cb05a8d59296d53245b8ee0447b7 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69e010dcbcf1d511216bf00cba786b9c58136e43 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd697f03eb2153c88e1522fb4c897a970d918d65 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7dbb1c1ed8de54272f332c5ec7748bdcc406e4e Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc 
differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7e781e207661467dc1368beb005ab541c39ae50 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f2c58c3aa9e15e6d80137c233bc565fc2508cb4 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h new file mode 100644 index 0000000000000000000000000000000000000000..96eadad8a8e8c3979b99910ceea41ceaf2c8b58e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h @@ -0,0 +1,891 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. 
Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * This is the public header file for the CUBLAS library, defining the API + * + * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines) + * on top of the CUDA runtime. + */ + +#if !defined(CUBLAS_H_) +#define CUBLAS_H_ + +#if defined(CUBLAS_V2_H_) +#error "It is an error to include both cublas.h and cublas_v2.h" +#endif + +#include + +#ifndef CUBLASWINAPI +#ifdef _WIN32 +#define CUBLASWINAPI __stdcall +#else +#define CUBLASWINAPI +#endif +#endif + +#undef CUBLASAPI +#ifdef __CUDACC__ +#define CUBLASAPI __host__ +#else +#define CUBLASAPI +#endif + +#include "cublas_api.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/* CUBLAS data types */ +#define cublasStatus cublasStatus_t + +cublasStatus CUBLASWINAPI cublasInit(void); +cublasStatus CUBLASWINAPI cublasShutdown(void); +cublasStatus CUBLASWINAPI cublasGetError(void); + +cublasStatus CUBLASWINAPI cublasGetVersion(int* version); +cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void** devicePtr); + +cublasStatus CUBLASWINAPI cublasFree(void* devicePtr); + +cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream); + +/* ---------------- CUBLAS BLAS1 functions ---------------- */ +/* NRM2 */ +float CUBLASWINAPI cublasSnrm2(int n, const float* x, int incx); +double CUBLASWINAPI cublasDnrm2(int n, const double* x, int incx); +float CUBLASWINAPI cublasScnrm2(int n, const cuComplex* x, int incx); +double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex* x, int incx); +/*------------------------------------------------------------------------*/ +/* DOT */ +float CUBLASWINAPI cublasSdot(int n, const float* x, int incx, const float* y, 
int incy); +double CUBLASWINAPI cublasDdot(int n, const double* x, int incx, const double* y, int incy); +cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex* x, int incx, const cuComplex* y, int incy); +cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex* x, int incx, const cuComplex* y, int incy); +cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy); +cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy); +/*------------------------------------------------------------------------*/ +/* SCAL */ +void CUBLASWINAPI cublasSscal(int n, float alpha, float* x, int incx); +void CUBLASWINAPI cublasDscal(int n, double alpha, double* x, int incx); +void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex* x, int incx); +void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex* x, int incx); + +void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex* x, int incx); +void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex* x, int incx); +/*------------------------------------------------------------------------*/ +/* AXPY */ +void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float* x, int incx, float* y, int incy); +void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double* x, int incx, double* y, int incy); +void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex* x, int incx, cuComplex* y, int incy); +void CUBLASWINAPI +cublasZaxpy(int n, cuDoubleComplex alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy); +/*------------------------------------------------------------------------*/ +/* COPY */ +void CUBLASWINAPI cublasScopy(int n, const float* x, int incx, float* y, int incy); +void CUBLASWINAPI cublasDcopy(int n, const double* x, int incx, double* y, int incy); +void CUBLASWINAPI cublasCcopy(int n, const cuComplex* x, int incx, cuComplex* y, 
int incy); +void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy); +/*------------------------------------------------------------------------*/ +/* SWAP */ +void CUBLASWINAPI cublasSswap(int n, float* x, int incx, float* y, int incy); +void CUBLASWINAPI cublasDswap(int n, double* x, int incx, double* y, int incy); +void CUBLASWINAPI cublasCswap(int n, cuComplex* x, int incx, cuComplex* y, int incy); +void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy); +/*------------------------------------------------------------------------*/ +/* AMAX */ +int CUBLASWINAPI cublasIsamax(int n, const float* x, int incx); +int CUBLASWINAPI cublasIdamax(int n, const double* x, int incx); +int CUBLASWINAPI cublasIcamax(int n, const cuComplex* x, int incx); +int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex* x, int incx); +/*------------------------------------------------------------------------*/ +/* AMIN */ +int CUBLASWINAPI cublasIsamin(int n, const float* x, int incx); +int CUBLASWINAPI cublasIdamin(int n, const double* x, int incx); + +int CUBLASWINAPI cublasIcamin(int n, const cuComplex* x, int incx); +int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex* x, int incx); +/*------------------------------------------------------------------------*/ +/* ASUM */ +float CUBLASWINAPI cublasSasum(int n, const float* x, int incx); +double CUBLASWINAPI cublasDasum(int n, const double* x, int incx); +float CUBLASWINAPI cublasScasum(int n, const cuComplex* x, int incx); +double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex* x, int incx); +/*------------------------------------------------------------------------*/ +/* ROT */ +void CUBLASWINAPI cublasSrot(int n, float* x, int incx, float* y, int incy, float sc, float ss); +void CUBLASWINAPI cublasDrot(int n, double* x, int incx, double* y, int incy, double sc, double ss); +void CUBLASWINAPI cublasCrot(int n, cuComplex* x, 
int incx, cuComplex* y, int incy, float c, cuComplex s); +void CUBLASWINAPI +cublasZrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double sc, cuDoubleComplex cs); +void CUBLASWINAPI cublasCsrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, float s); +void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double c, double s); +/*------------------------------------------------------------------------*/ +/* ROTG */ +void CUBLASWINAPI cublasSrotg(float* sa, float* sb, float* sc, float* ss); +void CUBLASWINAPI cublasDrotg(double* sa, double* sb, double* sc, double* ss); +void CUBLASWINAPI cublasCrotg(cuComplex* ca, cuComplex cb, float* sc, cuComplex* cs); +void CUBLASWINAPI cublasZrotg(cuDoubleComplex* ca, cuDoubleComplex cb, double* sc, cuDoubleComplex* cs); +/*------------------------------------------------------------------------*/ +/* ROTM */ +void CUBLASWINAPI cublasSrotm(int n, float* x, int incx, float* y, int incy, const float* sparam); +void CUBLASWINAPI cublasDrotm(int n, double* x, int incx, double* y, int incy, const double* sparam); +/*------------------------------------------------------------------------*/ +/* ROTMG */ +void CUBLASWINAPI cublasSrotmg(float* sd1, float* sd2, float* sx1, const float* sy1, float* sparam); +void CUBLASWINAPI cublasDrotmg(double* sd1, double* sd2, double* sx1, const double* sy1, double* sparam); + +/* --------------- CUBLAS BLAS2 functions ---------------- */ +/* GEMV */ +void CUBLASWINAPI cublasSgemv(char trans, + int m, + int n, + float alpha, + const float* A, + int lda, + const float* x, + int incx, + float beta, + float* y, + int incy); +void CUBLASWINAPI cublasDgemv(char trans, + int m, + int n, + double alpha, + const double* A, + int lda, + const double* x, + int incx, + double beta, + double* y, + int incy); +void CUBLASWINAPI cublasCgemv(char trans, + int m, + int n, + cuComplex alpha, + const cuComplex* A, + int lda, + const 
cuComplex* x, + int incx, + cuComplex beta, + cuComplex* y, + int incy); +void CUBLASWINAPI cublasZgemv(char trans, + int m, + int n, + cuDoubleComplex alpha, + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + cuDoubleComplex beta, + cuDoubleComplex* y, + int incy); +/*------------------------------------------------------------------------*/ +/* GBMV */ +void CUBLASWINAPI cublasSgbmv(char trans, + int m, + int n, + int kl, + int ku, + float alpha, + const float* A, + int lda, + const float* x, + int incx, + float beta, + float* y, + int incy); +void CUBLASWINAPI cublasDgbmv(char trans, + int m, + int n, + int kl, + int ku, + double alpha, + const double* A, + int lda, + const double* x, + int incx, + double beta, + double* y, + int incy); +void CUBLASWINAPI cublasCgbmv(char trans, + int m, + int n, + int kl, + int ku, + cuComplex alpha, + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + cuComplex beta, + cuComplex* y, + int incy); +void CUBLASWINAPI cublasZgbmv(char trans, + int m, + int n, + int kl, + int ku, + cuDoubleComplex alpha, + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + cuDoubleComplex beta, + cuDoubleComplex* y, + int incy); +/*------------------------------------------------------------------------*/ +/* TRMV */ +void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx); +void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx); +void CUBLASWINAPI +cublasCtrmv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx); +void CUBLASWINAPI +cublasZtrmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx); +/*------------------------------------------------------------------------*/ +/* TBMV */ +void CUBLASWINAPI +cublasStbmv(char uplo, char trans, char diag, int n, int 
k, const float* A, int lda, float* x, int incx); +void CUBLASWINAPI +cublasDtbmv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx); +void CUBLASWINAPI +cublasCtbmv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx); +void CUBLASWINAPI cublasZtbmv( + char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx); +/*------------------------------------------------------------------------*/ +/* TPMV */ +void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx); + +void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx); + +void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx); + +void CUBLASWINAPI +cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx); +/*------------------------------------------------------------------------*/ +/* TRSV */ +void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx); + +void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx); + +void CUBLASWINAPI +cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx); + +void CUBLASWINAPI +cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx); +/*------------------------------------------------------------------------*/ +/* TPSV */ +void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx); + +void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx); + +void CUBLASWINAPI cublasCtpsv(char uplo, char trans, 
char diag, int n, const cuComplex* AP, cuComplex* x, int incx); + +void CUBLASWINAPI +cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx); +/*------------------------------------------------------------------------*/ +/* TBSV */ +void CUBLASWINAPI +cublasStbsv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx); + +void CUBLASWINAPI +cublasDtbsv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx); +void CUBLASWINAPI +cublasCtbsv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx); + +void CUBLASWINAPI cublasZtbsv( + char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx); +/*------------------------------------------------------------------------*/ +/* SYMV/HEMV */ +void CUBLASWINAPI cublasSsymv( + char uplo, int n, float alpha, const float* A, int lda, const float* x, int incx, float beta, float* y, int incy); +void CUBLASWINAPI cublasDsymv(char uplo, + int n, + double alpha, + const double* A, + int lda, + const double* x, + int incx, + double beta, + double* y, + int incy); +void CUBLASWINAPI cublasChemv(char uplo, + int n, + cuComplex alpha, + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + cuComplex beta, + cuComplex* y, + int incy); +void CUBLASWINAPI cublasZhemv(char uplo, + int n, + cuDoubleComplex alpha, + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + cuDoubleComplex beta, + cuDoubleComplex* y, + int incy); +/*------------------------------------------------------------------------*/ +/* SBMV/HBMV */ +void CUBLASWINAPI cublasSsbmv(char uplo, + int n, + int k, + float alpha, + const float* A, + int lda, + const float* x, + int incx, + float beta, + float* y, + int incy); +void CUBLASWINAPI cublasDsbmv(char uplo, + int n, + int k, + double alpha, + const double* 
A, + int lda, + const double* x, + int incx, + double beta, + double* y, + int incy); +void CUBLASWINAPI cublasChbmv(char uplo, + int n, + int k, + cuComplex alpha, + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + cuComplex beta, + cuComplex* y, + int incy); +void CUBLASWINAPI cublasZhbmv(char uplo, + int n, + int k, + cuDoubleComplex alpha, + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + cuDoubleComplex beta, + cuDoubleComplex* y, + int incy); +/*------------------------------------------------------------------------*/ +/* SPMV/HPMV */ +void CUBLASWINAPI +cublasSspmv(char uplo, int n, float alpha, const float* AP, const float* x, int incx, float beta, float* y, int incy); +void CUBLASWINAPI cublasDspmv( + char uplo, int n, double alpha, const double* AP, const double* x, int incx, double beta, double* y, int incy); +void CUBLASWINAPI cublasChpmv(char uplo, + int n, + cuComplex alpha, + const cuComplex* AP, + const cuComplex* x, + int incx, + cuComplex beta, + cuComplex* y, + int incy); +void CUBLASWINAPI cublasZhpmv(char uplo, + int n, + cuDoubleComplex alpha, + const cuDoubleComplex* AP, + const cuDoubleComplex* x, + int incx, + cuDoubleComplex beta, + cuDoubleComplex* y, + int incy); + +/*------------------------------------------------------------------------*/ +/* GER */ +void CUBLASWINAPI +cublasSger(int m, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda); +void CUBLASWINAPI +cublasDger(int m, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda); + +void CUBLASWINAPI cublasCgeru( + int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda); +void CUBLASWINAPI cublasCgerc( + int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda); +void CUBLASWINAPI cublasZgeru(int m, + int n, + cuDoubleComplex alpha, + const 
cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* A, + int lda); +void CUBLASWINAPI cublasZgerc(int m, + int n, + cuDoubleComplex alpha, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* A, + int lda); +/*------------------------------------------------------------------------*/ +/* SYR/HER */ +void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float* x, int incx, float* A, int lda); +void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double* x, int incx, double* A, int lda); + +void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* A, int lda); +void CUBLASWINAPI +cublasZher(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda); + +/*------------------------------------------------------------------------*/ +/* SPR/HPR */ +void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float* x, int incx, float* AP); +void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double* x, int incx, double* AP); +void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* AP); +void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP); +/*------------------------------------------------------------------------*/ +/* SYR2/HER2 */ +void CUBLASWINAPI +cublasSsyr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda); +void CUBLASWINAPI +cublasDsyr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda); +void CUBLASWINAPI cublasCher2(char uplo, + int n, + cuComplex alpha, + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* A, + int lda); +void CUBLASWINAPI cublasZher2(char uplo, + int n, + cuDoubleComplex alpha, + const 
cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* A, + int lda); + +/*------------------------------------------------------------------------*/ +/* SPR2/HPR2 */ +void CUBLASWINAPI +cublasSspr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* AP); +void CUBLASWINAPI +cublasDspr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* AP); +void CUBLASWINAPI cublasChpr2( + char uplo, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP); +void CUBLASWINAPI cublasZhpr2(char uplo, + int n, + cuDoubleComplex alpha, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* AP); +/* ------------------------BLAS3 Functions ------------------------------- */ +/* GEMM */ +void CUBLASWINAPI cublasSgemm(char transa, + char transb, + int m, + int n, + int k, + float alpha, + const float* A, + int lda, + const float* B, + int ldb, + float beta, + float* C, + int ldc); +void CUBLASWINAPI cublasDgemm(char transa, + char transb, + int m, + int n, + int k, + double alpha, + const double* A, + int lda, + const double* B, + int ldb, + double beta, + double* C, + int ldc); +void CUBLASWINAPI cublasCgemm(char transa, + char transb, + int m, + int n, + int k, + cuComplex alpha, + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + cuComplex beta, + cuComplex* C, + int ldc); +void CUBLASWINAPI cublasZgemm(char transa, + char transb, + int m, + int n, + int k, + cuDoubleComplex alpha, + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + cuDoubleComplex beta, + cuDoubleComplex* C, + int ldc); +/* -------------------------------------------------------*/ +/* SYRK */ +void CUBLASWINAPI +cublasSsyrk(char uplo, char trans, int n, int k, float alpha, const float* A, int lda, float beta, float* C, int ldc); +void CUBLASWINAPI cublasDsyrk( + char uplo, 
char trans, int n, int k, double alpha, const double* A, int lda, double beta, double* C, int ldc); + +void CUBLASWINAPI cublasCsyrk(char uplo, + char trans, + int n, + int k, + cuComplex alpha, + const cuComplex* A, + int lda, + cuComplex beta, + cuComplex* C, + int ldc); +void CUBLASWINAPI cublasZsyrk(char uplo, + char trans, + int n, + int k, + cuDoubleComplex alpha, + const cuDoubleComplex* A, + int lda, + cuDoubleComplex beta, + cuDoubleComplex* C, + int ldc); +/* ------------------------------------------------------- */ +/* HERK */ +void CUBLASWINAPI cublasCherk( + char uplo, char trans, int n, int k, float alpha, const cuComplex* A, int lda, float beta, cuComplex* C, int ldc); +void CUBLASWINAPI cublasZherk(char uplo, + char trans, + int n, + int k, + double alpha, + const cuDoubleComplex* A, + int lda, + double beta, + cuDoubleComplex* C, + int ldc); +/* ------------------------------------------------------- */ +/* SYR2K */ +void CUBLASWINAPI cublasSsyr2k(char uplo, + char trans, + int n, + int k, + float alpha, + const float* A, + int lda, + const float* B, + int ldb, + float beta, + float* C, + int ldc); + +void CUBLASWINAPI cublasDsyr2k(char uplo, + char trans, + int n, + int k, + double alpha, + const double* A, + int lda, + const double* B, + int ldb, + double beta, + double* C, + int ldc); +void CUBLASWINAPI cublasCsyr2k(char uplo, + char trans, + int n, + int k, + cuComplex alpha, + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + cuComplex beta, + cuComplex* C, + int ldc); + +void CUBLASWINAPI cublasZsyr2k(char uplo, + char trans, + int n, + int k, + cuDoubleComplex alpha, + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + cuDoubleComplex beta, + cuDoubleComplex* C, + int ldc); +/* ------------------------------------------------------- */ +/* HER2K */ +void CUBLASWINAPI cublasCher2k(char uplo, + char trans, + int n, + int k, + cuComplex alpha, + const cuComplex* A, + int lda, + const cuComplex* B, + 
int ldb, + float beta, + cuComplex* C, + int ldc); + +void CUBLASWINAPI cublasZher2k(char uplo, + char trans, + int n, + int k, + cuDoubleComplex alpha, + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + double beta, + cuDoubleComplex* C, + int ldc); + +/*------------------------------------------------------------------------*/ +/* SYMM*/ +void CUBLASWINAPI cublasSsymm(char side, + char uplo, + int m, + int n, + float alpha, + const float* A, + int lda, + const float* B, + int ldb, + float beta, + float* C, + int ldc); +void CUBLASWINAPI cublasDsymm(char side, + char uplo, + int m, + int n, + double alpha, + const double* A, + int lda, + const double* B, + int ldb, + double beta, + double* C, + int ldc); + +void CUBLASWINAPI cublasCsymm(char side, + char uplo, + int m, + int n, + cuComplex alpha, + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + cuComplex beta, + cuComplex* C, + int ldc); + +void CUBLASWINAPI cublasZsymm(char side, + char uplo, + int m, + int n, + cuDoubleComplex alpha, + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + cuDoubleComplex beta, + cuDoubleComplex* C, + int ldc); +/*------------------------------------------------------------------------*/ +/* HEMM*/ +void CUBLASWINAPI cublasChemm(char side, + char uplo, + int m, + int n, + cuComplex alpha, + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + cuComplex beta, + cuComplex* C, + int ldc); +void CUBLASWINAPI cublasZhemm(char side, + char uplo, + int m, + int n, + cuDoubleComplex alpha, + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + cuDoubleComplex beta, + cuDoubleComplex* C, + int ldc); + +/*------------------------------------------------------------------------*/ +/* TRSM*/ +void CUBLASWINAPI cublasStrsm(char side, + char uplo, + char transa, + char diag, + int m, + int n, + float alpha, + const float* A, + int lda, + float* B, + int ldb); + +void CUBLASWINAPI 
cublasDtrsm(char side, + char uplo, + char transa, + char diag, + int m, + int n, + double alpha, + const double* A, + int lda, + double* B, + int ldb); + +void CUBLASWINAPI cublasCtrsm(char side, + char uplo, + char transa, + char diag, + int m, + int n, + cuComplex alpha, + const cuComplex* A, + int lda, + cuComplex* B, + int ldb); + +void CUBLASWINAPI cublasZtrsm(char side, + char uplo, + char transa, + char diag, + int m, + int n, + cuDoubleComplex alpha, + const cuDoubleComplex* A, + int lda, + cuDoubleComplex* B, + int ldb); +/*------------------------------------------------------------------------*/ +/* TRMM*/ +void CUBLASWINAPI cublasStrmm(char side, + char uplo, + char transa, + char diag, + int m, + int n, + float alpha, + const float* A, + int lda, + float* B, + int ldb); +void CUBLASWINAPI cublasDtrmm(char side, + char uplo, + char transa, + char diag, + int m, + int n, + double alpha, + const double* A, + int lda, + double* B, + int ldb); +void CUBLASWINAPI cublasCtrmm(char side, + char uplo, + char transa, + char diag, + int m, + int n, + cuComplex alpha, + const cuComplex* A, + int lda, + cuComplex* B, + int ldb); +void CUBLASWINAPI cublasZtrmm(char side, + char uplo, + char transa, + char diag, + int m, + int n, + cuDoubleComplex alpha, + const cuDoubleComplex* A, + int lda, + cuDoubleComplex* B, + int ldb); + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#endif /* !defined(CUBLAS_H_) */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h new file mode 100644 index 0000000000000000000000000000000000000000..fe0e6f99b952514874c45208e751f5330e71570c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h @@ -0,0 +1,693 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. 
+ * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. 
Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* cublasXt : Host API, Out of Core and Multi-GPU BLAS Library + +*/ + +#if !defined(CUBLAS_XT_H_) +#define CUBLAS_XT_H_ + +#include "driver_types.h" +#include "cuComplex.h" /* import complex data type */ + +#include "cublas_v2.h" + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +struct cublasXtContext; +typedef struct cublasXtContext* cublasXtHandle_t; + +cublasStatus_t CUBLASWINAPI cublasXtCreate(cublasXtHandle_t* handle); +cublasStatus_t CUBLASWINAPI cublasXtDestroy(cublasXtHandle_t handle); +cublasStatus_t CUBLASWINAPI cublasXtGetNumBoards(int nbDevices, int deviceId[], int* nbBoards); +cublasStatus_t CUBLASWINAPI cublasXtMaxBoards(int* nbGpuBoards); +/* This routine selects the Gpus that the user want to use for CUBLAS-XT */ +cublasStatus_t CUBLASWINAPI cublasXtDeviceSelect(cublasXtHandle_t handle, int nbDevices, int deviceId[]); + +/* This routine allows to change the dimension of the tiles ( blockDim x blockDim ) */ +cublasStatus_t CUBLASWINAPI cublasXtSetBlockDim(cublasXtHandle_t handle, int blockDim); +cublasStatus_t CUBLASWINAPI cublasXtGetBlockDim(cublasXtHandle_t handle, int* blockDim); + +typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t; +/* This routine allows to CUBLAS-XT to pin the Host memory if it find out that some of the matrix passed + are not pinned : Pinning/Unpinning the Host memory is still a costly operation + It is better if the user controls the memory on its own (by pinning/unpinning 
oly when necessary) +*/ +cublasStatus_t CUBLASWINAPI cublasXtGetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t* mode); +cublasStatus_t CUBLASWINAPI cublasXtSetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t mode); + +/* This routines is to provide a CPU Blas routines, used for too small sizes or hybrid computation */ +typedef enum { + CUBLASXT_FLOAT = 0, + CUBLASXT_DOUBLE = 1, + CUBLASXT_COMPLEX = 2, + CUBLASXT_DOUBLECOMPLEX = 3, +} cublasXtOpType_t; + +typedef enum { + CUBLASXT_GEMM = 0, + CUBLASXT_SYRK = 1, + CUBLASXT_HERK = 2, + CUBLASXT_SYMM = 3, + CUBLASXT_HEMM = 4, + CUBLASXT_TRSM = 5, + CUBLASXT_SYR2K = 6, + CUBLASXT_HER2K = 7, + + CUBLASXT_SPMM = 8, + CUBLASXT_SYRKX = 9, + CUBLASXT_HERKX = 10, + CUBLASXT_TRMM = 11, + CUBLASXT_ROUTINE_MAX = 12, +} cublasXtBlasOp_t; + +/* Currently only 32-bit integer BLAS routines are supported */ +cublasStatus_t CUBLASWINAPI cublasXtSetCpuRoutine(cublasXtHandle_t handle, + cublasXtBlasOp_t blasOp, + cublasXtOpType_t type, + void* blasFunctor); + +/* Specified the percentage of work that should done by the CPU, default is 0 (no work) */ +cublasStatus_t CUBLASWINAPI cublasXtSetCpuRatio(cublasXtHandle_t handle, + cublasXtBlasOp_t blasOp, + cublasXtOpType_t type, + float ratio); + +/* GEMM */ +cublasStatus_t CUBLASWINAPI cublasXtSgemm(cublasXtHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + size_t m, + size_t n, + size_t k, + const float* alpha, + const float* A, + size_t lda, + const float* B, + size_t ldb, + const float* beta, + float* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtDgemm(cublasXtHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + size_t m, + size_t n, + size_t k, + const double* alpha, + const double* A, + size_t lda, + const double* B, + size_t ldb, + const double* beta, + double* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtCgemm(cublasXtHandle_t handle, + cublasOperation_t transa, + cublasOperation_t 
transb, + size_t m, + size_t n, + size_t k, + const cuComplex* alpha, + const cuComplex* A, + size_t lda, + const cuComplex* B, + size_t ldb, + const cuComplex* beta, + cuComplex* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtZgemm(cublasXtHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + size_t m, + size_t n, + size_t k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* A, + size_t lda, + const cuDoubleComplex* B, + size_t ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* C, + size_t ldc); +/* ------------------------------------------------------- */ +/* SYRK */ +cublasStatus_t CUBLASWINAPI cublasXtSsyrk(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const float* alpha, + const float* A, + size_t lda, + const float* beta, + float* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtDsyrk(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const double* alpha, + const double* A, + size_t lda, + const double* beta, + double* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtCsyrk(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const cuComplex* alpha, + const cuComplex* A, + size_t lda, + const cuComplex* beta, + cuComplex* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtZsyrk(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* A, + size_t lda, + const cuDoubleComplex* beta, + cuDoubleComplex* C, + size_t ldc); +/* -------------------------------------------------------------------- */ +/* HERK */ +cublasStatus_t CUBLASWINAPI cublasXtCherk(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const float* alpha, + const cuComplex* A, + size_t lda, + const float* beta, + cuComplex* C, + 
size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtZherk(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const double* alpha, + const cuDoubleComplex* A, + size_t lda, + const double* beta, + cuDoubleComplex* C, + size_t ldc); +/* -------------------------------------------------------------------- */ +/* SYR2K */ +cublasStatus_t CUBLASWINAPI cublasXtSsyr2k(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const float* alpha, + const float* A, + size_t lda, + const float* B, + size_t ldb, + const float* beta, + float* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtDsyr2k(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const double* alpha, + const double* A, + size_t lda, + const double* B, + size_t ldb, + const double* beta, + double* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtCsyr2k(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const cuComplex* alpha, + const cuComplex* A, + size_t lda, + const cuComplex* B, + size_t ldb, + const cuComplex* beta, + cuComplex* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtZsyr2k(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* A, + size_t lda, + const cuDoubleComplex* B, + size_t ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* C, + size_t ldc); +/* -------------------------------------------------------------------- */ +/* HERKX : variant extension of HERK */ +cublasStatus_t CUBLASWINAPI cublasXtCherkx(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const cuComplex* alpha, + const cuComplex* A, + size_t lda, + const cuComplex* B, + size_t ldb, + const float* beta, + cuComplex* C, + size_t ldc); + +cublasStatus_t 
CUBLASWINAPI cublasXtZherkx(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* A, + size_t lda, + const cuDoubleComplex* B, + size_t ldb, + const double* beta, + cuDoubleComplex* C, + size_t ldc); + +/* -------------------------------------------------------------------- */ +/* TRSM */ +cublasStatus_t CUBLASWINAPI cublasXtStrsm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + size_t m, + size_t n, + const float* alpha, + const float* A, + size_t lda, + float* B, + size_t ldb); + +cublasStatus_t CUBLASWINAPI cublasXtDtrsm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + size_t m, + size_t n, + const double* alpha, + const double* A, + size_t lda, + double* B, + size_t ldb); + +cublasStatus_t CUBLASWINAPI cublasXtCtrsm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + size_t m, + size_t n, + const cuComplex* alpha, + const cuComplex* A, + size_t lda, + cuComplex* B, + size_t ldb); + +cublasStatus_t CUBLASWINAPI cublasXtZtrsm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + size_t m, + size_t n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* A, + size_t lda, + cuDoubleComplex* B, + size_t ldb); +/* -------------------------------------------------------------------- */ +/* SYMM : Symmetric Multiply Matrix*/ +cublasStatus_t CUBLASWINAPI cublasXtSsymm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + size_t m, + size_t n, + const float* alpha, + const float* A, + size_t lda, + const float* B, + size_t ldb, + const float* beta, + float* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtDsymm(cublasXtHandle_t 
handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + size_t m, + size_t n, + const double* alpha, + const double* A, + size_t lda, + const double* B, + size_t ldb, + const double* beta, + double* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtCsymm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + size_t m, + size_t n, + const cuComplex* alpha, + const cuComplex* A, + size_t lda, + const cuComplex* B, + size_t ldb, + const cuComplex* beta, + cuComplex* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtZsymm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + size_t m, + size_t n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* A, + size_t lda, + const cuDoubleComplex* B, + size_t ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* C, + size_t ldc); +/* -------------------------------------------------------------------- */ +/* HEMM : Hermitian Matrix Multiply */ +cublasStatus_t CUBLASWINAPI cublasXtChemm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + size_t m, + size_t n, + const cuComplex* alpha, + const cuComplex* A, + size_t lda, + const cuComplex* B, + size_t ldb, + const cuComplex* beta, + cuComplex* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtZhemm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + size_t m, + size_t n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* A, + size_t lda, + const cuDoubleComplex* B, + size_t ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* C, + size_t ldc); + +/* -------------------------------------------------------------------- */ +/* SYRKX : variant extension of SYRK */ +cublasStatus_t CUBLASWINAPI cublasXtSsyrkx(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const float* alpha, + const float* A, + size_t lda, + const float* B, + size_t ldb, + const float* beta, + float* C, + size_t ldc); + 
+cublasStatus_t CUBLASWINAPI cublasXtDsyrkx(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const double* alpha, + const double* A, + size_t lda, + const double* B, + size_t ldb, + const double* beta, + double* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtCsyrkx(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const cuComplex* alpha, + const cuComplex* A, + size_t lda, + const cuComplex* B, + size_t ldb, + const cuComplex* beta, + cuComplex* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtZsyrkx(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* A, + size_t lda, + const cuDoubleComplex* B, + size_t ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* C, + size_t ldc); +/* -------------------------------------------------------------------- */ +/* HER2K : variant extension of HERK */ +cublasStatus_t CUBLASWINAPI cublasXtCher2k(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const cuComplex* alpha, + const cuComplex* A, + size_t lda, + const cuComplex* B, + size_t ldb, + const float* beta, + cuComplex* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtZher2k(cublasXtHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + size_t n, + size_t k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* A, + size_t lda, + const cuDoubleComplex* B, + size_t ldb, + const double* beta, + cuDoubleComplex* C, + size_t ldc); + +/* -------------------------------------------------------------------- */ +/* SPMM : Symmetric Packed Multiply Matrix*/ +cublasStatus_t CUBLASWINAPI cublasXtSspmm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + size_t m, + size_t n, + const float* alpha, + const float* AP, + const float* B, + size_t ldb, + const 
float* beta, + float* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtDspmm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + size_t m, + size_t n, + const double* alpha, + const double* AP, + const double* B, + size_t ldb, + const double* beta, + double* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtCspmm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + size_t m, + size_t n, + const cuComplex* alpha, + const cuComplex* AP, + const cuComplex* B, + size_t ldb, + const cuComplex* beta, + cuComplex* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtZspmm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + size_t m, + size_t n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* AP, + const cuDoubleComplex* B, + size_t ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* C, + size_t ldc); + +/* -------------------------------------------------------------------- */ +/* TRMM */ +cublasStatus_t CUBLASWINAPI cublasXtStrmm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + size_t m, + size_t n, + const float* alpha, + const float* A, + size_t lda, + const float* B, + size_t ldb, + float* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtDtrmm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + size_t m, + size_t n, + const double* alpha, + const double* A, + size_t lda, + const double* B, + size_t ldb, + double* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI cublasXtCtrmm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + size_t m, + size_t n, + const cuComplex* alpha, + const cuComplex* A, + size_t lda, + const cuComplex* B, + size_t ldb, + cuComplex* C, + size_t ldc); + +cublasStatus_t CUBLASWINAPI 
cublasXtZtrmm(cublasXtHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + size_t m, + size_t n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* A, + size_t lda, + const cuDoubleComplex* B, + size_t ldb, + cuDoubleComplex* C, + size_t ldc); + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#endif /* !defined(CUBLAS_XT_H_) */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h new file mode 100644 index 0000000000000000000000000000000000000000..29ea9153faf7b3e62a6d53c0be1980ae79c49f51 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h @@ -0,0 +1,824 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(NVBLAS_H_) +#define NVBLAS_H_ + +#include "driver_types.h" +#include "cuComplex.h" /* import complex data type */ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* GEMM */ +void sgemm_(const char* transa, + const char* transb, + const int* m, + const int* n, + const int* k, + const float* alpha, + const float* a, + const int* lda, + const float* b, + const int* ldb, + const float* beta, + float* c, + const int* ldc); + +void dgemm_(const char* transa, + const char* transb, + const int* m, + const int* n, + const int* k, + const double* alpha, + const double* a, + const int* lda, + const double* b, + const int* ldb, + const double* beta, + double* c, + const int* ldc); + +void cgemm_(const char* transa, + const char* transb, + const int* m, + const int* n, + const int* k, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + const cuComplex* b, + const int* ldb, + const cuComplex* beta, + cuComplex* c, + const int* ldc); + +void zgemm_(const char* transa, + const char* transb, + const int* m, + const int* n, + const int* k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + const cuDoubleComplex* b, + const int* ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* c, + const int* ldc); + +void sgemm(const char* transa, + const char* transb, + const int* m, + const int* n, + const int* k, + const float* alpha, + const float* a, + const int* lda, + const float* b, + const int* ldb, + const float* beta, + float* c, + const int* ldc); + +void dgemm(const char* transa, + const char* transb, + const int* m, + const int* n, + const int* k, + const double* alpha, + const double* a, + const int* lda, + const double* b, + const int* ldb, + const double* beta, + double* c, + const int* ldc); + +void cgemm(const char* transa, + const char* transb, + const int* m, + const int* n, + const int* k, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + const cuComplex* b, + const int* ldb, + const 
cuComplex* beta, + cuComplex* c, + const int* ldc); + +void zgemm(const char* transa, + const char* transb, + const int* m, + const int* n, + const int* k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + const cuDoubleComplex* b, + const int* ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* c, + const int* ldc); + +/* SYRK */ +void ssyrk_(const char* uplo, + const char* trans, + const int* n, + const int* k, + const float* alpha, + const float* a, + const int* lda, + const float* beta, + float* c, + const int* ldc); + +void dsyrk_(const char* uplo, + const char* trans, + const int* n, + const int* k, + const double* alpha, + const double* a, + const int* lda, + const double* beta, + double* c, + const int* ldc); + +void csyrk_(const char* uplo, + const char* trans, + const int* n, + const int* k, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + const cuComplex* beta, + cuComplex* c, + const int* ldc); + +void zsyrk_(const char* uplo, + const char* trans, + const int* n, + const int* k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + const cuDoubleComplex* beta, + cuDoubleComplex* c, + const int* ldc); + +void ssyrk(const char* uplo, + const char* trans, + const int* n, + const int* k, + const float* alpha, + const float* a, + const int* lda, + const float* beta, + float* c, + const int* ldc); + +void dsyrk(const char* uplo, + const char* trans, + const int* n, + const int* k, + const double* alpha, + const double* a, + const int* lda, + const double* beta, + double* c, + const int* ldc); + +void csyrk(const char* uplo, + const char* trans, + const int* n, + const int* k, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + const cuComplex* beta, + cuComplex* c, + const int* ldc); + +void zsyrk(const char* uplo, + const char* trans, + const int* n, + const int* k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + const 
cuDoubleComplex* beta, + cuDoubleComplex* c, + const int* ldc); + +/* HERK */ +void cherk_(const char* uplo, + const char* trans, + const int* n, + const int* k, + const float* alpha, + const cuComplex* a, + const int* lda, + const float* beta, + cuComplex* c, + const int* ldc); + +void zherk_(const char* uplo, + const char* trans, + const int* n, + const int* k, + const double* alpha, + const cuDoubleComplex* a, + const int* lda, + const double* beta, + cuDoubleComplex* c, + const int* ldc); + +void cherk(const char* uplo, + const char* trans, + const int* n, + const int* k, + const float* alpha, + const cuComplex* a, + const int* lda, + const float* beta, + cuComplex* c, + const int* ldc); + +void zherk(const char* uplo, + const char* trans, + const int* n, + const int* k, + const double* alpha, + const cuDoubleComplex* a, + const int* lda, + const double* beta, + cuDoubleComplex* c, + const int* ldc); + +/* TRSM */ +void strsm_(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const float* alpha, + const float* a, + const int* lda, + float* b, + const int* ldb); + +void dtrsm_(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const double* alpha, + const double* a, + const int* lda, + double* b, + const int* ldb); + +void ctrsm_(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + cuComplex* b, + const int* ldb); + +void ztrsm_(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + cuDoubleComplex* b, + const int* ldb); + +void strsm(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const float* alpha, + const 
float* a, + const int* lda, + float* b, + const int* ldb); + +void dtrsm(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const double* alpha, + const double* a, + const int* lda, + double* b, + const int* ldb); + +void ctrsm(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + cuComplex* b, + const int* ldb); + +void ztrsm(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + cuDoubleComplex* b, + const int* ldb); + +/* SYMM */ +void ssymm_(const char* side, + const char* uplo, + const int* m, + const int* n, + const float* alpha, + const float* a, + const int* lda, + const float* b, + const int* ldb, + const float* beta, + float* c, + const int* ldc); + +void dsymm_(const char* side, + const char* uplo, + const int* m, + const int* n, + const double* alpha, + const double* a, + const int* lda, + const double* b, + const int* ldb, + const double* beta, + double* c, + const int* ldc); + +void csymm_(const char* side, + const char* uplo, + const int* m, + const int* n, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + const cuComplex* b, + const int* ldb, + const cuComplex* beta, + cuComplex* c, + const int* ldc); + +void zsymm_(const char* side, + const char* uplo, + const int* m, + const int* n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + const cuDoubleComplex* b, + const int* ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* c, + const int* ldc); + +void ssymm(const char* side, + const char* uplo, + const int* m, + const int* n, + const float* alpha, + const float* a, + const int* lda, + const float* b, + const int* ldb, + const float* beta, + float* c, + const int* ldc); + +void 
dsymm(const char* side, + const char* uplo, + const int* m, + const int* n, + const double* alpha, + const double* a, + const int* lda, + const double* b, + const int* ldb, + const double* beta, + double* c, + const int* ldc); + +void csymm(const char* side, + const char* uplo, + const int* m, + const int* n, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + const cuComplex* b, + const int* ldb, + const cuComplex* beta, + cuComplex* c, + const int* ldc); + +void zsymm(const char* side, + const char* uplo, + const int* m, + const int* n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + const cuDoubleComplex* b, + const int* ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* c, + const int* ldc); + +/* HEMM */ +void chemm_(const char* side, + const char* uplo, + const int* m, + const int* n, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + const cuComplex* b, + const int* ldb, + const cuComplex* beta, + cuComplex* c, + const int* ldc); + +void zhemm_(const char* side, + const char* uplo, + const int* m, + const int* n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + const cuDoubleComplex* b, + const int* ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* c, + const int* ldc); + +/* HEMM with no underscore*/ +void chemm(const char* side, + const char* uplo, + const int* m, + const int* n, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + const cuComplex* b, + const int* ldb, + const cuComplex* beta, + cuComplex* c, + const int* ldc); + +void zhemm(const char* side, + const char* uplo, + const int* m, + const int* n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + const cuDoubleComplex* b, + const int* ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* c, + const int* ldc); + +/* SYR2K */ +void ssyr2k_(const char* uplo, + const char* trans, + const int* n, + const int* k, + const float* alpha, + const float* 
a, + const int* lda, + const float* b, + const int* ldb, + const float* beta, + float* c, + const int* ldc); + +void dsyr2k_(const char* uplo, + const char* trans, + const int* n, + const int* k, + const double* alpha, + const double* a, + const int* lda, + const double* b, + const int* ldb, + const double* beta, + double* c, + const int* ldc); + +void csyr2k_(const char* uplo, + const char* trans, + const int* n, + const int* k, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + const cuComplex* b, + const int* ldb, + const cuComplex* beta, + cuComplex* c, + const int* ldc); + +void zsyr2k_(const char* uplo, + const char* trans, + const int* n, + const int* k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + const cuDoubleComplex* b, + const int* ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* c, + const int* ldc); + +/* SYR2K no_underscore*/ +void ssyr2k(const char* uplo, + const char* trans, + const int* n, + const int* k, + const float* alpha, + const float* a, + const int* lda, + const float* b, + const int* ldb, + const float* beta, + float* c, + const int* ldc); + +void dsyr2k(const char* uplo, + const char* trans, + const int* n, + const int* k, + const double* alpha, + const double* a, + const int* lda, + const double* b, + const int* ldb, + const double* beta, + double* c, + const int* ldc); + +void csyr2k(const char* uplo, + const char* trans, + const int* n, + const int* k, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + const cuComplex* b, + const int* ldb, + const cuComplex* beta, + cuComplex* c, + const int* ldc); + +void zsyr2k(const char* uplo, + const char* trans, + const int* n, + const int* k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + const cuDoubleComplex* b, + const int* ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* c, + const int* ldc); + +/* HERK */ +void cher2k_(const char* uplo, + const char* trans, + const int* n, + 
const int* k, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + const cuComplex* b, + const int* ldb, + const float* beta, + cuComplex* c, + const int* ldc); + +void zher2k_(const char* uplo, + const char* trans, + const int* n, + const int* k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + const cuDoubleComplex* b, + const int* ldb, + const double* beta, + cuDoubleComplex* c, + const int* ldc); + +/* HER2K with no underscore */ +void cher2k(const char* uplo, + const char* trans, + const int* n, + const int* k, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + const cuComplex* b, + const int* ldb, + const float* beta, + cuComplex* c, + const int* ldc); + +void zher2k(const char* uplo, + const char* trans, + const int* n, + const int* k, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + const cuDoubleComplex* b, + const int* ldb, + const double* beta, + cuDoubleComplex* c, + const int* ldc); + +/* TRMM */ +void strmm_(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const float* alpha, + const float* a, + const int* lda, + float* b, + const int* ldb); + +void dtrmm_(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const double* alpha, + const double* a, + const int* lda, + double* b, + const int* ldb); + +void ctrmm_(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + cuComplex* b, + const int* ldb); + +void ztrmm_(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + cuDoubleComplex* b, + const int* ldb); + +void strmm(const char* side, + const char* uplo, + const char* transa, + 
const char* diag, + const int* m, + const int* n, + const float* alpha, + const float* a, + const int* lda, + float* b, + const int* ldb); + +void dtrmm(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const double* alpha, + const double* a, + const int* lda, + double* b, + const int* ldb); + +void ctrmm(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const cuComplex* alpha, + const cuComplex* a, + const int* lda, + cuComplex* b, + const int* ldb); + +void ztrmm(const char* side, + const char* uplo, + const char* transa, + const char* diag, + const int* m, + const int* n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* a, + const int* lda, + cuDoubleComplex* b, + const int* ldb); + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#endif /* !defined(NVBLAS_H_) */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7ed27cc5337cb2e6617930b9fda168ba6d6f133 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc 
new file mode 100644 index 0000000000000000000000000000000000000000..c41f007fec4575259e842c5049a2e1f82bceca91 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0666acbb2a280d9956f39295f378c6648d3fb90 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h new file mode 100644 index 0000000000000000000000000000000000000000..cbde94d36caaf494432a29e13aa91ca3dc7d5b51 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h @@ -0,0 +1,758 @@ +// +// NVIDIA_COPYRIGHT_BEGIN +// +// Copyright (c) 2014-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. +// +// NVIDIA_COPYRIGHT_END +// + +#ifndef __NVRTC_H__ +#define __NVRTC_H__ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#include + + +/*************************************************************************//** + * + * \defgroup error Error Handling + * + * NVRTC defines the following enumeration type and function for API call + * error handling. + * + ****************************************************************************/ + + +/** + * \ingroup error + * \brief The enumerated type nvrtcResult defines API call result codes. + * NVRTC API functions return nvrtcResult to indicate the call + * result. + */ +typedef enum { + NVRTC_SUCCESS = 0, + NVRTC_ERROR_OUT_OF_MEMORY = 1, + NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, + NVRTC_ERROR_INVALID_INPUT = 3, + NVRTC_ERROR_INVALID_PROGRAM = 4, + NVRTC_ERROR_INVALID_OPTION = 5, + NVRTC_ERROR_COMPILATION = 6, + NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7, + NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, + NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, + NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, + NVRTC_ERROR_INTERNAL_ERROR = 11 +} nvrtcResult; + + +/** + * \ingroup error + * \brief nvrtcGetErrorString is a helper function that returns a string + * describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to + * \c "NVRTC_SUCCESS". + * For unrecognized enumeration values, it returns + * \c "NVRTC_ERROR unknown". + * + * \param [in] result CUDA Runtime Compilation API result code. + * \return Message string for the given #nvrtcResult code. + */ +const char *nvrtcGetErrorString(nvrtcResult result); + + +/*************************************************************************//** + * + * \defgroup query General Information Query + * + * NVRTC defines the following function for general information query. 
+ * + ****************************************************************************/ + + +/** + * \ingroup query + * \brief nvrtcVersion sets the output parameters \p major and \p minor + * with the CUDA Runtime Compilation version number. + * + * \param [out] major CUDA Runtime Compilation major version number. + * \param [out] minor CUDA Runtime Compilation minor version number. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * + */ +nvrtcResult nvrtcVersion(int *major, int *minor); + + +/** + * \ingroup query + * \brief nvrtcGetNumSupportedArchs sets the output parameter \p numArchs + * with the number of architectures supported by NVRTC. This can + * then be used to pass an array to ::nvrtcGetSupportedArchs to + * get the supported architectures. + * + * \param [out] numArchs number of supported architectures. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * + * see ::nvrtcGetSupportedArchs + */ +nvrtcResult nvrtcGetNumSupportedArchs(int* numArchs); + + +/** + * \ingroup query + * \brief nvrtcGetSupportedArchs populates the array passed via the output parameter + * \p supportedArchs with the architectures supported by NVRTC. The array is + * sorted in the ascending order. The size of the array to be passed can be + * determined using ::nvrtcGetNumSupportedArchs. + * + * \param [out] supportedArchs sorted array of supported architectures. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * + * see ::nvrtcGetNumSupportedArchs + */ +nvrtcResult nvrtcGetSupportedArchs(int* supportedArchs); + + +/*************************************************************************//** + * + * \defgroup compilation Compilation + * + * NVRTC defines the following type and functions for actual compilation. 
+ * + ****************************************************************************/ + + +/** + * \ingroup compilation + * \brief nvrtcProgram is the unit of compilation, and an opaque handle for + * a program. + * + * To compile a CUDA program string, an instance of nvrtcProgram must be + * created first with ::nvrtcCreateProgram, then compiled with + * ::nvrtcCompileProgram. + */ +typedef struct _nvrtcProgram *nvrtcProgram; + + +/** + * \ingroup compilation + * \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the + * given input parameters, and sets the output parameter \p prog with + * it. + * + * \param [out] prog CUDA Runtime Compilation program. + * \param [in] src CUDA program source. + * \param [in] name CUDA program name.\n + * \p name can be \c NULL; \c "default_program" is + * used when \p name is \c NULL or "". + * \param [in] numHeaders Number of headers used.\n + * \p numHeaders must be greater than or equal to 0. + * \param [in] headers Sources of the headers.\n + * \p headers can be \c NULL when \p numHeaders is + * 0. + * \param [in] includeNames Name of each header by which they can be + * included in the CUDA program source.\n + * \p includeNames can be \c NULL when \p numHeaders + * is 0. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink + * - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcDestroyProgram + */ +nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, + const char *src, + const char *name, + int numHeaders, + const char * const *headers, + const char * const *includeNames); + + +/** + * \ingroup compilation + * \brief nvrtcDestroyProgram destroys the given program. + * + * \param [in] prog CUDA Runtime Compilation program. 
+ * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcCreateProgram + */ +nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog); + + +/** + * \ingroup compilation + * \brief nvrtcCompileProgram compiles the given program. + * + * \param [in] prog CUDA Runtime Compilation program. + * \param [in] numOptions Number of compiler options passed. + * \param [in] options Compiler options in the form of C string array.\n + * \p options can be \c NULL when \p numOptions is 0. + * + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink + * - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink + * - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink + * + * It supports compile options listed in \ref options. + */ +nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, + int numOptions, const char * const *options); + + +/** + * \ingroup compilation + * \brief nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX + * generated by the previous compilation of \p prog (including the + * trailing \c NULL). + * + * \param [in] prog CUDA Runtime Compilation program. + * \param [out] ptxSizeRet Size of the generated PTX (including the trailing + * \c NULL). + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcGetPTX + */ +nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet); + + +/** + * \ingroup compilation + * \brief nvrtcGetPTX stores the PTX generated by the previous compilation + * of \p prog in the memory pointed by \p ptx. 
+ * + * \param [in] prog CUDA Runtime Compilation program. + * \param [out] ptx Compiled result. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcGetPTXSize + */ +nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx); + + +/** + * \ingroup compilation + * \brief nvrtcGetCUBINSize sets \p cubinSizeRet with the size of the cubin + * generated by the previous compilation of \p prog. The value of + * cubinSizeRet is set to 0 if the value specified to \c -arch is a + * virtual architecture instead of an actual architecture. + * + * \param [in] prog CUDA Runtime Compilation program. + * \param [out] cubinSizeRet Size of the generated cubin. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcGetCUBIN + */ +nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t *cubinSizeRet); + + +/** + * \ingroup compilation + * \brief nvrtcGetCUBIN stores the cubin generated by the previous compilation + * of \p prog in the memory pointed by \p cubin. No cubin is available + * if the value specified to \c -arch is a virtual architecture instead + * of an actual architecture. + * + * \param [in] prog CUDA Runtime Compilation program. + * \param [out] cubin Compiled and assembled result. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcGetCUBINSize + */ +nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin); + +/** + * \ingroup compilation + * \brief nvrtcGetNVVMSize sets \p nvvmSizeRet with the size of the NVVM + * generated by the previous compilation of \p prog. 
The value of + * nvvmSizeRet is set to 0 if the program was not compiled with + * \c -dlto. + * + * \param [in] prog CUDA Runtime Compilation program. + * \param [out] nvvmSizeRet Size of the generated NVVM. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcGetNVVM + */ +nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t *nvvmSizeRet); + + +/** + * \ingroup compilation + * \brief nvrtcGetNVVM stores the NVVM generated by the previous compilation + * of \p prog in the memory pointed by \p nvvm. + * The program must have been compiled with -dlto, + * otherwise will return an error. + * + * \param [in] prog CUDA Runtime Compilation program. + * \param [out] nvvm Compiled result. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcGetNVVMSize + */ +nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char *nvvm); + +/** + * \ingroup compilation + * \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the + * log generated by the previous compilation of \p prog (including the + * trailing \c NULL). + * + * Note that compilation log may be generated with warnings and informative + * messages, even when the compilation of \p prog succeeds. + * + * \param [in] prog CUDA Runtime Compilation program. + * \param [out] logSizeRet Size of the compilation log + * (including the trailing \c NULL). 
+ * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcGetProgramLog + */ +nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet); + + +/** + * \ingroup compilation + * \brief nvrtcGetProgramLog stores the log generated by the previous + * compilation of \p prog in the memory pointed by \p log. + * + * \param [in] prog CUDA Runtime Compilation program. + * \param [out] log Compilation log. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcGetProgramLogSize + */ +nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log); + + +/** + * \ingroup compilation + * \brief nvrtcAddNameExpression notes the given name expression + * denoting the address of a __global__ function + * or __device__/__constant__ variable. + * + * The identical name expression string must be provided on a subsequent + * call to nvrtcGetLoweredName to extract the lowered name. + * \param [in] prog CUDA Runtime Compilation program. + * \param [in] name_expression constant expression denoting the address of + * a __global__ function or __device__/__constant__ variable. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink + * + * \see ::nvrtcGetLoweredName + */ +nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog, + const char * const name_expression); + +/** + * \ingroup compilation + * \brief nvrtcGetLoweredName extracts the lowered (mangled) name + * for a __global__ function or __device__/__constant__ variable, + * and updates *lowered_name to point to it. The memory containing + * the name is released when the NVRTC program is destroyed by + * nvrtcDestroyProgram. 
+ * The identical name expression must have been previously + * provided to nvrtcAddNameExpression. + * + * \param [in] prog CUDA Runtime Compilation program. + * \param [in] name_expression constant expression denoting the address of + * a __global__ function or __device__/__constant__ variable. + * \param [out] lowered_name initialized by the function to point to a + * C string containing the lowered (mangled) + * name corresponding to the provided name expression. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink + * - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink + * + * \see ::nvrtcAddNameExpression + */ +nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog, + const char *const name_expression, + const char** lowered_name); + + +/** + * \defgroup options Supported Compile Options + * + * NVRTC supports the compile options below. + * Option names with two preceding dashs (\c --) are long option names and + * option names with one preceding dash (\c -) are short option names. + * Short option names can be used instead of long option names. + * When a compile option takes an argument, an assignment operator (\c =) + * is used to separate the compile option argument from the compile option + * name, e.g., \c "--gpu-architecture=compute_60". + * Alternatively, the compile option name and the argument can be specified in + * separate strings without an assignment operator, .e.g, + * \c "--gpu-architecture" \c "compute_60". + * Single-character short option names, such as \c -D, \c -U, and \c -I, do + * not require an assignment operator, and the compile option name and the + * argument can be present in the same string with or without spaces between + * them. + * For instance, \c "-D=", \c "-D", and \c "-D " are all + * supported. 
+ * + * The valid compiler options are: + * + * - Compilation targets + * - \c --gpu-architecture=\ (\c -arch)\n + * Specify the name of the class of GPU architectures for which the + * input must be compiled.\n + * - Valid \s: + * - \c compute_35 + * - \c compute_37 + * - \c compute_50 + * - \c compute_52 + * - \c compute_53 + * - \c compute_60 + * - \c compute_61 + * - \c compute_62 + * - \c compute_70 + * - \c compute_72 + * - \c compute_75 + * - \c compute_80 + * - \c compute_87 + * - \c compute_89 + * - \c compute_90 + * - \c sm_35 + * - \c sm_37 + * - \c sm_50 + * - \c sm_52 + * - \c sm_53 + * - \c sm_60 + * - \c sm_61 + * - \c sm_62 + * - \c sm_70 + * - \c sm_72 + * - \c sm_75 + * - \c sm_80 + * - \c sm_87 + * - \c sm_89 + * - \c sm_90 + * - Default: \c compute_52 + * - Separate compilation / whole-program compilation + * - \c --device-c (\c -dc)\n + * Generate relocatable code that can be linked with other relocatable + * device code. It is equivalent to --relocatable-device-code=true. + * - \c --device-w (\c -dw)\n + * Generate non-relocatable code. It is equivalent to + * \c --relocatable-device-code=false. + * - \c --relocatable-device-code={true|false} (\c -rdc)\n + * Enable (disable) the generation of relocatable device code. + * - Default: \c false + * - \c --extensible-whole-program (\c -ewp)\n + * Do extensible whole program compilation of device code. + * - Default: \c false + * - Debugging support + * - \c --device-debug (\c -G)\n + * Generate debug information. If --dopt is not specified, + * then turns off all optimizations. + * - \c --generate-line-info (\c -lineinfo)\n + * Generate line-number information. + * - Code generation + * - \c --dopt on (\c -dopt)\n + * - \c --dopt=on \n + * Enable device code optimization. When specified along with '-G', enables + * limited debug information generation for optimized device code (currently, + * only line number information). + * When '-G' is not specified, '-dopt=on' is implicit. 
+ * - \c --ptxas-options \ (\c -Xptxas)\n + * - \c --ptxas-options=\ \n + * Specify options directly to ptxas, the PTX optimizing assembler. + * - \c --maxrregcount=\ (\c -maxrregcount)\n + * Specify the maximum amount of registers that GPU functions can use. + * Until a function-specific limit, a higher value will generally + * increase the performance of individual GPU threads that execute this + * function. However, because thread registers are allocated from a + * global register pool on each GPU, a higher value of this option will + * also reduce the maximum thread block size, thereby reducing the amount + * of thread parallelism. Hence, a good maxrregcount value is the result + * of a trade-off. If this option is not specified, then no maximum is + * assumed. Value less than the minimum registers required by ABI will + * be bumped up by the compiler to ABI minimum limit. + * - \c --ftz={true|false} (\c -ftz)\n + * When performing single-precision floating-point operations, flush + * denormal values to zero or preserve denormal values. + * \c --use_fast_math implies \c --ftz=true. + * - Default: \c false + * - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n + * For single-precision floating-point square root, use IEEE + * round-to-nearest mode or use a faster approximation. + * \c --use_fast_math implies \c --prec-sqrt=false. + * - Default: \c true + * - \c --prec-div={true|false} (\c -prec-div)\n + * For single-precision floating-point division and reciprocals, use IEEE + * round-to-nearest mode or use a faster approximation. + * \c --use_fast_math implies \c --prec-div=false. + * - Default: \c true + * - \c --fmad={true|false} (\c -fmad)\n + * Enables (disables) the contraction of floating-point multiplies and + * adds/subtracts into floating-point multiply-add operations (FMAD, + * FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true. + * - Default: \c true + * - \c --use_fast_math (\c -use_fast_math)\n + * Make use of fast math operations. 
+ * \c --use_fast_math implies \c --ftz=true \c --prec-div=false + * \c --prec-sqrt=false \c --fmad=true. + * - \c --extra-device-vectorization (\c -extra-device-vectorization)\n + * Enables more aggressive device code vectorization in the NVVM optimizer. + * - \c --modify-stack-limit={true|false} (\c -modify-stack-limit)\n + * On Linux, during compilation, use \c setrlimit() to increase stack size + * to maximum allowed. The limit is reset to the previous value at the + * end of compilation. + * Note: \c setrlimit() changes the value for the entire process. + * - Default: \c true + * - \c --dlink-time-opt (\c -dlto)\n + * Generate intermediate code for later link-time optimization. + * It implies \c -rdc=true. + * Note: when this is used the nvrtcGetNVVM API should be used, + * as PTX or Cubin will not be generated. + * - Preprocessing + * - \c --define-macro=\ (\c -D)\n + * \c \ can be either \c \ or \c \. + * - \c \ \n + * Predefine \c \ as a macro with definition \c 1. + * - \c \=\ \n + * The contents of \c \ are tokenized and preprocessed + * as if they appeared during translation phase three in a \c \#define + * directive. In particular, the definition will be truncated by + * embedded new line characters. + * - \c --undefine-macro=\ (\c -U)\n + * Cancel any previous definition of \c \. + * - \c --include-path=\ (\c -I)\n + * Add the directory \c \ to the list of directories to be + * searched for headers. These paths are searched after the list of + * headers given to ::nvrtcCreateProgram. + * - \c --pre-include=\ (\c -include)\n + * Preinclude \c \ during preprocessing. + * - \c --no-source-include (\c -no-source-include) + * The preprocessor by default adds the directory of each input sources + * to the include path. This option disables this feature and only + * considers the path specified explicitly. 
+ * - Language Dialect + * - \c --std={c++03|c++11|c++14|c++17} + * (\c -std={c++11|c++14|c++17})\n + * Set language dialect to C++03, C++11, C++14 or C++17 + * - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n + * Provide builtin definitions of \c std::move and \c std::forward, + * when C++11 language dialect is selected. + * - Default: \c true + * - \c --builtin-initializer-list={true|false} + * (\c -builtin-initializer-list)\n + * Provide builtin definitions of \c std::initializer_list class and + * member functions when C++11 language dialect is selected. + * - Default: \c true + * - Misc. + * - \c --disable-warnings (\c -w)\n + * Inhibit all warning messages. + * - \c --restrict (\c -restrict)\n + * Programmer assertion that all kernel pointer parameters are restrict + * pointers. + * - \c --device-as-default-execution-space + * (\c -default-device)\n + * Treat entities with no execution space annotation as \c __device__ + * entities. + * - \c --device-int128 (\c -device-int128)\n + * Allow the \c __int128 type in device code. Also causes the macro \c __CUDACC_RTC_INT128__ + * to be defined. + * - \c --optimization-info=\ (\c -opt-info)\n + * Provide optimization reports for the specified kind of optimization. + * The following kind tags are supported: + * - \c inline : emit a remark when a function is inlined. + * - \c --version-ident={true|false} (\c -dQ)\n + * Embed used compiler's version info into generated PTX/CUBIN + * - Default: \c false + * - \c --display-error-number (\c -err-no)\n + * Display diagnostic number for warning messages. (Default) + * - \c --no-display-error-number (\c -no-err-no)\n + * Disables the display of a diagnostic number for warning messages. + * - \c --diag-error=,... (\c -diag-error)\n + * Emit error for specified diagnostic message number(s). Message numbers can be separated by comma. + * - \c --diag-suppress=,... (\c -diag-suppress)\n + * Suppress specified diagnostic message number(s). 
Message numbers can be separated by comma. + * - \c --diag-warn=,... (\c -diag-warn)\n + * Emit warning for specified diagnostic message number(s). Message numbers can be separated by comma. + * + */ + + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + + +/* The utility function 'nvrtcGetTypeName' is not available by default. Define + the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available. +*/ + +#if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__ + +#if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__ +#include +#include + +#elif defined(_WIN32) +#include +#include +#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */ + + +#include +#include + +template struct __nvrtcGetTypeName_helper_t { }; + +/*************************************************************************//** + * + * \defgroup hosthelper Host Helper + * + * NVRTC defines the following functions for easier interaction with host code. + * + ****************************************************************************/ + +/** + * \ingroup hosthelper + * \brief nvrtcGetTypeName stores the source level name of a type in the given + * std::string location. + * + * This function is only provided when the macro NVRTC_GET_TYPE_NAME is + * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName + * function calls to extract the type name, when using gcc/clang or cl.exe compilers, + * respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR, + * otherwise *result is initialized with the extracted name. + * + * Windows-specific notes: + * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(), + * which is not multi-thread safe. + * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl. + * + * \param [in] tinfo: reference to object of type std::type_info for a given type. + * \param [in] result: pointer to std::string in which to store the type name. 
+ * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink + * + */ +inline nvrtcResult nvrtcGetTypeName(const std::type_info &tinfo, std::string *result) +{ +#if USE_CXXABI || __clang__ || __GNUC__ + const char *name = tinfo.name(); + int status; + char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status); + if (status == 0) { + *result = undecorated_name; + free(undecorated_name); + return NVRTC_SUCCESS; + } +#elif defined(_WIN32) + const char *name = tinfo.raw_name(); + if (!name || *name != '.') { + return NVRTC_ERROR_INTERNAL_ERROR; + } + char undecorated_name[4096]; + //name+1 skips over the '.' prefix + if(UnDecorateSymbolName(name+1, undecorated_name, + sizeof(undecorated_name) / sizeof(*undecorated_name), + //note: doesn't seem to work correctly without UNDNAME_NO_ARGUMENTS. + UNDNAME_NO_ARGUMENTS | UNDNAME_NAME_ONLY ) ) { + *result = undecorated_name; + return NVRTC_SUCCESS; + } +#endif /* USE_CXXABI || __clang__ || __GNUC__ */ + + return NVRTC_ERROR_INTERNAL_ERROR; +} + +/** + * \ingroup hosthelper + * \brief nvrtcGetTypeName stores the source level name of the template type argument + * T in the given std::string location. + * + * This function is only provided when the macro NVRTC_GET_TYPE_NAME is + * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName + * function calls to extract the type name, when using gcc/clang or cl.exe compilers, + * respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR, + * otherwise *result is initialized with the extracted name. + * + * Windows-specific notes: + * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(), + * which is not multi-thread safe. + * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl. + * + * \param [in] result: pointer to std::string in which to store the type name. 
+ * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink + * + */ + +template +nvrtcResult nvrtcGetTypeName(std::string *result) +{ + nvrtcResult res = nvrtcGetTypeName(typeid(__nvrtcGetTypeName_helper_t), + result); + if (res != NVRTC_SUCCESS) + return res; + + std::string repr = *result; + std::size_t idx = repr.find("__nvrtcGetTypeName_helper_t"); + idx = (idx != std::string::npos) ? repr.find("<", idx) : idx; + std::size_t last_idx = repr.find_last_of('>'); + if (idx == std::string::npos || last_idx == std::string::npos) { + return NVRTC_ERROR_INTERNAL_ERROR; + } + ++idx; + *result = repr.substr(idx, last_idx - idx); + return NVRTC_SUCCESS; +} + +#endif /* NVRTC_GET_TYPE_NAME */ + +#endif /* __NVRTC_H__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0177739620fd6bc1c2b90e536157e819967b9163 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h new file mode 100644 index 0000000000000000000000000000000000000000..7b167111b0b387a5279da6749d946560e1c42c1b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h @@ -0,0 +1,348 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. 
+ * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. 
Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(CU_COMPLEX_H_) +#define CU_COMPLEX_H_ + +#if !defined(__CUDACC_RTC__) +#if defined(__GNUC__) +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))) +#pragma GCC diagnostic ignored "-Wunused-function" +#endif +#endif +#endif + +/* When trying to include C header file in C++ Code extern "C" is required + * But the Standard QNX headers already have ifdef extern in them when compiling C++ Code + * extern "C" cannot be nested + * Hence keep the header out of extern "C" block + */ + +#if !defined(__CUDACC__) +#include /* import fabsf, sqrt */ +#endif /* !defined(__CUDACC__) */ + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +#include "vector_types.h" + +typedef float2 cuFloatComplex; + +__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x) +{ + return x.x; +} + +__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x) +{ + return x.y; +} + +__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex + (float r, float i) +{ + cuFloatComplex res; + res.x = r; + res.y = i; + return res; +} + +__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x) +{ + return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x)); +} +__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x, + cuFloatComplex y) +{ + return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y), + cuCimagf(x) + cuCimagf(y)); +} + +__host__ __device__ static 
__inline__ cuFloatComplex cuCsubf (cuFloatComplex x, + cuFloatComplex y) +{ + return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y), + cuCimagf(x) - cuCimagf(y)); +} + +/* This implementation could suffer from intermediate overflow even though + * the final result would be in range. However, various implementations do + * not guard against this (presumably to avoid losing performance), so we + * don't do it either to stay competitive. + */ +__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x, + cuFloatComplex y) +{ + cuFloatComplex prod; + prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) - + (cuCimagf(x) * cuCimagf(y)), + (cuCrealf(x) * cuCimagf(y)) + + (cuCimagf(x) * cuCrealf(y))); + return prod; +} + +/* This implementation guards against intermediate underflow and overflow + * by scaling. Such guarded implementations are usually the default for + * complex library implementations, with some also offering an unguarded, + * faster version. + */ +__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x, + cuFloatComplex y) +{ + cuFloatComplex quot; + float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y)); + float oos = 1.0f / s; + float ars = cuCrealf(x) * oos; + float ais = cuCimagf(x) * oos; + float brs = cuCrealf(y) * oos; + float bis = cuCimagf(y) * oos; + s = (brs * brs) + (bis * bis); + oos = 1.0f / s; + quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos, + ((ais * brs) - (ars * bis)) * oos); + return quot; +} + +/* + * We would like to call hypotf(), but it's not available on all platforms. + * This discrete implementation guards against intermediate underflow and + * overflow by scaling. Otherwise we would lose half the exponent range. + * There are various ways of doing guarded computation. For now chose the + * simplest and fastest solution, however this may suffer from inaccuracies + * if sqrt and division are not IEEE compliant. 
+ */ +__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x) +{ + float a = cuCrealf(x); + float b = cuCimagf(x); + float v, w, t; + a = fabsf(a); + b = fabsf(b); + if (a > b) { + v = a; + w = b; + } else { + v = b; + w = a; + } + t = w / v; + t = 1.0f + t * t; + t = v * sqrtf(t); + if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) { + t = v + w; + } + return t; +} + +/* Double precision */ +typedef double2 cuDoubleComplex; + +__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x) +{ + return x.x; +} + +__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x) +{ + return x.y; +} + +__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex + (double r, double i) +{ + cuDoubleComplex res; + res.x = r; + res.y = i; + return res; +} + +__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x) +{ + return make_cuDoubleComplex (cuCreal(x), -cuCimag(x)); +} + +__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x, + cuDoubleComplex y) +{ + return make_cuDoubleComplex (cuCreal(x) + cuCreal(y), + cuCimag(x) + cuCimag(y)); +} + +__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x, + cuDoubleComplex y) +{ + return make_cuDoubleComplex (cuCreal(x) - cuCreal(y), + cuCimag(x) - cuCimag(y)); +} + +/* This implementation could suffer from intermediate overflow even though + * the final result would be in range. However, various implementations do + * not guard against this (presumably to avoid losing performance), so we + * don't do it either to stay competitive. 
+ */ +__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x, + cuDoubleComplex y) +{ + cuDoubleComplex prod; + prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) - + (cuCimag(x) * cuCimag(y)), + (cuCreal(x) * cuCimag(y)) + + (cuCimag(x) * cuCreal(y))); + return prod; +} + +/* This implementation guards against intermediate underflow and overflow + * by scaling. Such guarded implementations are usually the default for + * complex library implementations, with some also offering an unguarded, + * faster version. + */ +__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x, + cuDoubleComplex y) +{ + cuDoubleComplex quot; + double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y))); + double oos = 1.0 / s; + double ars = cuCreal(x) * oos; + double ais = cuCimag(x) * oos; + double brs = cuCreal(y) * oos; + double bis = cuCimag(y) * oos; + s = (brs * brs) + (bis * bis); + oos = 1.0 / s; + quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos, + ((ais * brs) - (ars * bis)) * oos); + return quot; +} + +/* This implementation guards against intermediate underflow and overflow + * by scaling. Otherwise we would lose half the exponent range. There are + * various ways of doing guarded computation. For now chose the simplest + * and fastest solution, however this may suffer from inaccuracies if sqrt + * and division are not IEEE compliant. 
+ */ +__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x) +{ + double a = cuCreal(x); + double b = cuCimag(x); + double v, w, t; + a = fabs(a); + b = fabs(b); + if (a > b) { + v = a; + w = b; + } else { + v = b; + w = a; + } + t = w / v; + t = 1.0 + t * t; + t = v * sqrt(t); + if ((v == 0.0) || + (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) { + t = v + w; + } + return t; +} + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +/* aliases */ +typedef cuFloatComplex cuComplex; +__host__ __device__ static __inline__ cuComplex make_cuComplex (float x, + float y) +{ + return make_cuFloatComplex (x, y); +} + +/* float-to-double promotion */ +__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble + (cuFloatComplex c) +{ + return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c)); +} + +__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat +(cuDoubleComplex c) +{ + return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c)); +} + + +__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d) +{ + float real_res; + float imag_res; + + real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d); + imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d); + + real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res; + imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res; + + return make_cuComplex(real_res, imag_res); +} + +__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d) +{ + double real_res; + double imag_res; + + real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d); + imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d); + + real_res = -(cuCimag(x) * cuCimag(y)) + real_res; + imag_res = (cuCimag(x) * cuCreal(y)) + imag_res; + + return make_cuDoubleComplex(real_res, imag_res); +} + +#endif /* !defined(CU_COMPLEX_H_) */ diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h new file mode 100644 index 0000000000000000000000000000000000000000..3a7fe8a370330454f8a49e083899a50f7dc527ce --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h @@ -0,0 +1,227 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDA_AWBARRIER_H_ +# define _CUDA_AWBARRIER_H_ + +# include "cuda_awbarrier_primitives.h" + +# if !defined(_CUDA_AWBARRIER_SM_TARGET) +# error This file requires compute capability 7.0 or greater. +# endif + +# if !defined(_CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER) +# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \ + -std=c++11 compiler option. 
+# endif + +_CUDA_AWBARRIER_BEGIN_NAMESPACE + +class awbarrier { +public: + class arrival_token { + public: + arrival_token() = default; + ~arrival_token() = default; + _CUDA_AWBARRIER_QUALIFIER uint32_t pending_count() const; + private: + _CUDA_AWBARRIER_QUALIFIER arrival_token(uint64_t token); + uint64_t token; + friend awbarrier; + }; + awbarrier() = default; + awbarrier(const awbarrier&) = delete; + awbarrier& operator=(const awbarrier&) = delete; + ~awbarrier() = default; + + _CUDA_AWBARRIER_QUALIFIER arrival_token arrive(); + _CUDA_AWBARRIER_QUALIFIER arrival_token arrive_and_drop(); + _CUDA_AWBARRIER_QUALIFIER bool timed_wait(arrival_token token, uint32_t hint_cycles); + _CUDA_AWBARRIER_QUALIFIER void wait(arrival_token token); + _CUDA_AWBARRIER_QUALIFIER void arrive_and_wait(); + _CUDA_AWBARRIER_STATIC_QUALIFIER __host__ constexpr uint32_t max(); +private: + uint64_t barrier; + friend _CUDA_AWBARRIER_QUALIFIER void init(awbarrier* barrier, uint32_t expected_count); + friend _CUDA_AWBARRIER_QUALIFIER void inval(awbarrier* barrier); + friend class pipeline; +}; + +_CUDA_AWBARRIER_QUALIFIER +uint32_t awbarrier::arrival_token::pending_count() const +{ + const uint32_t pending_count = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(this->token); +#if (__CUDA_ARCH__ >= 900) + return pending_count; +#else + return (pending_count >> 15); +#endif +} + +_CUDA_AWBARRIER_QUALIFIER +awbarrier::arrival_token::arrival_token(uint64_t token) + : token(token) +{ +} + +_CUDA_AWBARRIER_QUALIFIER +void init(awbarrier* barrier, uint32_t expected_count) +{ + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT); + +#if (__CUDA_ARCH__ >= 900) + const uint32_t init_count = expected_count; +#else + const uint32_t init_count = (expected_count << 15) + expected_count; +#endif + + _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(&barrier->barrier, init_count); +} + 
+_CUDA_AWBARRIER_QUALIFIER +void inval(awbarrier* barrier) +{ + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + + _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(&barrier->barrier); +} + +_CUDA_AWBARRIER_QUALIFIER +awbarrier::arrival_token awbarrier::arrive() +{ + _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier)); + + #if (__CUDA_ARCH__ < 900) + const uint32_t arrive_count = 1 << 15; + const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete(&this->barrier, arrive_count); + (void) +#else + const uint64_t token = + #endif + _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop(&this->barrier); + + return arrival_token(token); +} + +_CUDA_AWBARRIER_QUALIFIER +awbarrier::arrival_token awbarrier::arrive_and_drop() +{ + _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier)); + + #if (__CUDA_ARCH__ < 900) + const uint32_t arrive_count = 1 << 15; + const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete(&this->barrier, arrive_count); + (void) +#else + const uint64_t token = + #endif + _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop(&this->barrier); + + return arrival_token(token); +} + +_CUDA_AWBARRIER_QUALIFIER +bool awbarrier::timed_wait(arrival_token token, uint32_t hint_cycles) +{ + constexpr uint64_t max_busy_wait_cycles = 1024; + constexpr uint32_t max_sleep_ns = 1 << 20; + + _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier)); + + if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) { + return true; + } + + uint64_t start_cycles = clock64(); + uint64_t elapsed_cycles = 0; + uint32_t sleep_ns = 32; + while (elapsed_cycles < hint_cycles) { + if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) { + return true; + } + + if (elapsed_cycles > max_busy_wait_cycles) { + __nanosleep(sleep_ns); + if (sleep_ns < max_sleep_ns) { + sleep_ns *= 2; + } + } + + elapsed_cycles = clock64() - start_cycles; + } + + return 
false; +} + +_CUDA_AWBARRIER_QUALIFIER +void awbarrier::wait(arrival_token token) +{ + _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier)); + + while (!timed_wait(token, ~0u)); +} + +_CUDA_AWBARRIER_QUALIFIER +void awbarrier::arrive_and_wait() +{ + _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier)); + + this->wait(this->arrive()); +} + +_CUDA_AWBARRIER_QUALIFIER __host__ +constexpr uint32_t awbarrier::max() +{ + return _CUDA_AWBARRIER_MAX_COUNT; +} + +_CUDA_AWBARRIER_END_NAMESPACE + +#endif /* !_CUDA_AWBARRIER_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..a112fea7830daf2934afff4aa6c14f0787a9f161 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h @@ -0,0 +1,350 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. 
+ * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#ifndef _CUDA_AWBARRIER_HELPERS_H_ +#define _CUDA_AWBARRIER_HELPERS_H_ + +#define _CUDA_AWBARRIER_NAMESPACE nvcuda::experimental +#define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental { +#define _CUDA_AWBARRIER_END_NAMESPACE } } + +#define _CUDA_AWBARRIER_INTERNAL_NAMESPACE _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal +#define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal { +#define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE } _CUDA_AWBARRIER_END_NAMESPACE + +# if !defined(_CUDA_AWBARRIER_QUALIFIER) +# define _CUDA_AWBARRIER_QUALIFIER inline __device__ +# endif +# if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER) +# define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__ +#endif + +#if defined(__CUDA_ARCH__) +#if (__CUDA_ARCH__ >= 800) +# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_80 +#elif (__CUDA_ARCH__ >= 700) +# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70 +#endif // No support < 700 +#else // !defined(__CUDA_ARCH__) +# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70 +#endif // defined(__CUDA_ARCH__) + +#define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1) + +#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900))) +# define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER +#endif + +#if !defined(_CUDA_AWBARRIER_DEBUG) +# if defined(__CUDACC_DEBUG__) +# define _CUDA_AWBARRIER_DEBUG 1 +# else +# define _CUDA_AWBARRIER_DEBUG 0 +# endif +#endif + +#if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG) +# if !defined(__CUDACC_RTC__) +# include +# endif +# define _CUDA_AWBARRIER_ASSERT(x) assert((x)); +# define _CUDA_AWBARRIER_ABORT() assert(0); +#else +# define _CUDA_AWBARRIER_ASSERT(x) +# define _CUDA_AWBARRIER_ABORT() __trap(); +#endif + +#if defined(__CUDACC_RTC__) +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +typedef 
uint64_t uintptr_t; +#else +# include +#endif + +#if defined(_CUDA_AWBARRIER_SM_TARGET) + +typedef uint64_t __mbarrier_t; +typedef uint64_t __mbarrier_token_t; + +_CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE + +extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *); + +namespace _CUDA_AWBARRIER_SM_70 { + union AWBarrier { + struct { + uint32_t expected; + uint32_t pending; + } split; + uint64_t raw; + }; + + _CUDA_AWBARRIER_STATIC_QUALIFIER + void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29)); + + AWBarrier* awbarrier = reinterpret_cast(barrier); + + awbarrier->split.expected = 0x40000000 - expected_count; + awbarrier->split.pending = 0x80000000 - expected_count; + } + + _CUDA_AWBARRIER_STATIC_QUALIFIER + void __awbarrier_inval(uint64_t* barrier) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + } + + _CUDA_AWBARRIER_STATIC_QUALIFIER + uint32_t __awbarrier_token_pending_count(uint64_t token) { + const uint32_t pending = token >> 32; + return 0x80000000 - (pending & 0x7fffffff); + } + + template + _CUDA_AWBARRIER_STATIC_QUALIFIER + uint64_t __awbarrier_arrive_drop(uint64_t* barrier) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + + AWBarrier* awbarrier = reinterpret_cast(barrier); + + while ((*reinterpret_cast(&awbarrier->split.pending) & 0x7fffffff) == 0); + + if (_Drop) { + (void)atomicAdd_block(&awbarrier->split.expected, 1); + } + + __threadfence_block(); + + const uint32_t old_pending = atomicAdd_block(&awbarrier->split.pending, 1); + const uint32_t new_pending = old_pending + 1; + const bool reset = (old_pending ^ new_pending) & 0x80000000; + + if (reset) { + __threadfence_block(); + + uint32_t new_expected = *reinterpret_cast(&awbarrier->split.expected); + new_expected &= ~0x40000000; + if (new_expected & 0x20000000) { + new_expected |= 0x40000000; + } + atomicAdd_block(&awbarrier->split.pending, new_expected); + } 
+ + return static_cast(old_pending) << 32; + } + + template + _CUDA_AWBARRIER_STATIC_QUALIFIER + uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29)); + + AWBarrier* awbarrier = reinterpret_cast(barrier); + + while ((*reinterpret_cast(&awbarrier->split.pending) & 0x7fffffff) == 0); + + if (_Drop) { + (void)atomicAdd_block(&awbarrier->split.expected, count); + } + + return static_cast(atomicAdd_block(&awbarrier->split.pending, count)) << 32; + } + + _CUDA_AWBARRIER_STATIC_QUALIFIER + bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + + volatile AWBarrier* awbarrier = reinterpret_cast(barrier); + + return ((token >> 32) ^ awbarrier->split.pending) & 0x80000000; + } +}; // namespace _CUDA_AWBARRIER_SM_70 + +namespace _CUDA_AWBARRIER_SM_80 { + _CUDA_AWBARRIER_STATIC_QUALIFIER + void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29)); + + asm volatile ("mbarrier.init.shared.b64 [%0], %1;" + : + : "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count) + : "memory"); + } + + _CUDA_AWBARRIER_STATIC_QUALIFIER + void __awbarrier_inval(uint64_t* barrier) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + + asm volatile ("mbarrier.inval.shared.b64 [%0];" + : + : "r"(__nvvm_get_smem_pointer(barrier)) + : "memory"); + } + + _CUDA_AWBARRIER_STATIC_QUALIFIER + uint32_t __awbarrier_token_pending_count(uint64_t token) { + uint32_t __pending_count; + + asm ("mbarrier.pending_count.b64 %0, %1;" + : "=r"(__pending_count) + : "l"(token)); + return __pending_count; + } + + template + _CUDA_AWBARRIER_STATIC_QUALIFIER + uint64_t __awbarrier_arrive_drop(uint64_t* barrier) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + + uint64_t token; + + if (_Drop) { + asm 
volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];" + : "=l"(token) + : "r"(__nvvm_get_smem_pointer(barrier)) + : "memory"); + } else { + asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];" + : "=l"(token) + : "r"(__nvvm_get_smem_pointer(barrier)) + : "memory"); + } + + return token; + } + + template + _CUDA_AWBARRIER_STATIC_QUALIFIER + uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29)); + + uint64_t token; + + if (_Drop) { + asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;" + : "=l"(token) + : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count) + : "memory"); + } else { + asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;" + : "=l"(token) + : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count) + : "memory"); + } + + return token; + } + + _CUDA_AWBARRIER_STATIC_QUALIFIER + bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + + uint16_t __wait_complete; + + asm volatile ("{" + " .reg .pred %%p;" + " mbarrier.test_wait.shared.b64 %%p, [%1], %2;" + " selp.u16 %0, 1, 0, %%p;" + "}" + : "=h"(__wait_complete) + : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token) + : "memory"); + return bool(__wait_complete); + } + +}; // namespace _CUDA_AWBARRIER_SM_80 + +_CUDA_AWBARRIER_QUALIFIER +void awbarrier_init(uint64_t* barrier, uint32_t expected_count) +{ + _CUDA_AWBARRIER_SM_TARGET::__awbarrier_init(barrier, expected_count); +} + +_CUDA_AWBARRIER_QUALIFIER +void awbarrier_inval(uint64_t* barrier) +{ + _CUDA_AWBARRIER_SM_TARGET::__awbarrier_inval(barrier); +} + +_CUDA_AWBARRIER_QUALIFIER +uint32_t awbarrier_token_pending_count(uint64_t token) +{ + return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_token_pending_count(token); +} + +template +_CUDA_AWBARRIER_QUALIFIER +uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t arrive_count) +{ + 
return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop_no_complete<_Drop>(barrier, arrive_count); +} + +template +_CUDA_AWBARRIER_QUALIFIER +uint64_t awbarrier_arrive_drop(uint64_t* barrier) +{ + return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop<_Drop>(barrier); +} + +_CUDA_AWBARRIER_QUALIFIER +bool awbarrier_test_wait(uint64_t* barrier, uint64_t token) +{ + return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_test_wait(barrier, token); +} + +_CUDA_AWBARRIER_END_INTERNAL_NAMESPACE + +#endif /* defined(_CUDA_AWBARRIER_SM_TARGET) */ + +#endif /* !_CUDA_AWBARRIER_HELPERS_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h new file mode 100644 index 0000000000000000000000000000000000000000..647110a3351477cdd8a197f53c5877648964ab8e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h @@ -0,0 +1,94 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. 
+ * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDA_AWBARRIER_PRIMITIVES_H_ +#define _CUDA_AWBARRIER_PRIMITIVES_H_ + +#include "cuda_awbarrier_helpers.h" + +#if !defined(_CUDA_AWBARRIER_SM_TARGET) +# error This file requires compute capability 7.0 or greater. 
+#endif + +_CUDA_AWBARRIER_STATIC_QUALIFIER __host__ +uint32_t __mbarrier_maximum_count() { + return _CUDA_AWBARRIER_MAX_COUNT; +} + +_CUDA_AWBARRIER_STATIC_QUALIFIER +void __mbarrier_init(__mbarrier_t* barrier, uint32_t expected_count) { + _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(barrier, expected_count); +} + +_CUDA_AWBARRIER_STATIC_QUALIFIER +void __mbarrier_inval(__mbarrier_t* barrier) { + _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(barrier); +} + +_CUDA_AWBARRIER_STATIC_QUALIFIER +__mbarrier_token_t __mbarrier_arrive(__mbarrier_t* barrier) { + return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop(barrier); +} + +_CUDA_AWBARRIER_STATIC_QUALIFIER +__mbarrier_token_t __mbarrier_arrive_and_drop(__mbarrier_t* barrier) { + return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop(barrier); +} + +_CUDA_AWBARRIER_STATIC_QUALIFIER +bool __mbarrier_test_wait(__mbarrier_t* barrier, __mbarrier_token_t token) { + return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(barrier, token); +} + +_CUDA_AWBARRIER_STATIC_QUALIFIER +uint32_t __mbarrier_token_pending_count(__mbarrier_token_t token) { + return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(token); +} + +#endif /* !_CUDA_AWBARRIER_PRIMITIVES_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1e3858dc86d7a42270edff1c4ba801007494a38b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp @@ -0,0 +1,2683 @@ +/* +* Copyright 1993-2022 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. 
and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. 
Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +#if !defined(__CUDA_BF16_HPP__) +#define __CUDA_BF16_HPP__ + +#if !defined(__CUDA_BF16_H__) +#error "Do not include this file directly. Instead, include cuda_bf16.h." +#endif + +#if !defined(_MSC_VER) && __cplusplus >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#endif + +/* C++11 header for std::move. + * In RTC mode, std::move is provided implicitly; don't include the header + */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) +#include +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */ + +/* C++ header for std::memcpy (used for type punning in host-side implementations). + * When compiling as a CUDA source file memcpy is provided implicitly. + * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). 
+ */ +#if defined(__cplusplus) && !defined(__CUDACC__) +#include +#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ + + +/* Set up function decorations */ +#if defined(__CUDACC__) +#define __CUDA_BF16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__ +#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ +#define __CUDA_HOSTDEVICE__ __host__ __device__ +#else /* !defined(__CUDACC__) */ +#if defined(__GNUC__) +#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused)) +#else +#define __CUDA_HOSTDEVICE_BF16_DECL__ static +#endif /* defined(__GNUC__) */ +#define __CUDA_HOSTDEVICE__ +#endif /* defined(__CUDACC_) */ + +/* Set up structure-alignment attribute */ +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/ +#if defined(__GNUC__) +#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ +#endif /* defined(__CUDACC__) */ + +/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */ +#define __BFLOAT16_TO_US(var) *(reinterpret_cast(&(var))) +#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast(&(var))) +#define __BFLOAT162_TO_UI(var) *(reinterpret_cast(&(var))) +#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast(&(var))) + +/** +* Types which allow static initialization of "nv_bfloat16" and "nv_bfloat162" until +* these become an actual builtin. 
Note this initialization is as a +* bitfield representation of "nv_bfloat16", and not a conversion from short->nv_bfloat16. +* Such a representation will be deprecated in a future version of CUDA. +* (Note these are visible to non-nvcc compilers, including C-only compilation) +*/ +typedef struct __CUDA_ALIGN__(2) { + unsigned short x; +} __nv_bfloat16_raw; + +typedef struct __CUDA_ALIGN__(4) { + unsigned short x; + unsigned short y; +} __nv_bfloat162_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +/* class' : multiple assignment operators specified + The class has multiple assignment operators of a single type. 
This warning is informational */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( push ) +#pragma warning( disable:4522 ) +#endif /* defined(__GNUC__) */ + +struct __CUDA_ALIGN__(2) __nv_bfloat16 { +protected: + unsigned short __x; + +public: +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat16() = default; +#else + __CUDA_HOSTDEVICE__ __nv_bfloat16() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /* Convert to/from __nv_bfloat16_raw */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const { __nv_bfloat16_raw ret; ret.x = __x; return ret; } + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile { __nv_bfloat16_raw ret; ret.x = __x; return ret; } + +#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) + /* Construct from float/double */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x; } + __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x; } + + __CUDA_HOSTDEVICE__ operator float() const { return __bfloat162float(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; } + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; } + +/* Member functions only available to nvcc compilation so far */ +#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) + /* Allow automatic construction from types supported 
natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x; } + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; } + __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x; } + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; } + __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) { __x = __ll2bfloat16_rn(val).__x; } + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; } + + /* Allow automatic casts to supported builtin types, matching all that are permitted with float */ + __CUDA_HOSTDEVICE__ operator short() const { return __bfloat162short_rz(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned short() const { return __bfloat162ushort_rz(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator int() const { return __bfloat162int_rz(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned int() const { return __bfloat162uint_rz(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator long long() const { return __bfloat162ll_rz(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __bfloat162ull_rz(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return 
*this; } + + /* Boolean conversion - note both 0 and -0 must return false */ + __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFF) != 0; } +#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */ +#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */ +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) +#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__) +/* Some basic arithmetic operations expected of a builtin */ +__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hadd(lh, rh); } +__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hsub(lh, rh); } +__device__ __forceinline__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hmul(lh, rh); } +__device__ __forceinline__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hdiv(lh, rh); } + +__device__ __forceinline__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hadd(lh, rh); return lh; } +__device__ __forceinline__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hsub(lh, rh); return lh; } +__device__ __forceinline__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hmul(lh, rh); return lh; } +__device__ __forceinline__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hdiv(lh, rh); return lh; } + +/* Note for increment and decrement we use the raw value 0x3F80 equating to nv_bfloat16(1.0f), to avoid the extra conversion */ +__device__ __forceinline__ __nv_bfloat16 &operator++(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 0x3F80; h += one; return h; } +__device__ __forceinline__ __nv_bfloat16 &operator--(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 
0x3F80; h -= one; return h; } +__device__ __forceinline__ __nv_bfloat16 operator++(__nv_bfloat16 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __nv_bfloat16 ret = h; + __nv_bfloat16_raw one; + one.x = 0x3F80; + h += one; + return ret; +} +__device__ __forceinline__ __nv_bfloat16 operator--(__nv_bfloat16 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __nv_bfloat16 ret = h; + __nv_bfloat16_raw one; + one.x = 0x3F80; + h -= one; + return ret; +} +/* Unary plus and inverse operators */ +__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16 &h) { return h; } +__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16 &h) { return __hneg(h); } + +/* Some basic comparison operations to make it look like a builtin */ +__device__ __forceinline__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __heq(lh, rh); } +__device__ __forceinline__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hneu(lh, rh); } +__device__ __forceinline__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hgt(lh, rh); } +__device__ __forceinline__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hlt(lh, rh); } +__device__ __forceinline__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hge(lh, rh); } +__device__ __forceinline__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */ +#endif /* __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) */ +#endif /* defined(__CUDACC__) */ + +/* __nv_bfloat162 is visible to non-nvcc host compilers */ +struct __CUDA_ALIGN__(4) __nv_bfloat162 { + 
__nv_bfloat16 x; + __nv_bfloat16 y; + + // All construct/copy/assign/move +public: +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat162() = default; + __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src) { __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src)); } + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src) { __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src)); return *this; } +#else + __CUDA_HOSTDEVICE__ __nv_bfloat162() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { } + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); } + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); return *this; } + + /* Convert to/from __nv_bfloat162_raw */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r ) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); } + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); return *this; } + __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const { __nv_bfloat162_raw ret; ret.x = 0U; ret.y = 0U; __BFLOAT162_TO_UI(ret) = __BFLOAT162_TO_CUI(*this); return ret; } +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) && !defined(__CUDA_NO_BFLOAT162_OPERATORS__) + +__device__ __forceinline__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hadd2(lh, rh); } +__device__ __forceinline__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hsub2(lh, rh); } +__device__ __forceinline__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return 
__hmul2(lh, rh); } +__device__ __forceinline__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __h2div(lh, rh); } + +__device__ __forceinline__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hadd2(lh, rh); return lh; } +__device__ __forceinline__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hsub2(lh, rh); return lh; } +__device__ __forceinline__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hmul2(lh, rh); return lh; } +__device__ __forceinline__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __h2div(lh, rh); return lh; } + +__device__ __forceinline__ __nv_bfloat162 &operator++(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80; one.y = 0x3F80; h = __hadd2(h, one); return h; } +__device__ __forceinline__ __nv_bfloat162 &operator--(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80; one.y = 0x3F80; h = __hsub2(h, one); return h; } +__device__ __forceinline__ __nv_bfloat162 operator++(__nv_bfloat162 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __nv_bfloat162 ret = h; + __nv_bfloat162_raw one; + one.x = 0x3F80; + one.y = 0x3F80; + h = __hadd2(h, one); + return ret; +} +__device__ __forceinline__ __nv_bfloat162 operator--(__nv_bfloat162 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
+ static_cast(ignored); + + const __nv_bfloat162 ret = h; + __nv_bfloat162_raw one; + one.x = 0x3F80; + one.y = 0x3F80; + h = __hsub2(h, one); + return ret; +} +__device__ __forceinline__ __nv_bfloat162 operator+(const __nv_bfloat162 &h) { return h; } +__device__ __forceinline__ __nv_bfloat162 operator-(const __nv_bfloat162 &h) { return __hneg2(h); } + +__device__ __forceinline__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbeq2(lh, rh); } +__device__ __forceinline__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbneu2(lh, rh); } +__device__ __forceinline__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbgt2(lh, rh); } +__device__ __forceinline__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hblt2(lh, rh); } +__device__ __forceinline__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbge2(lh, rh); } +__device__ __forceinline__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hble2(lh, rh); } + +#endif /* __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) */ +#endif /* defined(__CUDACC__) */ + +/* Restore warning for multiple assignment operators */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( pop ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +/* Restore -Weffc++ warnings from here on */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic pop +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_ALIGN__ + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short __internal_float2bfloat16(const float f, unsigned int &sign, unsigned int &remainder) +{ + unsigned int x; + +#if defined(__CUDA_ARCH__) + x = __float_as_uint(f); +#elif defined(__CUDACC__) + (void)memcpy(&x, &f, sizeof(f)); +#else + (void)std::memcpy(&x, &f, 
sizeof(f)); +#endif + + if ((x & 0x7fffffffU) > 0x7f800000U) { + sign = 0U; + remainder = 0U; + return static_cast(0x7fffU); + } + sign = x >> 31U; + remainder = x << 16U; + return static_cast(x >> 16U); +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double x) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("{ cvt.rn.bf16.f64 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "d"(x)); + return val; +#else + + float f = static_cast(x); + const double d = static_cast(f); + unsigned int u; + +#if defined(__CUDA_ARCH__) + u = __float_as_uint(f); +#elif defined(__CUDACC__) + (void)memcpy(&u, &f, sizeof(f)); +#else + (void)std::memcpy(&u, &f, sizeof(f)); +#endif + bool x_is_not_nan = ((u << (unsigned)1U) <= (unsigned)0xFF000000U); + + + if ((x > 0.0) && (d > x)) { + u--; + } + if ((x < 0.0) && (d < x)) { + u--; + } + if ((d != x) && x_is_not_nan) { + u |= 1U; + } + +#if defined(__CUDA_ARCH__) + f = __int_as_float(static_cast(u)); +#elif defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(f)); +#else + (void)std::memcpy(&f, &u, sizeof(f)); +#endif + + return __float2bfloat16(f); + +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a) +{ + __nv_bfloat16 val; +#if __CUDA_ARCH__ >= 800 + asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +#else + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a) +{ + __nv_bfloat16 val; +#if __CUDA_ARCH__ >= 800 + asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +#else + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = 
__internal_float2bfloat16(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a) +{ + __nv_bfloat16 val; +#if __CUDA_ARCH__ >= 800 + asm("{ cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +#else + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + val = r; +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("{ cvt.rm.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); + return val; +#else + __nv_bfloat16 val; + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder != 0U) && (sign != 0U)) { + r.x++; + } + val = r; + return val; +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("{ cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); + return val; +#else + __nv_bfloat16 val; + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder != 0U) && (sign == 0U)) { + r.x++; + } + val = r; + return val; +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a) +{ + __nv_bfloat162 val; +#if __CUDA_ARCH__ >= 800 + asm("{.reg .b16 low;\n" + " cvt.rn.bf16.f32 low, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a)); +#else + val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(a)); +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 
__floats2bfloat162_rn(const float a, const float b) +{ + __nv_bfloat162 val; +#if __CUDA_ARCH__ >= 800 + asm("{ cvt.rn.bf16x2.f32 %0, %2, %1;}\n" + : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a), "f"(b)); +#else + val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(b)); +#endif + return val; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_bfloat162float(const unsigned short h) +{ + float f; +#if defined(__CUDA_ARCH__) + #if (__CUDA_ARCH__ >= 900) + asm("{ cvt.f32.bf16 %0, %1;}\n" : "=f"(f) : "h"(h)); + #else + asm("{ mov.b32 %0, {0,%1};}\n" : "=f"(f) : "h"(h)); + #endif +#else + unsigned int u = static_cast(h) << 16; + #if defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(f)); + #else + (void)std::memcpy(&f, &u, sizeof(f)); + #endif +#endif + return f; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a) +{ + return __internal_bfloat162float(static_cast<__nv_bfloat16_raw>(a).x); +} +__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a) +{ + return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).x); +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a) +{ + return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).y); +} + +#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) + +/* CUDA vector-types compatible vector creation function (note returns __nv_bfloat162, not nv_bfloat162) */ +__VECTOR_FUNCTIONS_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y) +{ + __nv_bfloat162 t; t.x = x; t.y = y; return t; +} +#undef __VECTOR_FUNCTIONS_DECL__ + + +/* Definitions of intrinsics */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a) +{ + __nv_bfloat162 val = __floats2bfloat162_rn(a.x, a.y); + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a) +{ + float hi_float; + float lo_float; + lo_float = 
__internal_bfloat162float(((__nv_bfloat162_raw)a).x); + hi_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).y); + return make_float2(lo_float, hi_float); +} +__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + int val; + asm("{ cvt.rni.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + return __float2int_rn(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + int val; + asm("{ cvt.rzi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + const float f = __bfloat162float(h); + int i; + i = static_cast(f); +#if !(defined __CUDA_ARCH__) + const int max_val = (int)0x7fffffffU; + const int min_val = (int)0x80000000U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } +#endif + return i; +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) +} +__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + int val; + asm("{ cvt.rmi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + return __float2int_rd(__bfloat162float(h)); +#endif +} +__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + int val; + asm("{ cvt.rpi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + return __float2int_ru(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i) +{ +#if (defined __CUDA_ARCH__) + #if 
(__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; + #else + const float ru = __int2float_ru(i); + const float rd = __int2float_rd(i); + float rz = __int2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); + #endif +#else + const double d = static_cast(i); + return __double2bfloat16(d); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rz.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#else + return __float2bfloat16_rz(__int2float_rz(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rm.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#else + return __float2bfloat16_rd(__int2float_rd(i)); +#endif +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rp.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#else + return __float2bfloat16_ru(__int2float_ru(i)); +#endif +} + +__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h) +{ + short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rni.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#endif + return val; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h) +{ + short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rzi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : 
"h"(__BFLOAT16_TO_CUS(h))); +#elif (defined __CUDA_ARCH__) + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rzi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + const float f = __bfloat162float(h); + val = static_cast(f); + const short int max_val = (short int)0x7fffU; + const short int min_val = (short int)0x8000U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + val = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum + val = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + val = min_val; + } +#endif + return val; +} +__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h) +{ + short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rmi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rmi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#endif + return val; +} +__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h) +{ + short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rpi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rpi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + const float f = static_cast(i); + return __float2bfloat16_rn(f); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i) +{ +#if defined(__CUDA_ARCH__) && 
(__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rz.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + return __float2bfloat16_rz(__int2float_rz(static_cast(i))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rm.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + return __float2bfloat16_rd(__int2float_rd(static_cast(i))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rp.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + return __float2bfloat16_ru(__int2float_ru(static_cast(i))); +#endif +} + +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned int val; + asm("{ cvt.rni.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + return __float2uint_rn(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned int val; + asm("{ cvt.rzi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + + const float f = __bfloat162float(h); + unsigned int i; + i = static_cast(f); +#if !(defined __CUDA_ARCH__) + const unsigned int max_val = 0xffffffffU; + const unsigned int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0U; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } +#endif + return i; + +#endif // defined(__CUDA_ARCH__) 
&& (__CUDA_ARCH__ >= 900) +} +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned int val; + asm("{ cvt.rmi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + return __float2uint_rd(__bfloat162float(h)); +#endif +} +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned int val; + asm("{ cvt.rpi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + return __float2uint_ru(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#elif (defined __CUDA_ARCH__) + const float ru = __uint2float_ru(i); + const float rd = __uint2float_rd(i); + float rz = __uint2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +#else + const double d = static_cast(i); + return __double2bfloat16(d); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rz.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#else + return __float2bfloat16_rz(__uint2float_rz(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rm.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#else + return __float2bfloat16_rd(__uint2float_rd(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 
val; + asm("cvt.rp.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#else + return __float2bfloat16_ru(__uint2float_ru(i)); +#endif +} + +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h) +{ + unsigned short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rni.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h) +{ + unsigned short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rzi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#elif (defined __CUDA_ARCH__) + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rzi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + const float f = __bfloat162float(h); + val = static_cast(f); + const unsigned short int max_val = 0xffffU; + const unsigned short int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + val = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + val = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + val = min_val; + } +#endif + return val; +} +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h) +{ + unsigned short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rmi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rmi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#endif + return val; +} 
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h) +{ + unsigned short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rpi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rpi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + const float f = static_cast(i); + return __float2bfloat16_rn(f); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rz.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + return __float2bfloat16_rz(__uint2float_rz(static_cast(i))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rm.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + return __float2bfloat16_rd(__uint2float_rd(static_cast(i))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rp.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + return __float2bfloat16_ru(__uint2float_ru(static_cast(i))); +#endif +} + +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned long long int i; + asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(i) : 
"h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + return __float2ull_rn(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h) +{ + unsigned long long int i; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rzi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + const float f = __bfloat162float(h); + i = static_cast(f); +#if !(defined __CUDA_ARCH__) + const unsigned long long int max_val = 0xffffffffffffffffULL; + const unsigned long long int min_val = 0ULL; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0x8000000000000000ULL; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } +#endif +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + return i; +} +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned long long int i; + asm("cvt.rmi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + return __float2ull_rd(__bfloat162float(h)); +#endif +} +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned long long int i; + asm("cvt.rpi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + return __float2ull_ru(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#elif (defined __CUDA_ARCH__) + const float ru = __ull2float_ru(i); + const float rd = 
__ull2float_rd(i); + float rz = __ull2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +#else + float f = static_cast(i); + const unsigned long long int uf = static_cast(f); + unsigned int u; + + #if defined(__CUDA_ARCH__) + u = __float_as_uint(f); + #elif defined(__CUDACC__) + (void)memcpy(&u, &f, sizeof(f)); + #else + (void)std::memcpy(&u, &f, sizeof(f)); + #endif + + // round up happened here + // note: no need to handle round up to f == 0x1.p64 specially + if (uf > i) { + u--; + } + if (uf != i) { + u |= 1U; + } + + #if defined(__CUDA_ARCH__) + f = __int_as_float(static_cast(u)); + #elif defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(f)); + #else + (void)std::memcpy(&f, &u, sizeof(f)); + #endif + + return __float2bfloat16_rn(f); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rz.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#else + return __float2bfloat16_rz(__ull2float_rz(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rm.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#else + return __float2bfloat16_rd(__ull2float_rd(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rp.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#else + return __float2bfloat16_ru(__ull2float_ru(i)); +#endif +} +__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + long long int i; + asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else 
+ return __float2ll_rn(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h) +{ + long long int i; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rzi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +#else + const float f = __bfloat162float(h); + i = static_cast(f); +#if !(defined __CUDA_ARCH__) + const long long int max_val = (long long int)0x7fffffffffffffffULL; + const long long int min_val = (long long int)0x8000000000000000ULL; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = min_val; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } +#endif +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + return i; +} +__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + long long int i; + asm("cvt.rmi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + return __float2ll_rd(__bfloat162float(h)); +#endif +} +__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + long long int i; + asm("cvt.rpi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + return __float2ll_ru(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#elif (defined __CUDA_ARCH__) + const float ru = __ll2float_ru(i); + const float rd = __ll2float_rd(i); + float rz = __ll2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + 
return __float2bfloat16_rn(rz); +#else + float f = static_cast(i); + const long long int lf = static_cast(f); + unsigned int u; + + #if defined(__CUDA_ARCH__) + u = __float_as_uint(f); + #elif defined(__CUDACC__) + (void)memcpy(&u, &f, sizeof(f)); + #else + (void)std::memcpy(&u, &f, sizeof(f)); + #endif + + if ((f > 0.0f) && (lf > i)) { + u--; + } + if ((f < 0.0f) && (lf < i)) { + u--; + } + if (lf != i) { + u |= 1U; + } + + #if defined(__CUDA_ARCH__) + f = __int_as_float(static_cast(u)); + #elif defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(f)); + #else + (void)std::memcpy(&f, &u, sizeof(f)); + #endif + + return __float2bfloat16_rn(f); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rz.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#else + return __float2bfloat16_rz(__ll2float_rz(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rm.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#else + return __float2bfloat16_rd(__ll2float_rd(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rp.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#else + return __float2bfloat16_ru(__ll2float_ru(i)); +#endif +} + +__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 r; + asm("cvt.rzi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +#else + return __float2bfloat16_rz(truncf(__bfloat162float(h))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) 
+ __nv_bfloat16 r; + asm("cvt.rpi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +#else + return __float2bfloat16_ru(ceilf(__bfloat162float(h))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 r; + asm("cvt.rmi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +#else + return __float2bfloat16_rd(floorf(__bfloat162float(h))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 r; + asm("cvt.rni.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +#else + return __float2bfloat16_rn(rintf(__bfloat162float(h))); +#endif +} + +__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = __float2bfloat16_rz(truncf(__low2float(h))); + const __nv_bfloat16 high = __float2bfloat16_rz(truncf(__high2float(h))); + return __nv_bfloat162(low, high); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = __float2bfloat16_ru(ceilf(__low2float(h))); + const __nv_bfloat16 high = __float2bfloat16_ru(ceilf(__high2float(h))); + return __nv_bfloat162(low, high); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = __float2bfloat16_rd(floorf(__low2float(h))); + const __nv_bfloat16 high = __float2bfloat16_rd(floorf(__high2float(h))); + return __nv_bfloat162(low, high); +} + +__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h) +{ + return __halves2bfloat162(hrint(__low2bfloat16(h)), hrint(__high2bfloat16(h))); +} +__CUDA_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm("{.reg .b16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" 
+ " mov.b32 %0, {alow,blow};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm("{.reg .b16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a) +{ + __nv_bfloat16 ret; + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a))); + return ret; +} +__CUDA_BF16_DECL__ int __hisinf(const __nv_bfloat16 a) +{ + int retval; + if (__BFLOAT16_TO_CUS(a) == 0xFF80U) { + retval = -1; + } else if (__BFLOAT16_TO_CUS(a) == 0x7F80U) { + retval = 1; + } else { + retval = 0; + } + return retval; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a) +{ + __nv_bfloat162 val; + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a) +{ + __nv_bfloat162 val; + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a) +{ + __nv_bfloat16 ret; + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a))); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat162 val; + asm("{ mov.b32 %0, {%1,%2};}\n" + : 
"=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a) +{ + __nv_bfloat162 val; + asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a) +{ + __nv_bfloat162 val; + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +__CUDA_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h) +{ + return static_cast(__BFLOAT16_TO_CUS(h)); +} +__CUDA_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h) +{ + return __BFLOAT16_TO_CUS(h); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i) +{ + __nv_bfloat16 h; + __BFLOAT16_TO_US(h) = static_cast(i); + return h; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i) +{ + __nv_bfloat16 h; + __BFLOAT16_TO_US(h) = i; + return h; +} + +/****************************************************************************** +* __nv_bfloat16, __nv_bfloat162 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_SYNC_BFLOAT162_MACRO(name) /* do */ {\ + __nv_bfloat162 r; \ + asm volatile ("{" __CUDA_BF16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \ + :"=r"(__BFLOAT162_TO_UI(r)): "r"(__BFLOAT162_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} /* while(0) */ + +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.idx.b32) +} +__CUDA_BF16_DECL__ 
__nv_bfloat162 __shfl_up_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast(width)) << 8U; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.up.b32) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.down.b32) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.bfly.b32) +} + +#undef __SHUFFLE_SYNC_BFLOAT162_MACRO + +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_up_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} 
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_xor_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} + +/****************************************************************************** +* __nv_bfloat16 and __nv_bfloat162 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if defined(__cplusplus) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + 
return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} + +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), 
"h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} + +#undef __LDG_PTR +#endif /*defined(__cplusplus) */ +/****************************************************************************** +* __nv_bfloat162 comparison * +******************************************************************************/ +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) +#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\ + __nv_bfloat162 val; \ + asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; \ +} +#else +#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\ + __nv_bfloat162 val; \ + asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ + " and.b32 high_a, %1, 0xffff0000U;\n"\ + " and.b32 high_b, %2, 0xffff0000U;\n"\ + " shl.b32 low_a, %1, 16;\n"\ + " shl.b32 low_b, %2, 16;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\ + " shr.u32 low_res, low_res, 16;\n"\ + " or.b32 %0, high_res, low_res;}\n"\ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; \ +} +#endif 
+ +__CUDA_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.eq) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ne) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.le) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ge) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.lt) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.gt) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.equ) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.neu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.leu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.geu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ltu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.gtu) +} +#undef __COMPARISON_OP_BFLOAT162_MACRO + +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) +#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\ + __nv_bfloat162 val; \ + bool retval; \ + asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ + :"=r"(__BFLOAT162_TO_UI(val)) : 
"r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + if (__BFLOAT162_TO_CUI(val) == 0x3F803F80U) {\ + retval = true; \ + } else { \ + retval = false; \ + }\ + return retval;\ +} +#else + +#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\ + unsigned int val; \ + asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ + " and.b32 high_a, %1, 0xffff0000U;\n"\ + " and.b32 high_b, %2, 0xffff0000U;\n"\ + " shl.b32 low_a, %1, 16;\n"\ + " shl.b32 low_b, %2, 16;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\ + " and.b32 %0, high_res, low_res;}\n"\ + :"=r"(val) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return (val != 0U) ? true : false; \ +} +#endif + +__CUDA_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.eq) +} +__CUDA_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ne) +} +__CUDA_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.le) +} +__CUDA_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ge) +} +__CUDA_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.lt) +} +__CUDA_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gt) +} +__CUDA_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.equ) +} +__CUDA_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.neu) +} +__CUDA_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + 
__BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.leu) +} +__CUDA_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.geu) +} +__CUDA_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ltu) +} +__CUDA_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gtu) +} +#undef __BOOL_COMPARISON_OP_BFLOAT162_MACRO +/****************************************************************************** +* __nv_bfloat16 comparison * +******************************************************************************/ +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) +#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp." __CUDA_BF16_STRINGIFY(name) ".bf16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); \ + return (val != 0U) ? true : false; \ +} +#else +#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\ + unsigned int val; \ + asm( "{.reg .b32 a,b;\n"\ + " mov.b32 a, {0, %1};\n"\ + " mov.b32 b, {0, %2};\n"\ + " set." __CUDA_BF16_STRINGIFY(name) ".f32.f32 %0, a, b;}\n"\ + :"=r"(val) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return (val != 0U) ? 
true : false; \ +} +#endif +__CUDA_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(eq) +} +__CUDA_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(ne) +} +__CUDA_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(le) +} +__CUDA_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(ge) +} +__CUDA_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(lt) +} +__CUDA_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(gt) +} +__CUDA_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(equ) +} +__CUDA_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(neu) +} +__CUDA_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(leu) +} +__CUDA_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(geu) +} +__CUDA_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(ltu) +} +__CUDA_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(gtu) +} +#undef __COMPARISON_OP_BFLOAT16_MACRO +/****************************************************************************** +* __nv_bfloat162 arithmetic * +******************************************************************************/ +#define __BINARY_OP_BFLOAT162_MACRO(name) /* do */ {\ + __nv_bfloat162 val; \ + asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ + " .reg .b16 low,high;\n"\ + " and.b32 high_a, %1, 0xffff0000U;\n"\ + " and.b32 high_b, %2, 0xffff0000U;\n"\ + " shl.b32 low_a, %1, 16;\n"\ + 
" shl.b32 low_b, %2, 16;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32 low_res, low_a, low_b;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32 high_res, high_a, high_b;\n"\ + " cvt.rn.bf16.f32 low, low_res;\n"\ + " cvt.rn.bf16.f32 high, high_res;\n"\ + " mov.b32 %0, {low,high};}\n"\ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; \ +} /* while(0) */ + +__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ add.bf16x2 %0,%1,%2; }\n" +#else + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x3f803f80U;\n" + " fma.rn.bf16x2 %0,%1,c,%2;}\n" +#endif + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ sub.bf16x2 %0,%1,%2; }\n" +#else + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0xbf80bf80U;\n" + " fma.rn.bf16x2 %0,%2,c,%1;}\n" +#endif + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ mul.bf16x2 %0,%1,%2; }\n" +#else + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x80008000U;\n" + " fma.rn.bf16x2 %0,%1,%2,c;}\n" +#endif + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ add.rn.bf16x2 %0,%1,%2; }\n" +#else + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x3f803f80U;\n" + " fma.rn.bf16x2 %0,%1,c,%2;}\n" +#endif + 
:"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ sub.rn.bf16x2 %0,%1,%2; }\n" +#else + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0xbf80bf80U;\n" + " fma.rn.bf16x2 %0,%2,c,%1;}\n" +#endif + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ mul.rn.bf16x2 %0,%1,%2; }\n" +#else + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x80008000U;\n" + " fma.rn.bf16x2 %0,%1,%2,c;}\n" +#endif + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{.reg .b32 f, one, zero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " fma.rn.bf16x2 f,%1,one,%2;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{.reg .b32 f, one, zero, mone;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " mov.b32 mone, 0xbf80bf80U;\n" + " fma.rn.bf16x2 f,%2,mone,%1;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{.reg .b32 f, one, zero, mzero;\n" 
+ " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " mov.b32 mzero, 0x80008000U;\n" + " fma.rn.bf16x2 f,%1,%2,mzero;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{fma.rn.bf16x2 %0,%1,%2,%3;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{ .reg .b32 f, one, zero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " fma.rn.bf16x2 f, %1, %2, %3;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b) { + __nv_bfloat16 ha, hb; + + ha = __low2bfloat16(a); + hb = __low2bfloat16(b); + + const __nv_bfloat16 v1 = __hdiv(ha, hb); + + ha = __high2bfloat16(a); + hb = __high2bfloat16(b); + + const __nv_bfloat16 v2 = __hdiv(ha, hb); + + return __halves2bfloat162(v1, v2); +} +/****************************************************************************** +* __nv_bfloat16 arithmetic * +******************************************************************************/ +#define __BINARY_OP_BFLOAT16_MACRO(name) /* do */ {\ + __nv_bfloat16 val; \ + asm( "{.reg .b32 a,b,res;\n"\ + " mov.b32 a, {0,%1};\n"\ + " mov.b32 b, {0,%2};\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32 res, a, b;\n"\ + " cvt.rn.bf16.f32 %0, res;}\n"\ + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; \ +} /* 
while(0) */ + +__CUDA_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ add.bf16 %0,%1,%2; }\n" +#else + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x3f80U;\n" + " fma.rn.bf16 %0,%1,c,%2;}\n" +#endif + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ sub.bf16 %0,%1,%2; }\n" +#else + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0xbf80U;\n" + " fma.rn.bf16 %0,%2,c,%1;}\n" +#endif + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ mul.bf16 %0,%1,%2; }\n" +#else + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x8000U;\n" + " fma.rn.bf16 %0,%1,%2,c;}\n" +#endif + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ add.rn.bf16 %0,%1,%2; }\n" +#else + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x3f80U;\n" + " fma.rn.bf16 %0,%1,c,%2;}\n" +#endif + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ sub.rn.bf16 %0,%1,%2; }\n" +#else + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0xbf80U;\n" + " fma.rn.bf16 %0,%2,c,%1;}\n" +#endif + :"=h"(__BFLOAT16_TO_US(val)) : 
"h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ mul.rn.bf16 %0,%1,%2; }\n" +#else + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x8000U;\n" + " fma.rn.bf16 %0,%1,%2,c;}\n" +#endif + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ .reg .b16 f, one, zero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " fma.rn.bf16 f, %1, one, %2;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ .reg .b16 f, one, zero, mone;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " mov.b16 mone, 0xbf80U;\n" + " fma.rn.bf16 f, %2, mone, %1;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ .reg .b16 f, one, zero, mzero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " mov.b16 mzero, 0x8000U;\n" + " fma.rn.bf16 f, %1, %2, mzero;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ + __nv_bfloat16 val; + asm( "{fma.rn.bf16 %0,%1,%2,%3;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : 
"h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ + __nv_bfloat16 val; + asm( "{ .reg .b16 f, one, zero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " fma.rn.bf16 f, %1, %2, %3;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { + __BINARY_OP_BFLOAT16_MACRO(div.rn) +} + +/****************************************************************************** +* __nv_bfloat162 functions * +******************************************************************************/ +#define __APPROX_FCAST(fun) /* do */ {\ + __nv_bfloat16 val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " mov.b32 f,{0,r}; \n"\ + " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 f,f; \n"\ + " cvt.rn.bf16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));\ + return val;\ +} /* while(0) */ +#define __APPROX_FCAST2(fun) /* do */ {\ + __nv_bfloat162 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " mov.b32 fl, {0,hl}; \n"\ + " mov.b32 fu, {0,hu}; \n"\ + " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fl, fl; \n"\ + " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fu, fu; \n"\ + " cvt.rn.bf16.f32 hl, fl; \n"\ + " cvt.rn.bf16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +__CUDA_BF16_DECL__ __nv_bfloat16 __hsin_internal(const __nv_bfloat16 a) { + float f = __bfloat162float(a); + f = sinf(f); + return __float2bfloat16_rn(f); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a) { + return 
__hsin_internal(a); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a) { + const __nv_bfloat16 l = __low2bfloat16(a); + const __nv_bfloat16 h = __high2bfloat16(a); + return __halves2bfloat162(__hsin_internal(l), __hsin_internal(h)); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hcos_internal(const __nv_bfloat16 a) { + float f = __bfloat162float(a); + f = cosf(f); + return __float2bfloat16_rn(f); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a) { + return __hcos_internal(a); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a) { + const __nv_bfloat16 l = __low2bfloat16(a); + const __nv_bfloat16 h = __high2bfloat16(a); + return __halves2bfloat162(__hcos_internal(l), __hcos_internal(h)); +} + +#define __BF16_SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\ + " mov.b32 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16x2.f16x2 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.bf16x2 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n" +#define __BF16_SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\ + " mov.b16 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16.f16 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.bf16 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n" + +__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a) { + __nv_bfloat16 val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " mov.b32 f,{0,h}; \n" + " mov.b32 C, 0x3FB8AA3CU; \n" + " mul.f32 f,f,C; \n" + " ex2.approx.f32 f,f; \n" + " cvt.rn.bf16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a) { + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " 
mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " mov.b32 C, 0x3FB8AA3CU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a) { + __APPROX_FCAST(ex2) +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a) { + __APPROX_FCAST2(ex2) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a) { + __nv_bfloat16 val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " mov.b32 f, {0,h}; \n" + " mov.b32 C, 0x40549A78U; \n" + " mul.f32 f,f,C; \n" + " ex2.approx.f32 f, f; \n" + " cvt.rn.bf16.f32 r, f; \n" + __BF16_SPEC_CASE(%1, r, 0xBC95U,0xBF00U) + " mov.b16 %0, r; \n" + "}":"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a) { + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " mov.b32 C, 0x40549A78U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __BF16_SPEC_CASE2(%1, r, 0xBC95BC95U,0xBF00BF00U) + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a) { + __APPROX_FCAST(lg2) +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a) { + __APPROX_FCAST2(lg2) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a) { + __nv_bfloat16 val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + 
" mov.b32 f,{0,h}; \n" + " lg2.approx.f32 f,f; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.bf16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a) { + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a) { + __nv_bfloat16 val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " mov.b32 f, {0,h}; \n" + " lg2.approx.f32 f, f; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.bf16.f32 r, f; \n" + " mov.b16 %0, r; \n" + "}":"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a) { + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +#undef __BF16_SPEC_CASE2 +#undef __BF16_SPEC_CASE +__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a) { + __APPROX_FCAST2(rcp) +} 
+__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a) { + __APPROX_FCAST(rcp) +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a) { + __APPROX_FCAST2(rsqrt) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a) { + __APPROX_FCAST(rsqrt) +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a) { + __APPROX_FCAST2(sqrt) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a) { + __APPROX_FCAST(sqrt) +} +#undef __APPROX_FCAST +#undef __APPROX_FCAST2 +__CUDA_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a) +{ +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat162 r; + asm("{set.nan.bf16x2.bf16x2 %0,%1,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); + return r; +#else + const __nv_bfloat162 b = a; + __BINARY_OP_BFLOAT162_MACRO(set.nan.f32) +#endif +} +__CUDA_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a) +{ +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 r; + asm("{set.nan.bf16.bf16 %0,%1,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return __BFLOAT16_TO_CUS(r) != 0U; +#else + unsigned int r; + asm( "{.reg .b32 a;\n" + " mov.b32 a, {0,%1};\n" + " set.nan.f32.f32 %0, a, a;}\n" + :"=r"(r) : "h"(__BFLOAT16_TO_CUS(a))); + return r != 0U; +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a) +{ + __nv_bfloat162 r; + asm("{neg.bf16x2 %0,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); + return r; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a) +{ + __nv_bfloat16 r; + asm("{neg.bf16 %0,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return r; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a) +{ + __nv_bfloat162 r; + asm("{abs.bf16x2 %0,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); + return r; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a) +{ + __nv_bfloat16 r; + asm("{abs.bf16 
%0,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return r; +} +/****************************************************************************** +* __nv_bfloat16 arithmetic * +******************************************************************************/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ max.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ min.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ max.NaN.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ min.NaN.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ + __nv_bfloat16 val; + asm( "{ fma.rn.relu.bf16 %0,%1,%2,%3;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +/****************************************************************************** +* __nv_bfloat162 arithmetic * +******************************************************************************/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{ max.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : 
"r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{ min.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{ max.NaN.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{ min.NaN.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{ fma.rn.relu.bf16x2 %0,%1,%2,%3;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + // fast version of complex multiply-accumulate + // (a.re, a.im) * (b.re, b.im) + (c.re, c.im) + // acc.re = (c.re + a.re*b.re) - a.im*b.im + // acc.im = (c.im + a.re*b.im) + a.im*b.re + __nv_bfloat16 real_tmp = __hfma(a.x, b.x, c.x); + __nv_bfloat16 img_tmp = __hfma(a.x, b.y, c.y); + real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); + img_tmp = __hfma(a.y, b.x, img_tmp); + return make_bfloat162(real_tmp, img_tmp); +} + + +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || 
defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat162 r; + asm volatile ("{ atom.add.noftz.bf16x2 %0,[%1],%2; }\n" + : "=r"(__BFLOAT162_TO_UI(r)) : __PTR(address), "r"(__BFLOAT162_TO_CUI(val)) + : "memory"); + return r; +#else + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint, assumed; + do { + assumed = old; + __nv_bfloat162 new_val = __hadd2(val, *(__nv_bfloat162*)&assumed); + old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); + } while (assumed != old); + return *(__nv_bfloat162*)&old; +#endif +} + +__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 r; + asm volatile ("{ atom.add.noftz.bf16 %0,[%1],%2; }\n" + : "=h"(__BFLOAT16_TO_US(r)) + : __PTR(address), "h"(__BFLOAT16_TO_CUS(val)) + : "memory"); + return r; +#else + unsigned short int* address_as_us = (unsigned short int*)address; + unsigned short int old = *address_as_us, assumed; + do { + assumed = old; + old = atomicCAS(address_as_us, assumed, + __bfloat16_as_ushort(__hadd(val, __ushort_as_bfloat16(assumed)))); + } while (assumed != old); + return __ushort_as_bfloat16(old); +#endif +} + +#undef __PTR +#undef __CUDA_BF16_DECL__ +#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */ +#endif /* defined(__cplusplus) */ + +#undef __BINARY_OP_BFLOAT162_MACRO +#undef __BINARY_OP_BFLOAT16_MACRO + +#undef __CUDA_HOSTDEVICE_BF16_DECL__ +#undef __CUDA_BF16_DECL__ + +/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */ +/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */ +#if defined(__cplusplus) && 
!defined(CUDA_NO_BFLOAT16) +typedef __nv_bfloat16 nv_bfloat16; +typedef __nv_bfloat162 nv_bfloat162; + +#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#undef __CPP_VERSION_AT_LEAST_11_BF16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + +#endif /* end of include guard: __CUDA_BF16_HPP__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h new file mode 100644 index 0000000000000000000000000000000000000000..ffe55709f8ccdebf7341180f043006b68c08e104 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h @@ -0,0 +1,1958 @@ +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/** + * CUDA Occupancy Calculator + * + * NAME + * + * cudaOccMaxActiveBlocksPerMultiprocessor, + * cudaOccMaxPotentialOccupancyBlockSize, + * cudaOccMaxPotentialOccupancyBlockSizeVariableSMem + * cudaOccAvailableDynamicSMemPerBlock + * + * DESCRIPTION + * + * The CUDA occupancy calculator provides a standalone, programmatical + * interface to compute the occupancy of a function on a device. It can also + * provide occupancy-oriented launch configuration suggestions. 
+ * + * The function and device are defined by the user through + * cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState + * structures. All APIs require all 3 of them. + * + * See the structure definition for more details about the device / function + * descriptors. + * + * See each API's prototype for API usage. + * + * COMPATIBILITY + * + * The occupancy calculator will be updated on each major CUDA toolkit + * release. It does not provide forward compatibility, i.e. new hardwares + * released after this implementation's release will not be supported. + * + * NOTE + * + * If there is access to CUDA runtime, and the sole intent is to calculate + * occupancy related values on one of the accessible CUDA devices, using CUDA + * runtime's occupancy calculation APIs is recommended. + * + */ + +#ifndef __cuda_occupancy_h__ +#define __cuda_occupancy_h__ + +#include +#include +#include + + +// __OCC_INLINE will be undefined at the end of this header +// +#ifdef __CUDACC__ +#define __OCC_INLINE inline __host__ __device__ +#elif defined _MSC_VER +#define __OCC_INLINE __inline +#else // GNUCC assumed +#define __OCC_INLINE inline +#endif + +enum cudaOccError_enum { + CUDA_OCC_SUCCESS = 0, // no error encountered + CUDA_OCC_ERROR_INVALID_INPUT = 1, // input parameter is invalid + CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2, // requested device is not supported in + // current implementation or device is + // invalid +}; +typedef enum cudaOccError_enum cudaOccError; + +typedef struct cudaOccResult cudaOccResult; +typedef struct cudaOccDeviceProp cudaOccDeviceProp; +typedef struct cudaOccFuncAttributes cudaOccFuncAttributes; +typedef struct cudaOccDeviceState cudaOccDeviceState; + +/** + * The CUDA occupancy calculator computes the occupancy of the function + * described by attributes with the given block size (blockSize), static device + * properties (properties), dynamic device states (states) and per-block dynamic + * shared memory allocation (dynamicSMemSize) in bytes, 
and output it through + * result along with other useful information. The occupancy is computed in + * terms of the maximum number of active blocks per multiprocessor. The user can + * then convert it to other metrics, such as number of active warps. + * + * RETURN VALUE + * + * The occupancy and related information is returned through result. + * + * If result->activeBlocksPerMultiprocessor is 0, then the given parameter + * combination cannot run on the device. + * + * ERRORS + * + * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid. + * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in + * current implementation or device is invalid + */ +static __OCC_INLINE +cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor( + cudaOccResult *result, // out + const cudaOccDeviceProp *properties, // in + const cudaOccFuncAttributes *attributes, // in + const cudaOccDeviceState *state, // in + int blockSize, // in + size_t dynamicSmemSize); // in + +/** + * The CUDA launch configurator C API suggests a grid / block size pair (in + * minGridSize and blockSize) that achieves the best potential occupancy + * (i.e. maximum number of active warps with the smallest number of blocks) for + * the given function described by attributes, on a device described by + * properties with settings in state. + * + * If per-block dynamic shared memory allocation is not needed, the user should + * leave both blockSizeToDynamicSMemSize and dynamicSMemSize as 0. + * + * If per-block dynamic shared memory allocation is needed, then if the dynamic + * shared memory size is constant regardless of block size, the size should be + * passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be + * NULL. 
+ * + * Otherwise, if the per-block dynamic shared memory size varies with different + * block sizes, the user needs to provide a pointer to an unary function through + * blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by + * a block of the function for any given block size. dynamicSMemSize is + * ignored. An example signature is: + * + * // Take block size, returns dynamic shared memory needed + * size_t blockToSmem(int blockSize); + * + * RETURN VALUE + * + * The suggested block size and the minimum number of blocks needed to achieve + * the maximum occupancy are returned through blockSize and minGridSize. + * + * If *blockSize is 0, then the given combination cannot run on the device. + * + * ERRORS + * + * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid. + * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in + * current implementation or device is invalid + * + */ +static __OCC_INLINE +cudaOccError cudaOccMaxPotentialOccupancyBlockSize( + int *minGridSize, // out + int *blockSize, // out + const cudaOccDeviceProp *properties, // in + const cudaOccFuncAttributes *attributes, // in + const cudaOccDeviceState *state, // in + size_t (*blockSizeToDynamicSMemSize)(int), // in + size_t dynamicSMemSize); // in + +/** + * The CUDA launch configurator C++ API suggests a grid / block size pair (in + * minGridSize and blockSize) that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number of blocks) + * for the given function described by attributes, on a device described by + * properties with settings in state. + * + * If per-block dynamic shared memory allocation is 0 or constant regardless of + * block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to + * configure the launch. A constant dynamic shared memory allocation size in + * bytes can be passed through dynamicSMemSize. 
+ * + * Otherwise, if the per-block dynamic shared memory size varies with different + * block sizes, the user needs to use + * cudaOccMaxPotentialOccupancyBlockSizeVariableSmem instead, and provide a + * functor / pointer to an unary function (blockSizeToDynamicSMemSize) that + * computes the dynamic shared memory needed by func for any given block + * size. An example signature is: + * + * // Take block size, returns per-block dynamic shared memory needed + * size_t blockToSmem(int blockSize); + * + * RETURN VALUE + * + * The suggested block size and the minimum number of blocks needed to achieve + * the maximum occupancy are returned through blockSize and minGridSize. + * + * If *blockSize is 0, then the given combination cannot run on the device. + * + * ERRORS + * + * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid. + * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in + * current implementation or device is invalid + * + */ + +#if defined(__cplusplus) +namespace { + +__OCC_INLINE +cudaOccError cudaOccMaxPotentialOccupancyBlockSize( + int *minGridSize, // out + int *blockSize, // out + const cudaOccDeviceProp *properties, // in + const cudaOccFuncAttributes *attributes, // in + const cudaOccDeviceState *state, // in + size_t dynamicSMemSize = 0); // in + +template +__OCC_INLINE +cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem( + int *minGridSize, // out + int *blockSize, // out + const cudaOccDeviceProp *properties, // in + const cudaOccFuncAttributes *attributes, // in + const cudaOccDeviceState *state, // in + UnaryFunction blockSizeToDynamicSMemSize); // in + +} // namespace anonymous +#endif // defined(__cplusplus) + +/** + * + * The CUDA dynamic shared memory calculator computes the maximum size of + * per-block dynamic shared memory if we want to place numBlocks blocks + * on an SM. 
+ * + * RETURN VALUE + * + * Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow + * numBlocks blocks per SM. + * + * ERRORS + * + * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid. + * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in + * current implementation or device is invalid + * + */ +static __OCC_INLINE +cudaOccError cudaOccAvailableDynamicSMemPerBlock( + size_t *dynamicSmemSize, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + int numBlocks, + int blockSize); + +/** + * Data structures + * + * These structures are subject to change for future architecture and CUDA + * releases. C users should initialize the structure as {0}. + * + */ + +/** + * Device descriptor + * + * This structure describes a device. + */ +struct cudaOccDeviceProp { + int computeMajor; // Compute capability major version + int computeMinor; // Compute capability minor + // version. None supported minor version + // may cause error + int maxThreadsPerBlock; // Maximum number of threads per block + int maxThreadsPerMultiprocessor; // Maximum number of threads per SM + // i.e. (Max. number of warps) x (warp + // size) + int regsPerBlock; // Maximum number of registers per block + int regsPerMultiprocessor; // Maximum number of registers per SM + int warpSize; // Warp size + size_t sharedMemPerBlock; // Maximum shared memory size per block + size_t sharedMemPerMultiprocessor; // Maximum shared memory size per SM + int numSms; // Number of SMs available + size_t sharedMemPerBlockOptin; // Maximum optin shared memory size per block + size_t reservedSharedMemPerBlock; // Shared memory per block reserved by driver + +#ifdef __cplusplus + // This structure can be converted from a cudaDeviceProp structure for users + // that use this header in their CUDA applications. 
+ // + // If the application have access to the CUDA Runtime API, the application + // can obtain the device properties of a CUDA device through + // cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the + // cudaDeviceProp structure. + // + // Example: + /* + { + cudaDeviceProp prop; + + cudaGetDeviceProperties(&prop, ...); + + cudaOccDeviceProp occProp = prop; + + ... + + cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...); + } + */ + // + template + __OCC_INLINE + cudaOccDeviceProp(const DeviceProp &props) + : computeMajor (props.major), + computeMinor (props.minor), + maxThreadsPerBlock (props.maxThreadsPerBlock), + maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor), + regsPerBlock (props.regsPerBlock), + regsPerMultiprocessor (props.regsPerMultiprocessor), + warpSize (props.warpSize), + sharedMemPerBlock (props.sharedMemPerBlock), + sharedMemPerMultiprocessor (props.sharedMemPerMultiprocessor), + numSms (props.multiProcessorCount), + sharedMemPerBlockOptin (props.sharedMemPerBlockOptin), + reservedSharedMemPerBlock (props.reservedSharedMemPerBlock) + {} + + __OCC_INLINE + cudaOccDeviceProp() + : computeMajor (0), + computeMinor (0), + maxThreadsPerBlock (0), + maxThreadsPerMultiprocessor (0), + regsPerBlock (0), + regsPerMultiprocessor (0), + warpSize (0), + sharedMemPerBlock (0), + sharedMemPerMultiprocessor (0), + numSms (0), + sharedMemPerBlockOptin (0), + reservedSharedMemPerBlock (0) + {} +#endif // __cplusplus +}; + +/** + * Partitioned global caching option + */ +typedef enum cudaOccPartitionedGCConfig_enum { + PARTITIONED_GC_OFF, // Disable partitioned global caching + PARTITIONED_GC_ON, // Prefer partitioned global caching + PARTITIONED_GC_ON_STRICT // Force partitioned global caching +} cudaOccPartitionedGCConfig; + +/** + * Per function opt in maximum dynamic shared memory limit + */ +typedef enum cudaOccFuncShmemConfig_enum { + FUNC_SHMEM_LIMIT_DEFAULT, // Default shmem limit + FUNC_SHMEM_LIMIT_OPTIN, // Use the 
optin shmem limit +} cudaOccFuncShmemConfig; + +/** + * Function descriptor + * + * This structure describes a CUDA function. + */ +struct cudaOccFuncAttributes { + int maxThreadsPerBlock; // Maximum block size the function can work with. If + // unlimited, use INT_MAX or any value greater than + // or equal to maxThreadsPerBlock of the device + int numRegs; // Number of registers used. When the function is + // launched on device, the register count may change + // due to internal tools requirements. + size_t sharedSizeBytes; // Number of static shared memory used + + cudaOccPartitionedGCConfig partitionedGCConfig; + // Partitioned global caching is required to enable + // caching on certain chips, such as sm_52 + // devices. Partitioned global caching can be + // automatically disabled if the occupancy + // requirement of the launch cannot support caching. + // + // To override this behavior with caching on and + // calculate occupancy strictly according to the + // preference, set partitionedGCConfig to + // PARTITIONED_GC_ON_STRICT. This is especially + // useful for experimenting and finding launch + // configurations (MaxPotentialOccupancyBlockSize) + // that allow global caching to take effect. + // + // This flag only affects the occupancy calculation. + + cudaOccFuncShmemConfig shmemLimitConfig; + // Certain chips like sm_70 allow a user to opt into + // a higher per block limit of dynamic shared memory + // This optin is performed on a per function basis + // using the cuFuncSetAttribute function + + size_t maxDynamicSharedSizeBytes; + // User set limit on maximum dynamic shared memory + // usable by the kernel + // This limit is set using the cuFuncSetAttribute + // function. + + int numBlockBarriers; // Number of block barriers used (default to 1) +#ifdef __cplusplus + // This structure can be converted from a cudaFuncAttributes structure for + // users that use this header in their CUDA applications. 
+ // + // If the application have access to the CUDA Runtime API, the application + // can obtain the function attributes of a CUDA kernel function through + // cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the + // cudaFuncAttributes structure. + // + // Example: + /* + __global__ void foo() {...} + + ... + + { + cudaFuncAttributes attr; + + cudaFuncGetAttributes(&attr, foo); + + cudaOccFuncAttributes occAttr = attr; + + ... + + cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...); + } + */ + // + template + __OCC_INLINE + cudaOccFuncAttributes(const FuncAttributes &attr) + : maxThreadsPerBlock (attr.maxThreadsPerBlock), + numRegs (attr.numRegs), + sharedSizeBytes (attr.sharedSizeBytes), + partitionedGCConfig (PARTITIONED_GC_OFF), + shmemLimitConfig (FUNC_SHMEM_LIMIT_OPTIN), + maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes), + numBlockBarriers (1) + {} + + __OCC_INLINE + cudaOccFuncAttributes() + : maxThreadsPerBlock (0), + numRegs (0), + sharedSizeBytes (0), + partitionedGCConfig (PARTITIONED_GC_OFF), + shmemLimitConfig (FUNC_SHMEM_LIMIT_DEFAULT), + maxDynamicSharedSizeBytes (0), + numBlockBarriers (0) + {} +#endif +}; + +typedef enum cudaOccCacheConfig_enum { + CACHE_PREFER_NONE = 0x00, // no preference for shared memory or L1 (default) + CACHE_PREFER_SHARED = 0x01, // prefer larger shared memory and smaller L1 cache + CACHE_PREFER_L1 = 0x02, // prefer larger L1 cache and smaller shared memory + CACHE_PREFER_EQUAL = 0x03 // prefer equal sized L1 cache and shared memory +} cudaOccCacheConfig; + +typedef enum cudaOccCarveoutConfig_enum { + SHAREDMEM_CARVEOUT_DEFAULT = -1, // no preference for shared memory or L1 (default) + SHAREDMEM_CARVEOUT_MAX_SHARED = 100, // prefer maximum available shared memory, minimum L1 cache + SHAREDMEM_CARVEOUT_MAX_L1 = 0, // prefer maximum available L1 cache, minimum shared memory + SHAREDMEM_CARVEOUT_HALF = 50 // prefer half of maximum available shared memory, with the rest as L1 cache +} 
cudaOccCarveoutConfig; + +/** + * Device state descriptor + * + * This structure describes device settings that affect occupancy calculation. + */ +struct cudaOccDeviceState +{ + // Cache / shared memory split preference. Deprecated on Volta + cudaOccCacheConfig cacheConfig; + // Shared memory / L1 split preference. Supported on only Volta + int carveoutConfig; + +#ifdef __cplusplus + __OCC_INLINE + cudaOccDeviceState() + : cacheConfig (CACHE_PREFER_NONE), + carveoutConfig (SHAREDMEM_CARVEOUT_DEFAULT) + {} +#endif +}; + +typedef enum cudaOccLimitingFactor_enum { + // Occupancy limited due to: + OCC_LIMIT_WARPS = 0x01, // - warps available + OCC_LIMIT_REGISTERS = 0x02, // - registers available + OCC_LIMIT_SHARED_MEMORY = 0x04, // - shared memory available + OCC_LIMIT_BLOCKS = 0x08, // - blocks available + OCC_LIMIT_BARRIERS = 0x10 // - barrier available +} cudaOccLimitingFactor; + +/** + * Occupancy output + * + * This structure contains occupancy calculator's output. + */ +struct cudaOccResult { + int activeBlocksPerMultiprocessor; // Occupancy + unsigned int limitingFactors; // Factors that limited occupancy. A bit + // field that counts the limiting + // factors, see cudaOccLimitingFactor + int blockLimitRegs; // Occupancy due to register + // usage, INT_MAX if the kernel does not + // use any register. + int blockLimitSharedMem; // Occupancy due to shared memory + // usage, INT_MAX if the kernel does not + // use shared memory. + int blockLimitWarps; // Occupancy due to block size limit + int blockLimitBlocks; // Occupancy due to maximum number of blocks + // managable per SM + int blockLimitBarriers; // Occupancy due to block barrier usage + int allocatedRegistersPerBlock; // Actual number of registers allocated per + // block + size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated + // per block + cudaOccPartitionedGCConfig partitionedGCConfig; + // Report if partitioned global caching + // is actually enabled. 
+}; + +/** + * Partitioned global caching support + * + * See cudaOccPartitionedGlobalCachingModeSupport + */ +typedef enum cudaOccPartitionedGCSupport_enum { + PARTITIONED_GC_NOT_SUPPORTED, // Partitioned global caching is not supported + PARTITIONED_GC_SUPPORTED, // Partitioned global caching is supported +} cudaOccPartitionedGCSupport; + +/** + * Implementation + */ + +/** + * Max compute capability supported + */ +#define __CUDA_OCC_MAJOR__ 9 +#define __CUDA_OCC_MINOR__ 0 + +////////////////////////////////////////// +// Mathematical Helper Functions // +////////////////////////////////////////// + +static __OCC_INLINE int __occMin(int lhs, int rhs) +{ + return rhs < lhs ? rhs : lhs; +} + +static __OCC_INLINE int __occDivideRoundUp(int x, int y) +{ + return (x + (y - 1)) / y; +} + +static __OCC_INLINE int __occRoundUp(int x, int y) +{ + return y * __occDivideRoundUp(x, y); +} + +////////////////////////////////////////// +// Architectural Properties // +////////////////////////////////////////// + +/** + * Granularity of shared memory allocation + */ +static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties) +{ + int value; + + switch(properties->computeMajor) { + case 3: + case 5: + case 6: + case 7: + value = 256; + break; + case 8: + case 9: + value = 128; + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + *limit = value; + + return CUDA_OCC_SUCCESS; +} + +/** + * Maximum number of registers per thread + */ +static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties) +{ + int value; + + switch(properties->computeMajor) { + case 3: + case 5: + case 6: + value = 255; + break; + case 7: + case 8: + case 9: + value = 256; + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + *limit = value; + + return CUDA_OCC_SUCCESS; +} + +/** + * Granularity of register allocation + */ +static __OCC_INLINE cudaOccError 
cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties) +{ + int value; + + switch(properties->computeMajor) { + case 3: + case 5: + case 6: + case 7: + case 8: + case 9: + value = 256; + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + *limit = value; + + return CUDA_OCC_SUCCESS; +} + +/** + * Number of sub-partitions + */ +static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties) +{ + int value; + + switch(properties->computeMajor) { + case 3: + case 5: + case 7: + case 8: + case 9: + value = 4; + break; + case 6: + value = properties->computeMinor ? 4 : 2; + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + *limit = value; + + return CUDA_OCC_SUCCESS; +} + + +/** + * Maximum number of blocks that can run simultaneously on a multiprocessor + */ +static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties) +{ + int value; + + switch(properties->computeMajor) { + case 3: + value = 16; + break; + case 5: + case 6: + value = 32; + break; + case 7: { + int isTuring = properties->computeMinor == 5; + value = (isTuring) ? 16 : 32; + break; + } + case 8: + if (properties->computeMinor == 0) { + value = 32; + } + else if (properties->computeMinor == 9) { + value = 24; + } + else { + value = 16; + } + break; + case 9: + value = 32; + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + *limit = value; + + return CUDA_OCC_SUCCESS; +} + +/** + * Align up shared memory based on compute major configurations + */ +static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties) +{ + // Volta and Turing have shared L1 cache / shared memory, and support cache + // configuration to trade one for the other. 
These values are needed to + // map carveout config ratio to the next available architecture size + size_t size = *shMemSize; + + switch (properties->computeMajor) { + case 7: { + // Turing supports 32KB and 64KB shared mem. + int isTuring = properties->computeMinor == 5; + if (isTuring) { + if (size <= 32 * 1024) { + *shMemSize = 32 * 1024; + } + else if (size <= 64 * 1024) { + *shMemSize = 64 * 1024; + } + else { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + } + // Volta supports 0KB, 8KB, 16KB, 32KB, 64KB, and 96KB shared mem. + else { + if (size == 0) { + *shMemSize = 0; + } + else if (size <= 8 * 1024) { + *shMemSize = 8 * 1024; + } + else if (size <= 16 * 1024) { + *shMemSize = 16 * 1024; + } + else if (size <= 32 * 1024) { + *shMemSize = 32 * 1024; + } + else if (size <= 64 * 1024) { + *shMemSize = 64 * 1024; + } + else if (size <= 96 * 1024) { + *shMemSize = 96 * 1024; + } + else { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + } + break; + } + case 8: + if (properties->computeMinor == 0 || properties->computeMinor == 7) { + if (size == 0) { + *shMemSize = 0; + } + else if (size <= 8 * 1024) { + *shMemSize = 8 * 1024; + } + else if (size <= 16 * 1024) { + *shMemSize = 16 * 1024; + } + else if (size <= 32 * 1024) { + *shMemSize = 32 * 1024; + } + else if (size <= 64 * 1024) { + *shMemSize = 64 * 1024; + } + else if (size <= 100 * 1024) { + *shMemSize = 100 * 1024; + } + else if (size <= 132 * 1024) { + *shMemSize = 132 * 1024; + } + else if (size <= 164 * 1024) { + *shMemSize = 164 * 1024; + } + else { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + } + else { + if (size == 0) { + *shMemSize = 0; + } + else if (size <= 8 * 1024) { + *shMemSize = 8 * 1024; + } + else if (size <= 16 * 1024) { + *shMemSize = 16 * 1024; + } + else if (size <= 32 * 1024) { + *shMemSize = 32 * 1024; + } + else if (size <= 64 * 1024) { + *shMemSize = 64 * 1024; + } + else if (size <= 100 * 1024) { + *shMemSize = 100 * 1024; + } + else { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + } + 
break; + case 9: { + if (size == 0) { + *shMemSize = 0; + } + else if (size <= 8 * 1024) { + *shMemSize = 8 * 1024; + } + else if (size <= 16 * 1024) { + *shMemSize = 16 * 1024; + } + else if (size <= 32 * 1024) { + *shMemSize = 32 * 1024; + } + else if (size <= 64 * 1024) { + *shMemSize = 64 * 1024; + } + else if (size <= 100 * 1024) { + *shMemSize = 100 * 1024; + } + else if (size <= 132 * 1024) { + *shMemSize = 132 * 1024; + } + else if (size <= 164 * 1024) { + *shMemSize = 164 * 1024; + } + else if (size <= 196 * 1024) { + *shMemSize = 196 * 1024; + } + else if (size <= 228 * 1024) { + *shMemSize = 228 * 1024; + } + else { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + break; + } + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + return CUDA_OCC_SUCCESS; +} + +/** + * Shared memory based on the new carveoutConfig API introduced with Volta + */ +static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + size_t preferenceShmemSize; + + // CUDA 9.0 introduces a new API to set shared memory - L1 configuration on supported + // devices. This preference will take precedence over the older cacheConfig setting. + // Map cacheConfig to its effective preference value. 
+ int effectivePreference = state->carveoutConfig; + if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) { + switch (state->cacheConfig) + { + case CACHE_PREFER_L1: + effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1; + break; + case CACHE_PREFER_SHARED: + effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED; + break; + case CACHE_PREFER_EQUAL: + effectivePreference = SHAREDMEM_CARVEOUT_HALF; + break; + default: + effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT; + break; + } + } + + if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) { + preferenceShmemSize = properties->sharedMemPerMultiprocessor; + } + else { + preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100; + } + + status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties); + *limit = preferenceShmemSize; + return status; +} + +/** + * Shared memory based on the cacheConfig + */ +static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state) +{ + size_t bytes = 0; + size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor; + cudaOccCacheConfig cacheConfig = state->cacheConfig; + + // Kepler has shared L1 cache / shared memory, and support cache + // configuration to trade one for the other. These values are needed to + // calculate the correct shared memory size for user requested cache + // configuration. + // + size_t minCacheSize = 16384; + size_t maxCacheSize = 49152; + size_t cacheAndSharedTotal = sharedMemPerMultiprocessorHigh + minCacheSize; + size_t sharedMemPerMultiprocessorLow = cacheAndSharedTotal - maxCacheSize; + + switch (properties->computeMajor) { + case 3: + // Kepler supports 16KB, 32KB, or 48KB partitions for L1. The rest + // is shared memory. 
+ // + switch (cacheConfig) { + default : + case CACHE_PREFER_NONE: + case CACHE_PREFER_SHARED: + bytes = sharedMemPerMultiprocessorHigh; + break; + case CACHE_PREFER_L1: + bytes = sharedMemPerMultiprocessorLow; + break; + case CACHE_PREFER_EQUAL: + // Equal is the mid-point between high and low. It should be + // equivalent to low + 16KB. + // + bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2; + break; + } + break; + case 5: + case 6: + // Maxwell and Pascal have dedicated shared memory. + // + bytes = sharedMemPerMultiprocessorHigh; + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + *limit = bytes; + + return CUDA_OCC_SUCCESS; +} + +/** + * Shared memory based on config requested by User + */ +static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state) +{ + // Volta introduces a new API that allows for shared memory carveout preference. Because it is a shared memory preference, + // it is handled separately from the cache config preference. 
+ if (properties->computeMajor >= 7) { + return cudaOccSMemPreferenceVoltaPlus(limit, properties, state); + } + return cudaOccSMemPreference(limit, properties, state); +} + +/** + * Return the per block shared memory limit based on function config + */ +static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta) +{ + switch (properties->computeMajor) { + case 2: + case 3: + case 4: + case 5: + case 6: + *limit = properties->sharedMemPerBlock; + break; + case 7: + case 8: + case 9: + switch (shmemLimitConfig) { + default: + case FUNC_SHMEM_LIMIT_DEFAULT: + *limit = properties->sharedMemPerBlock; + break; + case FUNC_SHMEM_LIMIT_OPTIN: + if (smemPerCta > properties->sharedMemPerBlock) { + *limit = properties->sharedMemPerBlockOptin; + } + else { + *limit = properties->sharedMemPerBlock; + } + break; + } + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + // Starting Ampere, CUDA driver reserves additional shared memory per block + if (properties->computeMajor >= 8) { + *limit += properties->reservedSharedMemPerBlock; + } + + return CUDA_OCC_SUCCESS; +} + +/** + * Partitioned global caching mode support + */ +static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties) +{ + *limit = PARTITIONED_GC_NOT_SUPPORTED; + + if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) || + properties->computeMajor == 6) { + *limit = PARTITIONED_GC_SUPPORTED; + } + + if (properties->computeMajor == 6 && properties->computeMinor == 0) { + *limit = PARTITIONED_GC_NOT_SUPPORTED; + } + + return CUDA_OCC_SUCCESS; +} + +/////////////////////////////////////////////// +// User Input Sanity // +/////////////////////////////////////////////// + +static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties) +{ 
+ // Verify device properties + // + // Each of these limits must be a positive number. + // + // Compute capacity is checked during the occupancy calculation + // + if (properties->maxThreadsPerBlock <= 0 || + properties->maxThreadsPerMultiprocessor <= 0 || + properties->regsPerBlock <= 0 || + properties->regsPerMultiprocessor <= 0 || + properties->warpSize <= 0 || + properties->sharedMemPerBlock <= 0 || + properties->sharedMemPerMultiprocessor <= 0 || + properties->numSms <= 0) { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + return CUDA_OCC_SUCCESS; +} + +static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes) +{ + // Verify function attributes + // + if (attributes->maxThreadsPerBlock <= 0 || + attributes->numRegs < 0) { // Compiler may choose not to use + // any register (empty kernels, + // etc.) + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + return CUDA_OCC_SUCCESS; +} + +static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state) +{ + (void)state; // silence unused-variable warning + // Placeholder + // + + return CUDA_OCC_SUCCESS; +} + +static __OCC_INLINE cudaOccError cudaOccInputCheck( + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + + status = cudaOccDevicePropCheck(properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + status = cudaOccFuncAttributesCheck(attributes); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + status = cudaOccDeviceStateCheck(state); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + return status; +} + +/////////////////////////////////////////////// +// Occupancy calculation Functions // +/////////////////////////////////////////////// + +static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected( + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes) +{ + 
cudaOccPartitionedGCSupport gcSupport; + cudaOccPartitionedGCConfig gcConfig; + + cudaOccPartitionedGlobalCachingModeSupport(&gcSupport, properties); + + gcConfig = attributes->partitionedGCConfig; + + if (gcSupport == PARTITIONED_GC_NOT_SUPPORTED) { + gcConfig = PARTITIONED_GC_OFF; + } + + return gcConfig; +} + +// Warp limit +// +static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit( + int *limit, + cudaOccPartitionedGCConfig gcConfig, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + int blockSize) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + int maxWarpsPerSm; + int warpsAllocatedPerCTA; + int maxBlocks; + (void)attributes; // silence unused-variable warning + + if (blockSize > properties->maxThreadsPerBlock) { + maxBlocks = 0; + } + else { + maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize; + warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize); + maxBlocks = 0; + + if (gcConfig != PARTITIONED_GC_OFF) { + int maxBlocksPerSmPartition; + int maxWarpsPerSmPartition; + + // If partitioned global caching is on, then a CTA can only use a SM + // partition (a half SM), and thus a half of the warp slots + // available per SM + // + maxWarpsPerSmPartition = maxWarpsPerSm / 2; + maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA; + maxBlocks = maxBlocksPerSmPartition * 2; + } + // On hardware that supports partitioned global caching, each half SM is + // guaranteed to support at least 32 warps (maximum number of warps of a + // CTA), so caching will not cause 0 occupancy due to insufficient warp + // allocation slots. 
+ // + else { + maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA; + } + } + + *limit = maxBlocks; + + return status; +} + +// Shared memory limit +// +static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit( + int *limit, + cudaOccResult *result, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + int blockSize, + size_t dynamicSmemSize) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + int allocationGranularity; + size_t userSmemPreference = 0; + size_t totalSmemUsagePerCTA; + size_t maxSmemUsagePerCTA; + size_t smemAllocatedPerCTA; + size_t staticSmemSize; + size_t sharedMemPerMultiprocessor; + size_t smemLimitPerCTA; + int maxBlocks; + int dynamicSmemSizeExceeded = 0; + int totalSmemSizeExceeded = 0; + (void)blockSize; // silence unused-variable warning + + status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // Obtain the user preferred shared memory size. This setting is ignored if + // user requests more shared memory than preferred. 
+ // + status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock; + totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize; + smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity); + + maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes; + + dynamicSmemSizeExceeded = 0; + totalSmemSizeExceeded = 0; + + // Obtain the user set maximum dynamic size if it exists + // If so, the current launch dynamic shared memory must not + // exceed the set limit + if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT && + dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) { + dynamicSmemSizeExceeded = 1; + } + + status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + if (smemAllocatedPerCTA > smemLimitPerCTA) { + totalSmemSizeExceeded = 1; + } + + if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) { + maxBlocks = 0; + } + else { + // User requested shared memory limit is used as long as it is greater + // than the total shared memory used per CTA, i.e. as long as at least + // one CTA can be launched. + if (userSmemPreference >= smemAllocatedPerCTA) { + sharedMemPerMultiprocessor = userSmemPreference; + } + else { + // On Volta+, user requested shared memory will limit occupancy + // if it's less than shared memory per CTA. Otherwise, the + // maximum shared memory limit is used. 
+ if (properties->computeMajor >= 7) { + sharedMemPerMultiprocessor = smemAllocatedPerCTA; + status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + } + else { + sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor; + } + } + + if (smemAllocatedPerCTA > 0) { + maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA); + } + else { + maxBlocks = INT_MAX; + } + } + + result->allocatedSharedMemPerBlock = smemAllocatedPerCTA; + + *limit = maxBlocks; + + return status; +} + +static __OCC_INLINE +cudaOccError cudaOccMaxBlocksPerSMRegsLimit( + int *limit, + cudaOccPartitionedGCConfig *gcConfig, + cudaOccResult *result, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + int blockSize) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + int allocationGranularity; + int warpsAllocatedPerCTA; + int regsAllocatedPerCTA; + int regsAssumedPerCTA; + int regsPerWarp; + int regsAllocatedPerWarp; + int numSubPartitions; + int numRegsPerSubPartition; + int numWarpsPerSubPartition; + int numWarpsPerSM; + int maxBlocks; + int maxRegsPerThread; + + status = cudaOccRegAllocationGranularity( + &allocationGranularity, + properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + status = cudaOccRegAllocationMaxPerThread( + &maxRegsPerThread, + properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize); + + // GPUs of compute capability 2.x and higher allocate registers to warps + // + // Number of regs per warp is regs per thread x warp size, rounded up to + // register allocation granularity + // + regsPerWarp = attributes->numRegs * properties->warpSize; + regsAllocatedPerWarp = __occRoundUp(regsPerWarp, 
allocationGranularity); + regsAllocatedPerCTA = regsAllocatedPerWarp * warpsAllocatedPerCTA; + + // Hardware verifies if a launch fits the per-CTA register limit. For + // historical reasons, the verification logic assumes register + // allocations are made to all partitions simultaneously. Therefore, to + // simulate the hardware check, the warp allocation needs to be rounded + // up to the number of partitions. + // + regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions); + + if (properties->regsPerBlock < regsAssumedPerCTA || // Hardware check + properties->regsPerBlock < regsAllocatedPerCTA || // Software check + attributes->numRegs > maxRegsPerThread) { // Per thread limit check + maxBlocks = 0; + } + else { + if (regsAllocatedPerWarp > 0) { + // Registers are allocated in each sub-partition. The max number + // of warps that can fit on an SM is equal to the max number of + // warps per sub-partition x number of sub-partitions. + // + numRegsPerSubPartition = properties->regsPerMultiprocessor / numSubPartitions; + numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp; + + maxBlocks = 0; + + if (*gcConfig != PARTITIONED_GC_OFF) { + int numSubPartitionsPerSmPartition; + int numWarpsPerSmPartition; + int maxBlocksPerSmPartition; + + // If partitioned global caching is on, then a CTA can only + // use a half SM, and thus a half of the registers available + // per SM + // + numSubPartitionsPerSmPartition = numSubPartitions / 2; + numWarpsPerSmPartition = numWarpsPerSubPartition * numSubPartitionsPerSmPartition; + maxBlocksPerSmPartition = numWarpsPerSmPartition / warpsAllocatedPerCTA; + maxBlocks = maxBlocksPerSmPartition * 2; + } + + // Try again if partitioned global caching is not enabled, or if + // the CTA cannot fit on the SM with caching on (maxBlocks == 0). 
In the latter + // case, the device will automatically turn off caching, except + // if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate + // occupancy and launch configuration. + // + if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) { + // In case *gcConfig was PARTITIONED_GC_ON flip it OFF since + // this is what it will be if we spread CTA across partitions. + // + *gcConfig = PARTITIONED_GC_OFF; + numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions; + maxBlocks = numWarpsPerSM / warpsAllocatedPerCTA; + } + } + else { + maxBlocks = INT_MAX; + } + } + + + result->allocatedRegistersPerBlock = regsAllocatedPerCTA; + + *limit = maxBlocks; + + return status; +} + +// Barrier limit +// +static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit( + int *limit, + int ctaLimitBlocks, + const cudaOccFuncAttributes *attributes) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + int numBarriersAvailable = ctaLimitBlocks * 2; + int numBarriersUsed = attributes->numBlockBarriers; + int maxBlocks = INT_MAX; + + if (numBarriersUsed) { + maxBlocks = numBarriersAvailable / numBarriersUsed; + } + + *limit = maxBlocks; + + return status; +} + +/////////////////////////////////// +// API Implementations // +/////////////////////////////////// + +static __OCC_INLINE +cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor( + cudaOccResult *result, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + int blockSize, + size_t dynamicSmemSize) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + int ctaLimitWarps = 0; + int ctaLimitBlocks = 0; + int ctaLimitSMem = 0; + int ctaLimitRegs = 0; + int ctaLimitBars = 0; + int ctaLimit = 0; + unsigned int limitingFactors = 0; + + cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF; + + if (!result || !properties || !attributes || !state || blockSize <= 0) { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + /////////////////////////// + // 
Check user input + /////////////////////////// + + status = cudaOccInputCheck(properties, attributes, state); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + /////////////////////////// + // Initialization + /////////////////////////// + + gcConfig = cudaOccPartitionedGCExpected(properties, attributes); + + /////////////////////////// + // Compute occupancy + /////////////////////////// + + // Limits due to registers/SM + // Also compute if partitioned global caching has to be turned off + // + status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4. + // As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x. + // For forward compatibility within Pascal family, if a function cannot run on GP10x (maxBlock == 0), + // we do not let it run on any Pascal processor, even though it may be able to run on GP100. 
+ // Therefore, we check the occupancy on GP10x when it can run on GP100 + // + if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) { + cudaOccDeviceProp propertiesGP10x; + cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig; + int ctaLimitRegsGP10x = 0; + + // Set up properties for GP10x + memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x)); + propertiesGP10x.computeMinor = 1; + + status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + if (ctaLimitRegsGP10x == 0) { + ctaLimitRegs = 0; + } + } + + // Limits due to warps/SM + // + status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // Limits due to blocks/SM + // + status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // Limits due to shared memory/SM + // + status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + /////////////////////////// + // Overall occupancy + /////////////////////////// + + // Overall limit is min() of limits due to above reasons + // + ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks))); + + // Determine occupancy limiting factors + // + if (ctaLimit == ctaLimitWarps) { + limitingFactors |= OCC_LIMIT_WARPS; + } + if (ctaLimit == ctaLimitRegs) { + limitingFactors |= OCC_LIMIT_REGISTERS; + } + if (ctaLimit == ctaLimitSMem) { + limitingFactors |= OCC_LIMIT_SHARED_MEMORY; + } + if (ctaLimit == ctaLimitBlocks) { + limitingFactors |= OCC_LIMIT_BLOCKS; + } + + // For Hopper onwards compute the limits to occupancy based on block barrier count + // + if 
(properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) { + // Limits due to barrier/SM + // + status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, attributes); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // Recompute overall limit based on barrier/SM + // + ctaLimit = __occMin(ctaLimitBars, ctaLimit); + + // Determine if this is occupancy limiting factor + // + if (ctaLimit == ctaLimitBars) { + limitingFactors |= OCC_LIMIT_BARRIERS; + } + } + else { + ctaLimitBars = INT_MAX; + } + + // Fill in the return values + // + result->limitingFactors = limitingFactors; + + result->blockLimitRegs = ctaLimitRegs; + result->blockLimitSharedMem = ctaLimitSMem; + result->blockLimitWarps = ctaLimitWarps; + result->blockLimitBlocks = ctaLimitBlocks; + result->blockLimitBarriers = ctaLimitBars; + result->partitionedGCConfig = gcConfig; + + // Final occupancy + result->activeBlocksPerMultiprocessor = ctaLimit; + + return CUDA_OCC_SUCCESS; +} + +static __OCC_INLINE +cudaOccError cudaOccAvailableDynamicSMemPerBlock( + size_t *bytesAvailable, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + int numBlocks, + int blockSize) +{ + int allocationGranularity; + size_t smemLimitPerBlock; + size_t smemAvailableForDynamic; + size_t userSmemPreference = 0; + size_t sharedMemPerMultiprocessor; + cudaOccResult result; + cudaOccError status = CUDA_OCC_SUCCESS; + + if (numBlocks <= 0) + return CUDA_OCC_ERROR_INVALID_INPUT; + + // First compute occupancy of potential kernel launch. + // + status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + // Check if occupancy is achievable given user requested number of blocks. 
+ // + if (result.activeBlocksPerMultiprocessor < numBlocks) { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // Return the per block shared memory limit based on function config. + // + status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // If there is only a single block needed per SM, then the user preference can be ignored and the fully SW + // limit is allowed to be used as shared memory otherwise if more than one block is needed, then the user + // preference sets the total limit of available shared memory. + // + cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state); + if (numBlocks == 1) { + sharedMemPerMultiprocessor = smemLimitPerBlock; + } + else { + if (!userSmemPreference) { + userSmemPreference = 1 ; + status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + } + sharedMemPerMultiprocessor = userSmemPreference; + } + + // Compute total shared memory available per SM + // + smemAvailableForDynamic = sharedMemPerMultiprocessor / numBlocks; + smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity; + + // Cap shared memory + // + if (smemAvailableForDynamic > smemLimitPerBlock) { + smemAvailableForDynamic = smemLimitPerBlock; + } + + // Now compute dynamic shared memory size + smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes; + + // Cap computed dynamic SM by user requested limit specified via cuFuncSetAttribute() + // + if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes) + smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes; + + *bytesAvailable = smemAvailableForDynamic; + return 
CUDA_OCC_SUCCESS; +} + +static __OCC_INLINE +cudaOccError cudaOccMaxPotentialOccupancyBlockSize( + int *minGridSize, + int *blockSize, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + size_t (*blockSizeToDynamicSMemSize)(int), + size_t dynamicSMemSize) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + cudaOccResult result; + + // Limits + int occupancyLimit; + int granularity; + int blockSizeLimit; + + // Recorded maximum + int maxBlockSize = 0; + int numBlocks = 0; + int maxOccupancy = 0; + + // Temporary + int blockSizeToTryAligned; + int blockSizeToTry; + int blockSizeLimitAligned; + int occupancyInBlocks; + int occupancyInThreads; + + /////////////////////////// + // Check user input + /////////////////////////// + + if (!minGridSize || !blockSize || !properties || !attributes || !state) { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + status = cudaOccInputCheck(properties, attributes, state); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + ///////////////////////////////////////////////////////////////////////////////// + // Try each block size, and pick the block size with maximum occupancy + ///////////////////////////////////////////////////////////////////////////////// + + occupancyLimit = properties->maxThreadsPerMultiprocessor; + granularity = properties->warpSize; + + blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock); + blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity); + + for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { + blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned); + + // Ignore dynamicSMemSize if the user provides a mapping + // + if (blockSizeToDynamicSMemSize) { + dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry); + } + + status = cudaOccMaxActiveBlocksPerMultiprocessor( + &result, + properties, + attributes, 
+ state, + blockSizeToTry, + dynamicSMemSize); + + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + occupancyInBlocks = result.activeBlocksPerMultiprocessor; + occupancyInThreads = blockSizeToTry * occupancyInBlocks; + + if (occupancyInThreads > maxOccupancy) { + maxBlockSize = blockSizeToTry; + numBlocks = occupancyInBlocks; + maxOccupancy = occupancyInThreads; + } + + // Early out if we have reached the maximum + // + if (occupancyLimit == maxOccupancy) { + break; + } + } + + /////////////////////////// + // Return best available + /////////////////////////// + + // Suggested min grid size to achieve a full machine launch + // + *minGridSize = numBlocks * properties->numSms; + *blockSize = maxBlockSize; + + return status; +} + + +#if defined(__cplusplus) + +namespace { + +__OCC_INLINE +cudaOccError cudaOccMaxPotentialOccupancyBlockSize( + int *minGridSize, + int *blockSize, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + size_t dynamicSMemSize) +{ + return cudaOccMaxPotentialOccupancyBlockSize( + minGridSize, + blockSize, + properties, + attributes, + state, + NULL, + dynamicSMemSize); +} + +template +__OCC_INLINE +cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem( + int *minGridSize, + int *blockSize, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + UnaryFunction blockSizeToDynamicSMemSize) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + cudaOccResult result; + + // Limits + int occupancyLimit; + int granularity; + int blockSizeLimit; + + // Recorded maximum + int maxBlockSize = 0; + int numBlocks = 0; + int maxOccupancy = 0; + + // Temporary + int blockSizeToTryAligned; + int blockSizeToTry; + int blockSizeLimitAligned; + int occupancyInBlocks; + int occupancyInThreads; + size_t dynamicSMemSize; + + /////////////////////////// + // Check user input + /////////////////////////// + + if (!minGridSize || 
!blockSize || !properties || !attributes || !state) { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + status = cudaOccInputCheck(properties, attributes, state); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + ///////////////////////////////////////////////////////////////////////////////// + // Try each block size, and pick the block size with maximum occupancy + ///////////////////////////////////////////////////////////////////////////////// + + occupancyLimit = properties->maxThreadsPerMultiprocessor; + granularity = properties->warpSize; + blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock); + blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity); + + for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { + blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned); + + dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry); + + status = cudaOccMaxActiveBlocksPerMultiprocessor( + &result, + properties, + attributes, + state, + blockSizeToTry, + dynamicSMemSize); + + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + occupancyInBlocks = result.activeBlocksPerMultiprocessor; + + occupancyInThreads = blockSizeToTry * occupancyInBlocks; + + if (occupancyInThreads > maxOccupancy) { + maxBlockSize = blockSizeToTry; + numBlocks = occupancyInBlocks; + maxOccupancy = occupancyInThreads; + } + + // Early out if we have reached the maximum + // + if (occupancyLimit == maxOccupancy) { + break; + } + } + + /////////////////////////// + // Return best available + /////////////////////////// + + // Suggested min grid size to achieve a full machine launch + // + *minGridSize = numBlocks * properties->numSms; + *blockSize = maxBlockSize; + + return status; +} + +} // namespace anonymous + +#endif /*__cplusplus */ + +#undef __OCC_INLINE + +#endif /*__cuda_occupancy_h__*/ diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h new file mode 100644 index 0000000000000000000000000000000000000000..46bc89e4499576f1ae58848cd8684ba3e32420cf --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h @@ -0,0 +1,224 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDA_PIPELINE_H_ +# define _CUDA_PIPELINE_H_ + +# include "cuda_pipeline_primitives.h" + +# if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER) +# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \ + -std=c++11 compiler option. +# endif + +# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER) +# include "cuda_awbarrier.h" +# endif + +// Integration with libcu++'s cuda::barrier. 
+ +# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER) +# if defined(_LIBCUDACXX_CUDA_ABI_VERSION) +# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION +# else +# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4 +# endif + +# define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y +# define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y) +# define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION) + +namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE { + struct __block_scope_barrier_base; +}} + +# endif + +_CUDA_PIPELINE_BEGIN_NAMESPACE + +template +_CUDA_PIPELINE_QUALIFIER +auto segment(T* ptr) -> T(*)[N]; + +class pipeline { +public: + pipeline(const pipeline&) = delete; + pipeline(pipeline&&) = delete; + pipeline& operator=(const pipeline&) = delete; + pipeline& operator=(pipeline&&) = delete; + + _CUDA_PIPELINE_QUALIFIER pipeline(); + _CUDA_PIPELINE_QUALIFIER size_t commit(); + _CUDA_PIPELINE_QUALIFIER void commit_and_wait(); + _CUDA_PIPELINE_QUALIFIER void wait(size_t batch); + template + _CUDA_PIPELINE_QUALIFIER void wait_prior(); + +# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER) + _CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier); + _CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier); +# endif + +private: + size_t current_batch; +}; + +template +_CUDA_PIPELINE_QUALIFIER +void memcpy_async(T& dst, const T& src, pipeline& pipe); + +template +_CUDA_PIPELINE_QUALIFIER +void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe); + +template +_CUDA_PIPELINE_QUALIFIER +auto segment(T* ptr) -> T(*)[N] +{ + return (T(*)[N])ptr; +} + +_CUDA_PIPELINE_QUALIFIER +pipeline::pipeline() + : current_batch(0) +{ +} + +_CUDA_PIPELINE_QUALIFIER +size_t pipeline::commit() +{ + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit(); + return this->current_batch++; +} + +_CUDA_PIPELINE_QUALIFIER +void 
pipeline::commit_and_wait() +{ + (void)pipeline::commit(); + pipeline::wait_prior<0>(); +} + +_CUDA_PIPELINE_QUALIFIER +void pipeline::wait(size_t batch) +{ + const size_t prior = this->current_batch > batch ? this->current_batch - batch : 0; + + switch (prior) { + case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); break; + case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); break; + case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); break; + case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); break; + case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); break; + case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); break; + case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); break; + case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); break; + default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); break; + } +} + +template +_CUDA_PIPELINE_QUALIFIER +void pipeline::wait_prior() +{ + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior(); +} + +# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER) +_CUDA_PIPELINE_QUALIFIER +void pipeline::arrive_on(awbarrier& barrier) +{ + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(&barrier.barrier); +} + +_CUDA_PIPELINE_QUALIFIER +void pipeline::arrive_on(cuda::__block_scope_barrier_base & barrier) +{ + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(reinterpret_cast(&barrier)); +} +# endif + +template +_CUDA_PIPELINE_QUALIFIER +void memcpy_async(T& dst, const T& src, pipeline& pipe) +{ + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast(&src) & (alignof(T) - 1))); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast(&dst) & (alignof(T) - 1))); + + if (__is_trivially_copyable(T)) { + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed( + reinterpret_cast(&dst), reinterpret_cast(&src)); + } else { + dst = src; + } +} + +template +_CUDA_PIPELINE_QUALIFIER +void 
memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe) +{ + constexpr size_t dst_size = sizeof(*dst); + constexpr size_t src_size = sizeof(*src); + static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size."); + static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size."); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast(src) & (dst_size - 1))); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast(dst) & (dst_size - 1))); + + if (__is_trivially_copyable(T)) { + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict( + reinterpret_cast(*dst), reinterpret_cast(*src)); + } else { + for (size_t i = 0; i < DstN; ++i) { + (*dst)[i] = (i < SrcN) ? (*src)[i] : T(); + } + } +} + +_CUDA_PIPELINE_END_NAMESPACE + +#endif /* !_CUDA_PIPELINE_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h new file mode 100644 index 0000000000000000000000000000000000000000..8d297bb6c4140f2056618cedd9b34bdc42cd6367 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h @@ -0,0 +1,2725 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__CUDA_RUNTIME_H__) +#define __CUDA_RUNTIME_H__ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__ +#endif + +#if !defined(__CUDACC_RTC__) +#if defined(__GNUC__) +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) +#pragma GCC diagnostic push +#endif +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))) +#pragma GCC diagnostic ignored "-Wunused-function" +#endif +#elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable: 4820) +#endif +#endif + +#ifdef __QNX__ +#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 7) +typedef unsigned size_t; +#endif +#endif +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "crt/host_config.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "builtin_types.h" +#include "library_types.h" +#if !defined(__CUDACC_RTC__) +#define EXCLUDE_FROM_RTC +#include "channel_descriptor.h" +#include "cuda_runtime_api.h" +#include "driver_functions.h" +#undef EXCLUDE_FROM_RTC +#endif /* !__CUDACC_RTC__ */ +#include "crt/host_defines.h" +#include "vector_functions.h" + +#if defined(__CUDACC__) + +#if defined(__CUDACC_RTC__) +#include "nvrtc_device_runtime.h" +#include "crt/device_functions.h" +#include "crt/common_functions.h" +#include "cuda_surface_types.h" +#include "cuda_texture_types.h" +#include "device_launch_parameters.h" + +#else /* !__CUDACC_RTC__ */ +#define EXCLUDE_FROM_RTC +#include "crt/common_functions.h" +#include "cuda_surface_types.h" +#include "cuda_texture_types.h" +#include 
"crt/device_functions.h" +#include "device_launch_parameters.h" + +#if defined(__CUDACC_EXTENDED_LAMBDA__) +#include +#include +struct __device_builtin__ __nv_lambda_preheader_injection { }; +#endif /* defined(__CUDACC_EXTENDED_LAMBDA__) */ + +#undef EXCLUDE_FROM_RTC +#endif /* __CUDACC_RTC__ */ + +#endif /* __CUDACC__ */ + +/** \cond impl_private */ +#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif +/** \endcond impl_private */ + +#if defined(__cplusplus) && !defined(__CUDACC_RTC__) + +#if __cplusplus >= 201103 +#include +#endif + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +/** + * \addtogroup CUDART_HIGHLEVEL + * @{ + */ + +/** + *\brief Launches a device function + * + * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y + * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × + * \p blockDim.y × \p blockDim.z) threads. + * + * If the kernel has N parameters the \p args should point to array of N pointers. + * Each pointer, from args[0] to args[N - 1], point to the region + * of memory from which the actual parameter will be copied. + * + * \p sharedMem sets the amount of dynamic shared memory that will be available to + * each thread block. + * + * \p stream specifies a stream the invocation is associated to. 
+ * + * \param func - Device function symbol + * \param gridDim - Grid dimentions + * \param blockDim - Block dimentions + * \param args - Arguments + * \param sharedMem - Shared memory (defaults to 0) + * \param stream - Stream identifier (defaults to NULL) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed, + * ::cudaErrorInvalidPtx, + * ::cudaErrorUnsupportedPtxVersion, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound, + * ::cudaErrorJitCompilationDisabled + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaLaunchKernel( + const T *func, + dim3 gridDim, + dim3 blockDim, + void **args, + size_t sharedMem = 0, + cudaStream_t stream = 0 +) +{ + return ::cudaLaunchKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); +} + + +#if __cplusplus >= 201103 || defined(__DOXYGEN_ONLY__) +/** + * \brief Launches a CUDA function with launch-time configuration + * + * Invokes the kernel \p func on \p config->gridDim (\p config->gridDim.x + * × \p config->gridDim.y × \p config->gridDim.z) grid of blocks. + * Each block contains \p config->blockDim (\p config->blockDim.x × + * \p config->blockDim.y × \p config->blockDim.z) threads. + * + * \p config->dynamicSmemBytes sets the amount of dynamic shared memory that + * will be available to each thread block. + * + * \p config->stream specifies a stream the invocation is associated to. 
+ * + * Configuration beyond grid and block dimensions, dynamic shared memory size, + * and stream can be provided with the following two fields of \p config: + * + * \p config->attrs is an array of \p config->numAttrs contiguous + * ::cudaLaunchAttribute elements. The value of this pointer is not considered + * if \p config->numAttrs is zero. However, in that case, it is recommended to + * set the pointer to NULL. + * \p config->numAttrs is the number of attributes populating the first + * \p config->numAttrs positions of the \p config->attrs array. + * + * The kernel arguments should be passed as arguments to this function via the + * \p args parameter pack. + * + * The C API version of this function, \p cudaLaunchKernelExC, is also available + * for pre-C++11 compilers and for use cases where the ability to pass kernel + * parameters via void* array is preferable. + * + * \param config - Launch configuration + * \param func - Kernel to launch + * \param args - Parameter pack of kernel parameters + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed, + * ::cudaErrorInvalidPtx, + * ::cudaErrorUnsupportedPtxVersion, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound, + * ::cudaErrorJitCompilationDisabled + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args) "cudaLaunchKernelEx (C API)", + * ::cuLaunchKernelEx + */ +template +static __inline__ __host__ cudaError_t cudaLaunchKernelEx( + const cudaLaunchConfig_t *config, + void (*kernel)(ExpTypes...), + ActTypes &&... args +) +{ + return [&](ExpTypes... coercedArgs){ + void *pArgs[] = { &coercedArgs... 
}; + return ::cudaLaunchKernelExC(config, (const void *)kernel, pArgs); + }(std::forward(args)...); +} +# endif + +/** + *\brief Launches a device function + * + * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y + * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × + * \p blockDim.y × \p blockDim.z) threads. + * + * The device on which this kernel is invoked must have a non-zero value for + * the device attribute ::cudaDevAttrCooperativeLaunch. + * + * The total number of blocks launched cannot exceed the maximum number of blocks per + * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. + * + * The kernel cannot make use of CUDA dynamic parallelism. + * + * If the kernel has N parameters the \p args should point to array of N pointers. + * Each pointer, from args[0] to args[N - 1], point to the region + * of memory from which the actual parameter will be copied. + * + * \p sharedMem sets the amount of dynamic shared memory that will be available to + * each thread block. + * + * \p stream specifies a stream the invocation is associated to. 
+ * + * \param func - Device function symbol + * \param gridDim - Grid dimentions + * \param blockDim - Block dimentions + * \param args - Arguments + * \param sharedMem - Shared memory (defaults to 0) + * \param stream - Stream identifier (defaults to NULL) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaLaunchCooperativeKernel( + const T *func, + dim3 gridDim, + dim3 blockDim, + void **args, + size_t sharedMem = 0, + cudaStream_t stream = 0 +) +{ + return ::cudaLaunchCooperativeKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); +} + +/** + * \brief \hl Creates an event object with the specified flags + * + * Creates an event object with the specified flags. Valid flags include: + * - ::cudaEventDefault: Default event creation flag. + * - ::cudaEventBlockingSync: Specifies that event should use blocking + * synchronization. A host thread that uses ::cudaEventSynchronize() to wait + * on an event created with this flag will block until the event actually + * completes. + * - ::cudaEventDisableTiming: Specifies that the created event does not need + * to record timing data. Events created with this flag specified and + * the ::cudaEventBlockingSync flag not specified will provide the best + * performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery(). 
+ * + * \param event - Newly created event + * \param flags - Flags for new event + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorLaunchFailure, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery, + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cudaStreamWaitEvent + */ +static __inline__ __host__ cudaError_t cudaEventCreate( + cudaEvent_t *event, + unsigned int flags +) +{ + return ::cudaEventCreateWithFlags(event, flags); +} + +/** + * \brief \hl Allocates page-locked memory on the host + * + * Allocates \p size bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it + * can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to allocate staging areas for data exchange between host + * and device. + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaHostAllocDefault: This flag's value is defined to be 0. + * - ::cudaHostAllocPortable: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space. + * The device pointer to the memory may be obtained by calling + * ::cudaHostGetDevicePointer(). 
+ * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC). + * WC memory can be transferred across the PCI Express bus more quickly on some + * system configurations, but cannot be read efficiently by most CPUs. WC + * memory is a good option for buffers that will be written by the CPU and read + * by the device via mapped pinned memory or host->device transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost + * flag in order for the ::cudaHostAllocMapped flag to have any effect. + * + * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices + * that do not support mapped pinned memory. The failure is deferred to + * ::cudaHostGetDevicePointer() because the memory may be mapped into other + * CUDA contexts via the ::cudaHostAllocPortable flag. + * + * Memory allocated by this function must be freed with ::cudaFreeHost(). 
+ * + * \param ptr - Device pointer to allocated memory + * \param size - Requested allocation size in bytes + * \param flags - Requested properties of allocated memory + * + * \return + * ::cudaSuccess, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaSetDeviceFlags, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc + */ +static __inline__ __host__ cudaError_t cudaMallocHost( + void **ptr, + size_t size, + unsigned int flags +) +{ + return ::cudaHostAlloc(ptr, size, flags); +} + +template +static __inline__ __host__ cudaError_t cudaHostAlloc( + T **ptr, + size_t size, + unsigned int flags +) +{ + return ::cudaHostAlloc((void**)(void*)ptr, size, flags); +} + +template +static __inline__ __host__ cudaError_t cudaHostGetDevicePointer( + T **pDevice, + void *pHost, + unsigned int flags +) +{ + return ::cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags); +} + +/** + * \brief Allocates memory that will be automatically managed by the Unified Memory system + * + * Allocates \p size bytes of managed memory on the device and returns in + * \p *devPtr a pointer to the allocated memory. If the device doesn't support + * allocating managed memory, ::cudaErrorNotSupported is returned. Support + * for managed memory can be queried using the device attribute + * ::cudaDevAttrManagedMemory. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p size + * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer + * is valid on the CPU and on all GPUs in the system that support managed memory. + * All accesses to this pointer must obey the Unified Memory programming model. + * + * \p flags specifies the default stream association for this allocation. + * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The + * default value for \p flags is ::cudaMemAttachGlobal. 
+ * If ::cudaMemAttachGlobal is specified, then this memory is accessible from + * any stream on any device. If ::cudaMemAttachHost is specified, then the + * allocation should not be accessed from devices that have a zero value for the + * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to + * ::cudaStreamAttachMemAsync will be required to enable access on such devices. + * + * If the association is later changed via ::cudaStreamAttachMemAsync to + * a single stream, the default association, as specifed during ::cudaMallocManaged, + * is restored when that stream is destroyed. For __managed__ variables, the + * default association is always ::cudaMemAttachGlobal. Note that destroying a + * stream is an asynchronous operation, and as a result, the change to default + * association won't happen until all work in the stream has completed. + * + * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree. + * + * Device memory oversubscription is possible for GPUs that have a non-zero value for the + * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on + * such GPUs may be evicted from device memory to host memory at any time by the Unified + * Memory driver in order to make room for other allocations. + * + * In a multi-GPU system where all GPUs have a non-zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this + * API returns and instead may be populated on access. In such systems, managed memory can + * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to + * maintain data locality and prevent excessive page faults to the extent possible. The application + * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application + * can also explicitly migrate memory to a desired processor's memory via + * ::cudaMemPrefetchAsync. 
+ * + * In a multi-GPU system where all of the GPUs have a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support + * with each other, the physical storage for managed memory is created on the GPU which is active + * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced + * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate + * memory among such GPUs. + * + * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and + * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess + * is zero for at least one of those GPUs, the location chosen for physical storage of managed + * memory is system-dependent. + * - On Linux, the location chosen will be device memory as long as the current set of active + * contexts are on devices that either have peer-to-peer support with each other or have a + * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * If there is an active context on a GPU that does not have a non-zero value for that device + * attribute and it does not have peer-to-peer support with the other devices that have active + * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. + * Note that this means that managed memory that is located in device memory is migrated to + * host memory if a new context is created on a GPU that doesn't have a non-zero value for + * the device attribute and does not support peer-to-peer with at least one of the other devices + * that has an active context. This in turn implies that context creation may fail if there is + * insufficient host memory to migrate all managed allocations. + * - On Windows, the physical storage is always created in 'zero-copy' or host memory. + * All GPUs will reference the data at reduced bandwidth over the PCIe bus. 
In these + * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to + * restrict CUDA to only use those GPUs that have peer-to-peer support. + * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero + * value to force the driver to always use device memory for physical storage. + * When this environment variable is set to a non-zero value, all devices used in + * that process that support managed memory have to be peer-to-peer compatible + * with each other. The error ::cudaErrorInvalidDevice will be returned if a device + * that supports managed memory is used and it is not peer-to-peer compatible with + * any of the other managed memory supporting devices that were previously used in + * that process, even if ::cudaDeviceReset has been called on those devices. These + * environment variables are described in the CUDA programming guide under the + * "CUDA environment variables" section. + * - On ARM, managed memory is not available on discrete gpu with Drive PX-2. 
+ * + * \param devPtr - Pointer to allocated device memory + * \param size - Requested allocation size in bytes + * \param flags - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal) + * + * \return + * ::cudaSuccess, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, + * ::cudaMalloc3D, ::cudaMalloc3DArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync + */ +template +static __inline__ __host__ cudaError_t cudaMallocManaged( + T **devPtr, + size_t size, + unsigned int flags = cudaMemAttachGlobal +) +{ + return ::cudaMallocManaged((void**)(void*)devPtr, size, flags); +} + +/** + * \brief Attach memory to a stream asynchronously + * + * Enqueues an operation in \p stream to specify stream association of + * \p length bytes of memory starting from \p devPtr. This function is a + * stream-ordered operation, meaning that it is dependent on, and will + * only take effect when, previous work in stream has completed. Any + * previous association is automatically replaced. + * + * \p devPtr must point to an one of the following types of memories: + * - managed memory declared using the __managed__ keyword or allocated with + * ::cudaMallocManaged. + * - a valid host-accessible region of system-allocated pageable memory. This + * type of memory may only be specified if the device associated with the + * stream reports a non-zero value for the device attribute + * ::cudaDevAttrPageableMemoryAccess. + * + * For managed allocations, \p length must be either zero or the entire + * allocation's size. Both indicate that the entire allocation's stream + * association is being changed. 
Currently, it is not possible to change stream + * association for a portion of a managed allocation. + * + * For pageable allocations, \p length must be non-zero. + * + * The stream association is specified using \p flags which must be + * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle. + * The default value for \p flags is ::cudaMemAttachSingle + * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed + * by any stream on any device. + * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee + * that it won't access the memory on the device from any stream on a device that + * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with + * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess, + * the program makes a guarantee that it will only access the memory on the device + * from \p stream. It is illegal to attach singly to the NULL stream, because the + * NULL stream is a virtual global stream and not a specific stream. An error will + * be returned in this case. + * + * When memory is associated with a single stream, the Unified Memory system will + * allow CPU access to this memory region so long as all operations in \p stream + * have completed, regardless of whether other streams are active. In effect, + * this constrains exclusive ownership of the managed memory region by + * an active GPU to per-stream activity instead of whole-GPU activity. + * + * Accessing memory on the device from streams that are not associated with + * it will produce undefined results. No error checking is performed by the + * Unified Memory system to ensure that kernels launched into other streams + * do not access this region. 
+ * + * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync + * via events, synchronization or other means to ensure legal access to memory + * at all times. Data visibility and coherency will be changed appropriately + * for all kernels which follow a stream-association change. + * + * If \p stream is destroyed while data is associated with it, the association is + * removed and the association reverts to the default visibility of the allocation + * as specified at ::cudaMallocManaged. For __managed__ variables, the default + * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an + * asynchronous operation, and as a result, the change to default association won't + * happen until all work in the stream has completed. + * + * \param stream - Stream in which to enqueue the attach operation + * \param devPtr - Pointer to memory (must be a pointer to managed memory or + * to a valid host-accessible region of system-allocated + * memory) + * \param length - Length of memory (defaults to zero) + * \param flags - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle) + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotReady, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged + */ +template<class T> +static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync( + cudaStream_t stream, + T *devPtr, + size_t length = 0, + unsigned int flags = cudaMemAttachSingle +) +{ + return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaMalloc( + T **devPtr, + size_t size +) +{ + return ::cudaMalloc((void**)(void*)devPtr, size); +} + +template<class T> +static __inline__ 
__host__ cudaError_t cudaMallocHost( + T **ptr, + size_t size, + unsigned int flags = 0 +) +{ + return cudaMallocHost((void**)(void*)ptr, size, flags); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaMallocPitch( + T **devPtr, + size_t *pitch, + size_t width, + size_t height +) +{ + return ::cudaMallocPitch((void**)(void*)devPtr, pitch, width, height); +} + +/** + * \brief Allocate from a pool + * + * This is an alternate spelling for cudaMallocFromPoolAsync + * made available through function overloading. + * + * \sa ::cudaMallocFromPoolAsync, + * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaStream_t hStream) "cudaMallocAsync (C API)" + */ +static __inline__ __host__ cudaError_t cudaMallocAsync( + void **ptr, + size_t size, + cudaMemPool_t memPool, + cudaStream_t stream +) +{ + return ::cudaMallocFromPoolAsync(ptr, size, memPool, stream); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaMallocAsync( + T **ptr, + size_t size, + cudaMemPool_t memPool, + cudaStream_t stream +) +{ + return ::cudaMallocFromPoolAsync((void**)(void*)ptr, size, memPool, stream); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaMallocAsync( + T **ptr, + size_t size, + cudaStream_t stream +) +{ + return ::cudaMallocAsync((void**)(void*)ptr, size, stream); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaMallocFromPoolAsync( + T **ptr, + size_t size, + cudaMemPool_t memPool, + cudaStream_t stream +) +{ + return ::cudaMallocFromPoolAsync((void**)(void*)ptr, size, memPool, stream); +} + +#if defined(__CUDACC__) + +/** + * \brief \hl Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice. 
+ * + * \param symbol - Device symbol reference + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync + */ +template<class T> +static __inline__ __host__ cudaError_t cudaMemcpyToSymbol( + const T &symbol, + const void *src, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyHostToDevice +) +{ + return ::cudaMemcpyToSymbol((const void*)&symbol, src, count, offset, kind); +} + +/** + * \brief \hl Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice. + * + * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If + * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy + * may overlap with operations in other streams. 
+ * + * \param symbol - Device symbol reference + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_async + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyFromSymbolAsync + */ +template<class T> +static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync( + const T &symbol, + const void *src, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyHostToDevice, + cudaStream_t stream = 0 +) +{ + return ::cudaMemcpyToSymbolAsync((const void*)&symbol, src, count, offset, kind, stream); +} + +/** + * \brief \hl Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice. 
+ * + * \param dst - Destination memory address + * \param symbol - Device symbol reference + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync + */ +template<class T> +static __inline__ __host__ cudaError_t cudaMemcpyFromSymbol( + void *dst, + const T &symbol, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost +) +{ + return ::cudaMemcpyFromSymbol(dst, (const void*)&symbol, count, offset, kind); +} + +/** + * \brief \hl Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. \p symbol is a variable that resides in + * global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice. + * + * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally be + * associated to a stream by passing a non-zero \p stream argument. If \p kind + * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap + * with operations in other streams. 
+ * + * \param dst - Destination memory address + * \param symbol - Device symbol reference + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_async + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync + */ +template<class T> +static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync( + void *dst, + const T &symbol, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost, + cudaStream_t stream = 0 +) +{ + return ::cudaMemcpyFromSymbolAsync(dst, (const void*)&symbol, count, offset, kind, stream); +} + +/** + * \brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph + * + * Creates a new memcpy node to copy to \p symbol and adds it to \p graph with + * \p numDependencies dependencies specified via \p pDependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p pGraphNode. + * + * When the graph is launched, the node will copy \p count bytes from the memory area + * pointed to by \p src to the memory area \p offset bytes from the start + * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. 
\p kind can be either + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of + * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault + * is only allowed on systems that support unified virtual addressing. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess. + * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param symbol - Device symbol address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpyToSymbol, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNodeFromSymbol, + * ::cudaGraphMemcpyNodeGetParams, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemsetNode + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGraphAddMemcpyNodeToSymbol( + cudaGraphNode_t *pGraphNode, + cudaGraph_t graph, + const cudaGraphNode_t *pDependencies, + size_t numDependencies, + const T &symbol, + const void* src, + size_t count, + size_t offset, + enum cudaMemcpyKind kind) +{ + return ::cudaGraphAddMemcpyNodeToSymbol(pGraphNode, graph, pDependencies, numDependencies, (const 
void*)&symbol, src, count, offset, kind); +} + +/** + * \brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph + * + * Creates a new memcpy node to copy from \p symbol and adds it to \p graph with + * \p numDependencies dependencies specified via \p pDependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p pGraphNode. + * + * When the graph is launched, the node will copy \p count bytes from the memory area + * \p offset bytes from the start of symbol \p symbol to the memory area + * pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable + * that resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer + * is inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess. 
+ * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param dst - Destination memory address + * \param symbol - Device symbol address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpyFromSymbol, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNodeToSymbol, + * ::cudaGraphMemcpyNodeGetParams, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemsetNode + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGraphAddMemcpyNodeFromSymbol( + cudaGraphNode_t* pGraphNode, + cudaGraph_t graph, + const cudaGraphNode_t* pDependencies, + size_t numDependencies, + void* dst, + const T &symbol, + size_t count, + size_t offset, + enum cudaMemcpyKind kind) +{ + return ::cudaGraphAddMemcpyNodeFromSymbol(pGraphNode, graph, pDependencies, numDependencies, dst, (const void*)&symbol, count, offset, kind); +} + +/** + * \brief Sets a memcpy node's parameters to copy to a symbol on the device + * + * Sets the parameters of memcpy node \p node to the copy described by the provided parameters. + * + * When the graph is launched, the node will copy \p count bytes from the memory area + * pointed to by \p src to the memory area \p offset bytes from the start + * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. 
\p kind can be either + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of + * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault + * is only allowed on systems that support unified virtual addressing. + * + * \param node - Node to set the parameters for + * \param symbol - Device symbol address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpyToSymbol, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphMemcpyNodeGetParams + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGraphMemcpyNodeSetParamsToSymbol( + cudaGraphNode_t node, + const T &symbol, + const void* src, + size_t count, + size_t offset, + enum cudaMemcpyKind kind) +{ + return ::cudaGraphMemcpyNodeSetParamsToSymbol(node, (const void*)&symbol, src, count, offset, kind); +} + +/** + * \brief Sets a memcpy node's parameters to copy from a symbol on the device + * + * Sets the parameters of memcpy node \p node to the copy described by the provided parameters. + * + * When the graph is launched, the node will copy \p count bytes from the memory area + * \p offset bytes from the start of symbol \p symbol to the memory area + * pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable + * that resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer + * is inferred from the pointer values. 
However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * \param node - Node to set the parameters for + * \param dst - Destination memory address + * \param symbol - Device symbol address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpyFromSymbol, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphMemcpyNodeGetParams + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGraphMemcpyNodeSetParamsFromSymbol( + cudaGraphNode_t node, + void* dst, + const T &symbol, + size_t count, + size_t offset, + enum cudaMemcpyKind kind) +{ + return ::cudaGraphMemcpyNodeSetParamsFromSymbol(node, dst, (const void*)&symbol, count, offset, kind); +} + +/** + * \brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the device + * + * Updates the work represented by \p node in \p hGraphExec as though \p node had + * contained the given params at instantiation. \p node must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from \p node are ignored. + * + * \p src and \p symbol must be allocated from the same contexts as the original source and + * destination memory. The instantiation-time memory operands must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p node is also + * not modified by this call. + * + * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or + * the original memory operands are multidimensional. 
+ * + * \param hGraphExec - The executable graph in which to set the specified node + * \param node - Memcpy node from the graph which was used to instantiate graphExec + * \param symbol - Device symbol address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNodeToSymbol, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphInstantiate, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecHostNodeSetParams + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGraphExecMemcpyNodeSetParamsToSymbol( + cudaGraphExec_t hGraphExec, + cudaGraphNode_t node, + const T &symbol, + const void* src, + size_t count, + size_t offset, + enum cudaMemcpyKind kind) +{ + return ::cudaGraphExecMemcpyNodeSetParamsToSymbol(hGraphExec, node, (const void*)&symbol, src, count, offset, kind); +} + +/** + * \brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the device + * + * Updates the work represented by \p node in \p hGraphExec as though \p node had + * contained the given params at instantiation. \p node must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from \p node are ignored. + * + * \p symbol and \p dst must be allocated from the same contexts as the original source and + * destination memory. The instantiation-time memory operands must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. 
Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p node is also + * not modified by this call. + * + * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or + * the original memory operands are multidimensional. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param node - Memcpy node from the graph which was used to instantiate graphExec + * \param dst - Destination memory address + * \param symbol - Device symbol address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNodeFromSymbol, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphInstantiate, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParamsToSymbol, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecHostNodeSetParams + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGraphExecMemcpyNodeSetParamsFromSymbol( + cudaGraphExec_t hGraphExec, + cudaGraphNode_t node, + void* dst, + const T &symbol, + size_t count, + size_t offset, + enum cudaMemcpyKind kind) +{ + return ::cudaGraphExecMemcpyNodeSetParamsFromSymbol(hGraphExec, node, dst, (const void*)&symbol, count, offset, kind); +} + +#if __cplusplus >= 201103 + +/** + * \brief Creates a user object by wrapping a C++ object + * + * Forwards to ::cudaUserObjectCreate, supplying a destroy callback that calls delete on \p objectToWrap. + * + * \param object_out - Location to return the user object handle + * \param objectToWrap - This becomes the \p ptr argument to ::cudaUserObjectCreate. A + * lambda will be passed for the \p destroy argument, which calls + * delete on this object pointer. 
+ * \param initialRefcount - The initial refcount to create the object with, typically 1. The + * initial references are owned by the calling thread. + * \param flags - Currently it is required to pass cudaUserObjectNoDestructorSync, + * which is the only defined flag. This indicates that the destroy + * callback cannot be waited on by any CUDA API. Users requiring + * synchronization of the callback should signal its completion + * manually. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * + * \sa + * ::cudaUserObjectCreate + */ +template +static __inline__ __host__ cudaError_t cudaUserObjectCreate( + cudaUserObject_t *object_out, + T *objectToWrap, + unsigned int initialRefcount, + unsigned int flags) +{ + return ::cudaUserObjectCreate( + object_out, + objectToWrap, + [](void *vpObj) { delete reinterpret_cast(vpObj); }, + initialRefcount, + flags); +} + +template +static __inline__ __host__ cudaError_t cudaUserObjectCreate( + cudaUserObject_t *object_out, + T *objectToWrap, + unsigned int initialRefcount, + cudaUserObjectFlags flags) +{ + return cudaUserObjectCreate(object_out, objectToWrap, initialRefcount, (unsigned int)flags); +} + +#endif + +/** + * \brief \hl Finds the address associated with a CUDA symbol + * + * Returns in \p *devPtr the address of symbol \p symbol on the device. + * \p symbol can either be a variable that resides in global or constant memory space. + * If \p symbol cannot be found, or if \p symbol is not declared + * in the global or constant memory space, \p *devPtr is unchanged and the error + * ::cudaErrorInvalidSymbol is returned. 
+ * + * \param devPtr - Return device pointer associated with symbol + * \param symbol - Device symbol reference + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)", + * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaGetSymbolAddress( + void **devPtr, + const T &symbol +) +{ + return ::cudaGetSymbolAddress(devPtr, (const void*)&symbol); +} + +/** + * \brief \hl Finds the size of the object associated with a CUDA symbol + * + * Returns in \p *size the size of symbol \p symbol. \p symbol must be a + * variable that resides in global or constant memory space. + * If \p symbol cannot be found, or if \p symbol is not declared + * in global or constant memory space, \p *size is unchanged and the error + * ::cudaErrorInvalidSymbol is returned. + * + * \param size - Size of object associated with symbol + * \param symbol - Device symbol reference + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)", + * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaGetSymbolSize( + size_t *size, + const T &symbol +) +{ + return ::cudaGetSymbolSize(size, (const void*)&symbol); +} + +/** + * \brief \hl Binds a memory area to a texture + * + * Binds \p size bytes of the memory area pointed to by \p devPtr to texture + * reference \p tex. \p desc describes how the memory is interpreted when + * fetching values from the texture. 
The \p offset parameter is an optional + * byte offset as with the low-level + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()" + * function. Any memory previously bound to \p tex is unbound. + * + * \param offset - Offset in bytes + * \param tex - Texture to bind + * \param devPtr - Memory area on device + * \param desc - Channel format + * \param size - Size of the memory area pointed to by devPtr + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template<class T, int dim, enum cudaTextureReadMode readMode> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture( + size_t *offset, + 
const struct texture<T, dim, readMode> &tex, + const void *devPtr, + const struct cudaChannelFormatDesc &desc, + size_t size = UINT_MAX +) +{ + return ::cudaBindTexture(offset, &tex, devPtr, &desc, size); +} + +/** + * \brief \hl Binds a memory area to a texture + * + * Binds \p size bytes of the memory area pointed to by \p devPtr to texture + * reference \p tex. The channel descriptor is inherited from the texture + * reference type. The \p offset parameter is an optional byte offset as with + * the low-level + * ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) + * function. Any memory previously bound to \p tex is unbound. + * + * \param offset - Offset in bytes + * \param tex - Texture to bind + * \param devPtr - Memory area on device + * \param size - Size of the memory area pointed to by devPtr + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) 
"cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template<class T, int dim, enum cudaTextureReadMode readMode> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture( + size_t *offset, + const struct texture<T, dim, readMode> &tex, + const void *devPtr, + size_t size = UINT_MAX +) +{ + return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size); +} + +/** + * \brief \hl Binds a 2D memory area to a texture + * + * Binds the 2D memory area pointed to by \p devPtr to the + * texture reference \p tex. The size of the area is constrained by + * \p width in texel units, \p height in texel units, and \p pitch in byte + * units. \p desc describes how the memory is interpreted when fetching values + * from the texture. Any memory previously bound to \p tex is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D()" + * returns in \p *offset a byte offset that + * must be applied to texture fetches in order to read from the desired memory. + * This offset must be divided by the texel size and passed to kernels that + * read from the texture so they can be applied to the ::tex2D() function. + * If the device memory pointer was returned from ::cudaMalloc(), the offset is + * guaranteed to be 0 and NULL may be passed as the \p offset parameter. 
+ * + * \param offset - Offset in bytes + * \param tex - Texture reference to bind + * \param devPtr - 2D memory area on device + * \param desc - Channel format + * \param width - Width in texel units + * \param height - Height in texel units + * \param pitch - Pitch in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture2D( + size_t *offset, + const struct texture &tex, + const void *devPtr, + const struct cudaChannelFormatDesc &desc, + size_t width, + size_t height, + size_t pitch +) +{ + return ::cudaBindTexture2D(offset, &tex, devPtr, &desc, width, height, 
pitch); +} + +/** + * \brief \hl Binds a 2D memory area to a texture + * + * Binds the 2D memory area pointed to by \p devPtr to the + * texture reference \p tex. The size of the area is constrained by + * \p width in texel units, \p height in texel units, and \p pitch in byte + * units. The channel descriptor is inherited from the texture reference + * type. Any memory previously bound to \p tex is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D()" + * returns in \p *offset a byte offset that + * must be applied to texture fetches in order to read from the desired memory. + * This offset must be divided by the texel size and passed to kernels that + * read from the texture so they can be applied to the ::tex2D() function. + * If the device memory pointer was returned from ::cudaMalloc(), the offset is + * guaranteed to be 0 and NULL may be passed as the \p offset parameter. 
+ * + * \param offset - Offset in bytes + * \param tex - Texture reference to bind + * \param devPtr - 2D memory area on device + * \param width - Width in texel units + * \param height - Height in texel units + * \param pitch - Pitch in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture2D( + size_t *offset, + const struct texture &tex, + const void *devPtr, + size_t width, + size_t height, + size_t pitch +) +{ + return ::cudaBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch); +} + +/** + * \brief \hl Binds an array to a texture + 
* + * Binds the CUDA array \p array to the texture reference \p tex. + * \p desc describes how the memory is interpreted when fetching values from + * the texture. Any CUDA array previously bound to \p tex is unbound. + * + * \param tex - Texture to bind + * \param array - Memory array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToArray( + const struct texture &tex, + cudaArray_const_t array, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindTextureToArray(&tex, array, &desc); +} + +/** + * \brief \hl 
Binds an array to a texture + * + * Binds the CUDA array \p array to the texture reference \p tex. + * The channel descriptor is inherited from the CUDA array. Any CUDA array + * previously bound to \p tex is unbound. + * + * \param tex - Texture to bind + * \param array - Memory array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToArray( + const struct texture &tex, + cudaArray_const_t array +) +{ + struct cudaChannelFormatDesc desc; + cudaError_t err = ::cudaGetChannelDesc(&desc, array); + + return err == cudaSuccess ? 
cudaBindTextureToArray(tex, array, desc) : err; +} + +/** + * \brief \hl Binds a mipmapped array to a texture + * + * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex. + * \p desc describes how the memory is interpreted when fetching values from + * the texture. Any CUDA mipmapped array previously bound to \p tex is unbound. + * + * \param tex - Texture to bind + * \param mipmappedArray - Memory mipmapped array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray( + const struct 
texture &tex, + cudaMipmappedArray_const_t mipmappedArray, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindTextureToMipmappedArray(&tex, mipmappedArray, &desc); +} + +/** + * \brief \hl Binds a mipmapped array to a texture + * + * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex. + * The channel descriptor is inherited from the CUDA array. Any CUDA mipmapped array + * previously bound to \p tex is unbound. + * + * \param tex - Texture to bind + * \param mipmappedArray - Memory mipmapped array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __CUDA_DEPRECATED 
__inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray( + const struct texture &tex, + cudaMipmappedArray_const_t mipmappedArray +) +{ + struct cudaChannelFormatDesc desc; + cudaArray_t levelArray; + cudaError_t err = ::cudaGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0); + + if (err != cudaSuccess) { + return err; + } + err = ::cudaGetChannelDesc(&desc, levelArray); + + return err == cudaSuccess ? cudaBindTextureToMipmappedArray(tex, mipmappedArray, desc) : err; +} + +/** + * \brief \hl Unbinds a texture + * + * Unbinds the texture bound to \p tex. If \p texref is not currently bound, no operation is performed. + * + * \param tex - Texture to unbind + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) 
"cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaUnbindTexture( + const struct texture &tex +) +{ + return ::cudaUnbindTexture(&tex); +} + +/** + * \brief \hl Get the alignment offset of a texture + * + * Returns in \p *offset the offset that was returned when texture reference + * \p tex was bound. + * + * \param offset - Offset of texture reference in bytes + * \param tex - Texture to get offset of + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidTexture, + * ::cudaErrorInvalidTextureBinding + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset( + size_t *offset, + const 
struct texture &tex +) +{ + return ::cudaGetTextureAlignmentOffset(offset, &tex); +} + +/** + * \brief \hl Sets the preferred cache configuration for a device function + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p cacheConfig the preferred cache configuration + * for the function specified via \p func. This is only a preference. The + * runtime will use the requested configuration if possible, but it is free to + * choose a different configuration if required to execute \p func. + * + * \p func must be a pointer to a function that executes on the device. + * The parameter specified by \p func must be declared as a \p __global__ + * function. If the specified function does not exist, + * then ::cudaErrorInvalidDeviceFunction is returned. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. 
+ * + * The supported cache configurations are: + * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) + * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache + * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory + * + * \param func - device function pointer + * \param cacheConfig - Requested cache configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost, + * ::cudaThreadGetCacheConfig, + * ::cudaThreadSetCacheConfig + */ +template +static __inline__ __host__ cudaError_t cudaFuncSetCacheConfig( + T *func, + enum cudaFuncCache cacheConfig +) +{ + return ::cudaFuncSetCacheConfig((const void*)func, cacheConfig); +} + +template +static __inline__ __host__ cudaError_t cudaFuncSetSharedMemConfig( + T *func, + enum cudaSharedMemConfig config +) +{ + return ::cudaFuncSetSharedMemConfig((const void*)func, config); +} + +#endif // __CUDACC__ + +/** + * \brief Returns occupancy for a device function + * + * Returns in \p *numBlocks the maximum number of active blocks per + * streaming multiprocessor for the device function. 
+ * + * \param numBlocks - Returned occupancy + * \param func - Kernel function for which occupancy is calulated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock + */ +template +static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor( + int *numBlocks, + T func, + int blockSize, + size_t dynamicSMemSize) +{ + return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault); +} + +/** + * \brief Returns occupancy for a device function with the specified flags + * + * Returns in \p *numBlocks the maximum number of active blocks per + * streaming multiprocessor for the device function. + * + * The \p flags parameter controls how special cases are handled. Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * + * - ::cudaOccupancyDisableCachingOverride: suppresses the default behavior + * on platform where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. + * Setting this flag makes the occupancy calculator to return 0 in such cases. 
+ * More information can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel function for which occupancy is calulated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock + */ +template +static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int *numBlocks, + T func, + int blockSize, + size_t dynamicSMemSize, + unsigned int flags) +{ + return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, flags); +} + +/** + * Helper functor for cudaOccupancyMaxPotentialBlockSize + */ +class __cudaOccupancyB2DHelper { + size_t n; +public: + inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {} + inline __host__ CUDART_DEVICE size_t operator()(int) + { + return n; + } +}; + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). 
+ * + * The \p flags parameter controls how special cases are handled. Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * + * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior + * on platform where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. + * Setting this flag makes the occupancy calculator to return 0 in such cases. + * More information can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. 
+ * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock + */ + +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags( + int *minGridSize, + int *blockSize, + T func, + UnaryFunction blockSizeToDynamicSMemSize, + int blockSizeLimit = 0, + unsigned int flags = 0) +{ + cudaError_t status; + + // Device and function properties + int device; + struct cudaFuncAttributes attr; + + // Limits + int maxThreadsPerMultiProcessor; + int warpSize; + int devMaxThreadsPerBlock; + int multiProcessorCount; + int funcMaxThreadsPerBlock; + int occupancyLimit; + int granularity; + + // Recorded maximum (best configuration seen so far) + int maxBlockSize = 0; + int numBlocks = 0; + int maxOccupancy = 0; + + // Temporary (per-iteration scratch values) + int blockSizeToTryAligned; + int blockSizeToTry; + int blockSizeLimitAligned; + int occupancyInBlocks; + int occupancyInThreads; + size_t dynamicSMemSize; + + /////////////////////////// + // Check user input (null outputs or a null func are rejected) + /////////////////////////// + + if (!minGridSize || !blockSize || !func) { + return cudaErrorInvalidValue; + } + + ////////////////////////////////////////////// + // Obtain device and function properties + ////////////////////////////////////////////// + + status = ::cudaGetDevice(&device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &maxThreadsPerMultiProcessor, + cudaDevAttrMaxThreadsPerMultiProcessor, + device); + if (status != 
cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &warpSize, + cudaDevAttrWarpSize, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &devMaxThreadsPerBlock, + cudaDevAttrMaxThreadsPerBlock, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &multiProcessorCount, + cudaDevAttrMultiProcessorCount, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaFuncGetAttributes(&attr, func); + if (status != cudaSuccess) { + return status; + } + + funcMaxThreadsPerBlock = attr.maxThreadsPerBlock; + + ///////////////////////////////////////////////////////////////////////////////// + // Try each block size, and pick the block size with maximum occupancy + ///////////////////////////////////////////////////////////////////////////////// + + occupancyLimit = maxThreadsPerMultiProcessor; + granularity = warpSize; + + if (blockSizeLimit == 0) { /* 0 means no limit: start from the device maximum */ + blockSizeLimit = devMaxThreadsPerBlock; + } + + if (devMaxThreadsPerBlock < blockSizeLimit) { + blockSizeLimit = devMaxThreadsPerBlock; + } + + if (funcMaxThreadsPerBlock < blockSizeLimit) { + blockSizeLimit = funcMaxThreadsPerBlock; + } + + blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity; /* round up to a multiple of granularity (the warp size) */ + + for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { + // This is needed for the first iteration, because + // blockSizeLimitAligned could be greater than blockSizeLimit + // + if (blockSizeLimit < blockSizeToTryAligned) { + blockSizeToTry = blockSizeLimit; + } else { + blockSizeToTry = blockSizeToTryAligned; + } + + dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry); + + status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + &occupancyInBlocks, + func, + blockSizeToTry, + dynamicSMemSize, + flags); + + if (status != cudaSuccess) { + return status; + } + + 
occupancyInThreads = blockSizeToTry * occupancyInBlocks; + + if (occupancyInThreads > maxOccupancy) { /* strict comparison: on ties the earlier, larger block size is kept */ + maxBlockSize = blockSizeToTry; + numBlocks = occupancyInBlocks; + maxOccupancy = occupancyInThreads; + } + + // Early out if we have reached the maximum (occupancyLimit is maxThreadsPerMultiProcessor) + // + if (occupancyLimit == maxOccupancy) { + break; + } + } + + /////////////////////////// + // Return best available + /////////////////////////// + + // Suggested min grid size to achieve a full machine launch (best blocks per multiprocessor times the multiprocessor count) + // + *minGridSize = numBlocks * multiProcessorCount; + *blockSize = maxBlockSize; + + return status; +} + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blockSize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. 
+ * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock + */ + +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem( + int *minGridSize, + int *blockSize, + T func, + UnaryFunction blockSizeToDynamicSMemSize, + int blockSizeLimit = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, blockSizeLimit, cudaOccupancyDefault); /* forwards all arguments, passing cudaOccupancyDefault as the flags */ +} + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blockSize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the + * amount of per-block dynamic shared memory changes with different + * block sizes. + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. 
+ * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock + */ +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize( + int *minGridSize, + int *blockSize, + T func, + size_t dynamicSMemSize = 0, + int blockSizeLimit = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault); /* NOTE(review): __cudaOccupancyB2DHelper is defined outside this view; presumably it maps every block size to the fixed dynamicSMemSize - confirm */ +} + +/** + * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on an SM. + * + * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. 
+ * + * \param dynamicSmemSize - Returned maximum dynamic shared memory + * \param func - Kernel function for which occupancy is calculated + * \param numBlocks - Number of blocks to fit on an SM + * \param blockSize - Size of the block + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + */ +template +static __inline__ __host__ cudaError_t cudaOccupancyAvailableDynamicSMemPerBlock( + size_t *dynamicSmemSize, + T func, + int numBlocks, + int blockSize) +{ + return ::cudaOccupancyAvailableDynamicSMemPerBlock(dynamicSmemSize, (const void*)func, numBlocks, blockSize); /* typed convenience overload: casts func to const void* and calls the global-namespace function of the same name */ +} + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function with the specified flags + * + * Returns in \p *minGridSize and \p *blockSize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * The \p flags parameter controls how special cases are handled. Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxPotentialBlockSize + * + * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior + * on platforms where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. 
+ * Setting this flag makes the occupancy calculator return 0 in such cases. + * More information can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the + * amount of per-block dynamic shared memory changes with different + * block sizes. + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock + */ +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeWithFlags( + int *minGridSize, + int *blockSize, + T func, + size_t dynamicSMemSize = 0, + int blockSizeLimit = 0, + unsigned int flags = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, flags); /* same delegation as cudaOccupancyMaxPotentialBlockSize, but with the caller's flags */ +} + +/** + * \brief Given the kernel function (\p func) and launch configuration + * (\p config), return the maximum cluster size in \p *clusterSize. + * + * The cluster dimensions in \p config are ignored. 
If func has a required + * cluster size set (see ::cudaFuncGetAttributes), \p *clusterSize will reflect + * the required cluster size. + * + * By default this function will always return a value that's portable on + * future hardware. A higher value may be returned if the kernel function + * allows non-portable cluster sizes. + * + * This function will respect the compile-time launch bounds. + * + * \param clusterSize - Returned maximum cluster size that can be launched + * for the given kernel function and launch configuration + * \param func - Kernel function for which maximum cluster + * size is calculated + * \param config - Launch configuration for the given kernel function + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaFuncGetAttributes + */ +template +static __inline__ __host__ cudaError_t cudaOccupancyMaxPotentialClusterSize( + int *clusterSize, + T *func, + const cudaLaunchConfig_t *config) +{ + return ::cudaOccupancyMaxPotentialClusterSize(clusterSize, (const void*)func, config); /* typed convenience overload: casts func to const void* and calls the global-namespace function of the same name */ +} + +/** + * \brief Given the kernel function (\p func) and launch configuration + * (\p config), return the maximum number of clusters that could co-exist + * on the target device in \p *numClusters. + * + * If the function has a required cluster size already set (see + * ::cudaFuncGetAttributes), the cluster size from config must either be + * unspecified or match the required size. + * Without required sizes, the cluster size must be specified in config, + * else the function will return an error. + * + * Note that various attributes of the kernel function may affect occupancy + * calculation. Runtime environment may affect how the hardware schedules + * the clusters, so the calculated occupancy is not guaranteed to be achievable. 
+ * + * \param numClusters - Returned maximum number of clusters that + * could co-exist on the target device + * \param func - Kernel function for which maximum number + * of clusters is calculated + * \param config - Launch configuration for the given kernel function + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidClusterSize, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaFuncGetAttributes + */ +template +static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveClusters( + int *numClusters, + T *func, + const cudaLaunchConfig_t *config) +{ + return ::cudaOccupancyMaxActiveClusters(numClusters, (const void*)func, config); /* typed convenience overload: casts func to const void* and calls the global-namespace function of the same name */ +} + +#if defined __CUDACC__ + +/** + * \brief \hl Find out attributes for a given function + * + * This function obtains the attributes of a function specified via \p entry. + * The parameter \p entry must be a pointer to a function that executes + * on the device. The parameter specified by \p entry must be declared as a \p __global__ + * function. The fetched attributes are placed in \p attr. If the specified + * function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. + * + * Note that some function attributes such as + * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock" + * may vary based on the device that is currently being used. 
+ * + * \param attr - Return pointer to function's attributes + * \param entry - Function to get attributes of + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost + */ +template +static __inline__ __host__ cudaError_t cudaFuncGetAttributes( + struct cudaFuncAttributes *attr, + T *entry +) +{ + return ::cudaFuncGetAttributes(attr, (const void*)entry); /* typed convenience overload: casts entry to const void* and calls the C API function of the same name */ +} + +/** + * \brief \hl Set attributes for a given function + * + * This function sets the attributes of a function specified via \p entry. + * The parameter \p entry must be a pointer to a function that executes + * on the device. The parameter specified by \p entry must be declared as a \p __global__ + * function. The enumeration defined by \p attr is set to the value defined by \p value. + * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. + * If the specified attribute cannot be written, or if the value is incorrect, + * then ::cudaErrorInvalidValue is returned. + * + * Valid values for \p attr are: + * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. The sum of this value and the function attribute ::sharedSizeBytes + * cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture. 
+ * - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * - ::cudaFuncAttributeRequiredClusterWidth: The required cluster width in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return cudaErrorNotPermitted. + * - ::cudaFuncAttributeRequiredClusterHeight: The required cluster height in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return cudaErrorNotPermitted. + * - ::cudaFuncAttributeRequiredClusterDepth: The required cluster depth in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return cudaErrorNotPermitted. + * - ::cudaFuncAttributeClusterSchedulingPolicyPreference: The block + * scheduling policy of a function. The value type is cudaClusterSchedulingPolicy. 
+ * + * \param entry - Function to set attributes of + * \param attr - Attribute to set + * \param value - Value to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost + */ +template +static __inline__ __host__ cudaError_t cudaFuncSetAttribute( + T *entry, + enum cudaFuncAttribute attr, + int value +) +{ + return ::cudaFuncSetAttribute((const void*)entry, attr, value); /* typed convenience overload: casts entry to const void* and calls the global-namespace function of the same name */ +} + +/** + * \brief \hl Binds an array to a surface + * + * Binds the CUDA array \p array to the surface reference \p surf. + * \p desc describes how the memory is interpreted when dealing with + * the surface. Any CUDA array previously bound to \p surf is unbound. 
+ * + * \param surf - Surface to bind + * \param array - Memory array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSurface + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", + * \ref ::cudaBindSurfaceToArray(const struct surface&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindSurfaceToArray( + const struct surface &surf, + cudaArray_const_t array, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindSurfaceToArray(&surf, array, &desc); /* reference-taking overload (declared with __CUDA_DEPRECATED): passes the addresses of surf and desc to the pointer-based function */ +} + +/** + * \brief \hl Binds an array to a surface + * + * Binds the CUDA array \p array to the surface reference \p surf. + * The channel descriptor is inherited from the CUDA array (queried with ::cudaGetChannelDesc; if that query fails, its error is returned and no binding is attempted). Any CUDA array + * previously bound to \p surf is unbound. + * + * \param surf - Surface to bind + * \param array - Memory array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSurface + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", + * \ref ::cudaBindSurfaceToArray(const struct surface&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindSurfaceToArray( + const struct surface &surf, + cudaArray_const_t array +) +{ + struct cudaChannelFormatDesc desc; + cudaError_t err = ::cudaGetChannelDesc(&desc, array); + + return err == cudaSuccess ? 
cudaBindSurfaceToArray(surf, array, desc) : err; /* bind only when the descriptor query succeeded; otherwise propagate its error */ +} + +#endif /* __CUDACC__ */ + +/** @} */ /* END CUDART_HIGHLEVEL */ + +#endif /* __cplusplus && !__CUDACC_RTC__ */ + +#if !defined(__CUDACC_RTC__) +#if defined(__GNUC__) +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) +#pragma GCC diagnostic pop +#endif +#elif defined(_MSC_VER) +#pragma warning(pop) +#endif +#endif + +#undef __CUDA_DEPRECATED + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__ +#endif + +#endif /* !__CUDA_RUNTIME_H__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h new file mode 100644 index 0000000000000000000000000000000000000000..21c8d944930456a914ba86e0972dbf6ba1bc5bba --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h @@ -0,0 +1,109 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__CUDA_TEXTURE_TYPES_H__) +#define __CUDA_TEXTURE_TYPES_H__ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#if !defined(__CUDACC_RTC__) +#define EXCLUDE_FROM_RTC +#include "channel_descriptor.h" +#undef EXCLUDE_FROM_RTC +#endif /* !__CUDACC_RTC__ */ +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +template +struct __device_builtin_texture_type__ texture : public textureReference +{ +#if !defined(__CUDACC_RTC__) + __host__ texture(int norm = 0, + enum cudaTextureFilterMode fMode = cudaFilterModePoint, + enum cudaTextureAddressMode aMode = cudaAddressModeClamp) + { + normalized = norm; + filterMode = fMode; + addressMode[0] = aMode; + addressMode[1] = aMode; + addressMode[2] = aMode; /* the same mode is stored in all three addressMode entries */ + channelDesc = cudaCreateChannelDesc(); /* descriptor obtained from cudaCreateChannelDesc() rather than from the caller */ + sRGB = 0; + } + + __host__ texture(int norm, + enum cudaTextureFilterMode fMode, + enum cudaTextureAddressMode aMode, + struct cudaChannelFormatDesc desc) + { + normalized = norm; + filterMode = fMode; + addressMode[0] = aMode; + addressMode[1] = aMode; + addressMode[2] = aMode; /* the same mode is stored in all three addressMode entries */ + channelDesc = desc; /* caller-supplied channel descriptor */ + sRGB = 0; + } +#endif /* !__CUDACC_RTC__ */ +}; + +#endif /* __cplusplus && __CUDACC__ */ + +#endif /* !__CUDA_TEXTURE_TYPES_H__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..50e427c39b164e6e145c3930cd66cc03806175a6 --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp @@ -0,0 +1,224 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__) +#define __DEVICE_ATOMIC_FUNCTIONS_HPP__ + +#if defined(__CUDACC_RTC__) +#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* __CUDACC_RTC__ */ +#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val) +{ + return __iAtomicAdd(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val) +{ + return __uAtomicAdd(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val) +{ + return __iAtomicAdd(address, (unsigned int)-(int)val); /* subtraction is issued as an atomic add of the negated value */ +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val) +{ + return __uAtomicAdd(address, (unsigned int)-(int)val); /* subtraction is issued as an atomic add of the negated value */ +} 
+ +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val) +{ + return __iAtomicExch(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val) +{ + return __uAtomicExch(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val) +{ + return __fAtomicExch(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val) +{ + return __iAtomicMin(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val) +{ + return __uAtomicMin(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val) +{ + return __iAtomicMax(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val) +{ + return __uAtomicMax(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val) +{ + return __uAtomicInc(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val) +{ + return __uAtomicDec(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val) +{ + return __iAtomicAnd(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val) +{ + return __uAtomicAnd(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val) +{ + return __iAtomicOr(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val) +{ + return __uAtomicOr(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val) +{ + return __iAtomicXor(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val) +{ + return __uAtomicXor(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ 
int atomicCAS(int *address, int compare, int val) +{ + return __iAtomicCAS(address, compare, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) +{ + return __uAtomicCAS(address, compare, val); +} + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" /* NOTE(review): the same header is already included earlier in this file; presumably include-guarded, so this repeat is redundant - confirm */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val) +{ + return __ullAtomicAdd(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val) +{ + return __ullAtomicExch(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val) +{ + return __ullAtomicCAS(address, compare, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond) +{ + return (bool)__any((int)cond); /* bool wrapper: converts cond to int for __any and converts the result back to bool */ +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond) +{ + return (bool)__all((int)cond); /* bool wrapper: converts cond to int for __all and converts the result back to bool */ +} + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__ + +#endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */ + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..82b25e59b40aeaf1e475ff3179e49640a44918b8 --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h @@ -0,0 +1,65 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#if defined(_MSC_VER) +#pragma message("device_double_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.") +#else +#warning "device_double_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead." 
+#endif +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__ +#endif + +#include "crt/device_double_functions.h" + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__ +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h new file mode 100644 index 0000000000000000000000000000000000000000..8f552db8faab7d21e90e06a1ea2184a5563d3bf2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h @@ -0,0 +1,118 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__DEVICE_LAUNCH_PARAMETERS_H__) +#define __DEVICE_LAUNCH_PARAMETERS_H__ + +#include "vector_types.h" + +#if !defined(__STORAGE__) + +#if defined(__CUDACC_RTC__) +#define __STORAGE__ \ + extern const __device__ +#else /* !__CUDACC_RTC__ */ +#define __STORAGE__ \ + extern const +#endif /* __CUDACC_RTC__ */ + +#endif /* __STORAGE__ */ + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +uint3 __device_builtin__ __STORAGE__ threadIdx; +uint3 __device_builtin__ __STORAGE__ blockIdx; +dim3 __device_builtin__ __STORAGE__ blockDim; +dim3 __device_builtin__ __STORAGE__ gridDim; +int __device_builtin__ __STORAGE__ warpSize; + +#undef __STORAGE__ + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#if !defined(__cudaGet_threadIdx) + +#define __cudaGet_threadIdx() \ + threadIdx + +#endif /* __cudaGet_threadIdx */ + +#if !defined(__cudaGet_blockIdx) + +#define __cudaGet_blockIdx() \ + blockIdx + +#endif /* __cudaGet_blockIdx */ + +#if !defined(__cudaGet_blockDim) + +#define __cudaGet_blockDim() \ + blockDim + +#endif /* __cudaGet_blockDim */ + +#if !defined(__cudaGet_gridDim) + +#define __cudaGet_gridDim() \ + gridDim + +#endif /* __cudaGet_gridDim */ + +#if !defined(__cudaGet_warpSize) + +#define __cudaGet_warpSize() \ + warpSize + +#endif /* __cudaGet_warpSize */ + +#endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..94767974220594550d496cad4d14c45349b27737 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h @@ -0,0 +1,145 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. 
+ * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. 
Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__DRIVER_FUNCTIONS_H__) +#define __DRIVER_FUNCTIONS_H__ + +#include "builtin_types.h" +#include "crt/host_defines.h" +#include "driver_types.h" + +/** + * \addtogroup CUDART_MEMORY + * + * @{ + */ + +/** + * \brief Returns a cudaPitchedPtr based on input parameters + * + * Returns a ::cudaPitchedPtr based on the specified input parameters \p d, + * \p p, \p xsz, and \p ysz. + * + * \param d - Pointer to allocated memory + * \param p - Pitch of allocated memory in bytes + * \param xsz - Logical width of allocation in elements + * \param ysz - Logical height of allocation in elements + * + * \return + * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz + * + * \sa make_cudaExtent, make_cudaPos + */ +static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz) +{ + struct cudaPitchedPtr s; + + s.ptr = d; + s.pitch = p; + s.xsize = xsz; + s.ysize = ysz; + + return s; +} + +/** + * \brief Returns a cudaPos based on input parameters + * + * Returns a ::cudaPos based on the specified input parameters \p x, + * \p y, and \p z. 
+ * + * \param x - X position + * \param y - Y position + * \param z - Z position + * + * \return + * ::cudaPos specified by \p x, \p y, and \p z + * + * \sa make_cudaExtent, make_cudaPitchedPtr + */ +static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z) +{ + struct cudaPos p; + + p.x = x; + p.y = y; + p.z = z; + + return p; +} + +/** + * \brief Returns a cudaExtent based on input parameters + * + * Returns a ::cudaExtent based on the specified input parameters \p w, + * \p h, and \p d. + * + * \param w - Width in elements when referring to array memory, in bytes when referring to linear memory + * \param h - Height in elements + * \param d - Depth in elements + * + * \return + * ::cudaExtent specified by \p w, \p h, and \p d + * + * \sa make_cudaPitchedPtr, make_cudaPos + */ +static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d) +{ + struct cudaExtent e; + + e.width = w; + e.height = h; + e.depth = d; + + return e; +} + +/** @} */ /* END CUDART_MEMORY */ + +#endif /* !__DRIVER_FUNCTIONS_H__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_types.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_types.h new file mode 100644 index 0000000000000000000000000000000000000000..47b54f94d6a1dcd278ddda0dd0fa5a4ca866a5ff --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_types.h @@ -0,0 +1,3093 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. 
Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__DRIVER_TYPES_H__) +#define __DRIVER_TYPES_H__ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__ +#endif + +#ifndef __DOXYGEN_ONLY__ +#include "crt/host_defines.h" +#endif +#include "vector_types.h" + + + +/** + * \defgroup CUDART_TYPES Data types used by CUDA Runtime + * \ingroup CUDART + * + * @{ + */ + +/******************************************************************************* +* * +* TYPE DEFINITIONS USED BY RUNTIME API * +* * +*******************************************************************************/ + +#if !defined(__CUDA_INTERNAL_COMPILATION__) + +#if !defined(__CUDACC_RTC__) +#include +#include +#endif /* !defined(__CUDACC_RTC__) */ + +#define cudaHostAllocDefault 0x00 /**< Default page-locked allocation flag */ +#define cudaHostAllocPortable 0x01 /**< Pinned memory accessible by all CUDA contexts */ +#define cudaHostAllocMapped 0x02 /**< Map allocation into device space */ +#define cudaHostAllocWriteCombined 0x04 /**< Write-combined memory */ + +#define cudaHostRegisterDefault 0x00 /**< Default host memory registration flag */ +#define cudaHostRegisterPortable 0x01 /**< Pinned memory accessible by all CUDA contexts */ +#define cudaHostRegisterMapped 0x02 /**< Map registered memory into device space */ +#define cudaHostRegisterIoMemory 0x04 /**< Memory-mapped I/O space */ +#define cudaHostRegisterReadOnly 0x08 /**< Memory-mapped read-only */ + +#define cudaPeerAccessDefault 0x00 /**< Default peer addressing enable flag */ + +#define cudaStreamDefault 0x00 /**< Default 
stream flag */ +#define cudaStreamNonBlocking 0x01 /**< Stream does not synchronize with stream 0 (the NULL stream) */ + + /** + * Legacy stream handle + * + * Stream handle that can be passed as a cudaStream_t to use an implicit stream + * with legacy synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define cudaStreamLegacy ((cudaStream_t)0x1) + +/** + * Per-thread stream handle + * + * Stream handle that can be passed as a cudaStream_t to use an implicit stream + * with per-thread synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define cudaStreamPerThread ((cudaStream_t)0x2) + +#define cudaEventDefault 0x00 /**< Default event flag */ +#define cudaEventBlockingSync 0x01 /**< Event uses blocking synchronization */ +#define cudaEventDisableTiming 0x02 /**< Event will not record timing data */ +#define cudaEventInterprocess 0x04 /**< Event is suitable for interprocess use. cudaEventDisableTiming must be set */ + +#define cudaEventRecordDefault 0x00 /**< Default event record flag */ +#define cudaEventRecordExternal 0x01 /**< Event is captured in the graph as an external event node when performing stream capture */ + +#define cudaEventWaitDefault 0x00 /**< Default event wait flag */ +#define cudaEventWaitExternal 0x01 /**< Event is captured in the graph as an external event node when performing stream capture */ + +#define cudaDeviceScheduleAuto 0x00 /**< Device flag - Automatic scheduling */ +#define cudaDeviceScheduleSpin 0x01 /**< Device flag - Spin default scheduling */ +#define cudaDeviceScheduleYield 0x02 /**< Device flag - Yield default scheduling */ +#define cudaDeviceScheduleBlockingSync 0x04 /**< Device flag - Use blocking synchronization */ +#define cudaDeviceBlockingSync 0x04 /**< Device flag - Use blocking synchronization + * \deprecated This flag was deprecated as of CUDA 4.0 and + * replaced with ::cudaDeviceScheduleBlockingSync. 
*/ +#define cudaDeviceScheduleMask 0x07 /**< Device schedule flags mask */ +#define cudaDeviceMapHost 0x08 /**< Device flag - Support mapped pinned allocations */ +#define cudaDeviceLmemResizeToMax 0x10 /**< Device flag - Keep local memory allocation after launch */ +#define cudaDeviceMask 0x1f /**< Device flags mask */ + +#define cudaArrayDefault 0x00 /**< Default CUDA array allocation flag */ +#define cudaArrayLayered 0x01 /**< Must be set in cudaMalloc3DArray to create a layered CUDA array */ +#define cudaArraySurfaceLoadStore 0x02 /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array */ +#define cudaArrayCubemap 0x04 /**< Must be set in cudaMalloc3DArray to create a cubemap CUDA array */ +#define cudaArrayTextureGather 0x08 /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array */ +#define cudaArrayColorAttachment 0x20 /**< Must be set in cudaExternalMemoryGetMappedMipmappedArray if the mipmapped array is used as a color target in a graphics API */ +#define cudaArraySparse 0x40 /**< Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a sparse CUDA array or CUDA mipmapped array */ +#define cudaArrayDeferredMapping 0x80 /**< Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a deferred mapping CUDA array or CUDA mipmapped array */ + +#define cudaIpcMemLazyEnablePeerAccess 0x01 /**< Automatically enable peer access between remote devices as needed */ + +#define cudaMemAttachGlobal 0x01 /**< Memory can be accessed by any stream on any device*/ +#define cudaMemAttachHost 0x02 /**< Memory cannot be accessed by any stream on any device */ +#define cudaMemAttachSingle 0x04 /**< Memory can only be accessed by a single stream on the associated device */ + +#define cudaOccupancyDefault 0x00 /**< Default behavior */ +#define cudaOccupancyDisableCachingOverride 0x01 /**< 
Assume global caching is enabled and cannot be automatically turned off */ + +#define cudaCpuDeviceId ((int)-1) /**< Device id that represents the CPU */ +#define cudaInvalidDeviceId ((int)-2) /**< Device id that represents an invalid device */ + +/** + * If set, each kernel launched as part of ::cudaLaunchCooperativeKernelMultiDevice only + * waits for prior work in the stream corresponding to that GPU to complete before the + * kernel begins execution. + */ +#define cudaCooperativeLaunchMultiDeviceNoPreSync 0x01 + +/** + * If set, any subsequent work pushed in a stream that participated in a call to + * ::cudaLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on + * the GPU corresponding to that stream to complete before it begins execution. + */ +#define cudaCooperativeLaunchMultiDeviceNoPostSync 0x02 + +#endif /* !__CUDA_INTERNAL_COMPILATION__ */ + +/** \cond impl_private */ +#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif +/** \endcond impl_private */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +/** + * CUDA error types + */ +enum __device_builtin__ cudaError +{ + /** + * The API call returned with no errors. In the case of query calls, this + * also means that the operation being queried is complete (see + * ::cudaEventQuery() and ::cudaStreamQuery()). + */ + cudaSuccess = 0, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + cudaErrorInvalidValue = 1, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. 
+ */ + cudaErrorMemoryAllocation = 2, + + /** + * The API call failed because the CUDA driver and runtime could not be + * initialized. + */ + cudaErrorInitializationError = 3, + + /** + * This indicates that a CUDA Runtime API call cannot be executed because + * it is being called during process shut down, at a point in time after + * CUDA driver has been unloaded. + */ + cudaErrorCudartUnloading = 4, + + /** + * This indicates profiler is not initialized for this run. This can + * happen when the application is running with external profiling tools + * like visual profiler. + */ + cudaErrorProfilerDisabled = 5, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to attempt to enable/disable the profiling via ::cudaProfilerStart or + * ::cudaProfilerStop without initialization. + */ + cudaErrorProfilerNotInitialized = 6, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cudaProfilerStart() when profiling is already enabled. + */ + cudaErrorProfilerAlreadyStarted = 7, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cudaProfilerStop() when profiling is already disabled. + */ + cudaErrorProfilerAlreadyStopped = 8, + + /** + * This indicates that a kernel launch is requesting resources that can + * never be satisfied by the current device. Requesting more shared memory + * per block than the device supports will trigger this error, as will + * requesting too many threads or blocks. See ::cudaDeviceProp for more + * device limitations. + */ + cudaErrorInvalidConfiguration = 9, + + /** + * This indicates that one or more of the pitch-related parameters passed + * to the API call is not within the acceptable range for pitch. + */ + cudaErrorInvalidPitchValue = 12, + + /** + * This indicates that the symbol name/identifier passed to the API call + * is not a valid name or identifier. 
+ */ + cudaErrorInvalidSymbol = 13, + + /** + * This indicates that at least one host pointer passed to the API call is + * not a valid host pointer. + * \deprecated + * This error return is deprecated as of CUDA 10.1. + */ + cudaErrorInvalidHostPointer = 16, + + /** + * This indicates that at least one device pointer passed to the API call is + * not a valid device pointer. + * \deprecated + * This error return is deprecated as of CUDA 10.1. + */ + cudaErrorInvalidDevicePointer = 17, + + /** + * This indicates that the texture passed to the API call is not a valid + * texture. + */ + cudaErrorInvalidTexture = 18, + + /** + * This indicates that the texture binding is not valid. This occurs if you + * call ::cudaGetTextureAlignmentOffset() with an unbound texture. + */ + cudaErrorInvalidTextureBinding = 19, + + /** + * This indicates that the channel descriptor passed to the API call is not + * valid. This occurs if the format is not one of the formats specified by + * ::cudaChannelFormatKind, or if one of the dimensions is invalid. + */ + cudaErrorInvalidChannelDescriptor = 20, + + /** + * This indicates that the direction of the memcpy passed to the API call is + * not one of the types specified by ::cudaMemcpyKind. + */ + cudaErrorInvalidMemcpyDirection = 21, + + /** + * This indicated that the user has taken the address of a constant variable, + * which was forbidden up until the CUDA 3.1 release. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Variables in constant + * memory may now have their address taken by the runtime via + * ::cudaGetSymbolAddress(). + */ + cudaErrorAddressOfConstant = 22, + + /** + * This indicated that a texture fetch was not able to be performed. + * This was previously used for device emulation of texture operations. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. 
+ */ + cudaErrorTextureFetchFailed = 23, + + /** + * This indicated that a texture was not bound for access. + * This was previously used for device emulation of texture operations. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorTextureNotBound = 24, + + /** + * This indicated that a synchronization operation had failed. + * This was previously used for some device emulation functions. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorSynchronizationError = 25, + + /** + * This indicates that a non-float texture was being accessed with linear + * filtering. This is not supported by CUDA. + */ + cudaErrorInvalidFilterSetting = 26, + + /** + * This indicates that an attempt was made to read a non-float texture as a + * normalized float. This is not supported by CUDA. + */ + cudaErrorInvalidNormSetting = 27, + + /** + * Mixing of device and device emulation code was not allowed. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorMixedDeviceExecution = 28, + + /** + * This indicates that the API call is not yet implemented. Production + * releases of CUDA will never return this error. + * \deprecated + * This error return is deprecated as of CUDA 4.1. + */ + cudaErrorNotYetImplemented = 31, + + /** + * This indicated that an emulated device pointer exceeded the 32-bit address + * range. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorMemoryValueTooLarge = 32, + + /** + * This indicates that the CUDA driver that the application has loaded is a + * stub library. Applications that run with the stub rather than a real + * driver loaded will result in CUDA API returning this error. 
+ */ + cudaErrorStubLibrary = 34, + + /** + * This indicates that the installed NVIDIA CUDA driver is older than the + * CUDA runtime library. This is not a supported configuration. Users should + * install an updated NVIDIA display driver to allow the application to run. + */ + cudaErrorInsufficientDriver = 35, + + /** + * This indicates that the API call requires a newer CUDA driver than the one + * currently installed. Users should install an updated NVIDIA CUDA driver + * to allow the API call to succeed. + */ + cudaErrorCallRequiresNewerDriver = 36, + + /** + * This indicates that the surface passed to the API call is not a valid + * surface. + */ + cudaErrorInvalidSurface = 37, + + /** + * This indicates that multiple global or constant variables (across separate + * CUDA source files in the application) share the same string name. + */ + cudaErrorDuplicateVariableName = 43, + + /** + * This indicates that multiple textures (across separate CUDA source + * files in the application) share the same string name. + */ + cudaErrorDuplicateTextureName = 44, + + /** + * This indicates that multiple surfaces (across separate CUDA source + * files in the application) share the same string name. + */ + cudaErrorDuplicateSurfaceName = 45, + + /** + * This indicates that all CUDA devices are busy or unavailable at the current + * time. Devices are often busy/unavailable due to use of + * ::cudaComputeModeProhibited, ::cudaComputeModeExclusiveProcess, or when long + * running CUDA kernels have filled up the GPU and are blocking new work + * from starting. They can also be unavailable due to memory constraints + * on a device that already has active CUDA work being performed. + */ + cudaErrorDevicesUnavailable = 46, + + /** + * This indicates that the current context is not compatible with this + * the CUDA Runtime. This can only occur if you are using CUDA + * Runtime/Driver interoperability and have created an existing Driver + * context using the driver API. 
The Driver context may be incompatible + * either because the Driver context was created using an older version + * of the API, because the Runtime API call expects a primary driver + * context and the Driver context is not primary, or because the Driver + * context has been destroyed. Please see \ref CUDART_DRIVER "Interactions + * with the CUDA Driver API" for more information. + */ + cudaErrorIncompatibleDriverContext = 49, + + /** + * The device function being invoked (usually via ::cudaLaunchKernel()) was not + * previously configured via the ::cudaConfigureCall() function. + */ + cudaErrorMissingConfiguration = 52, + + /** + * This indicated that a previous kernel launch failed. This was previously + * used for device emulation of kernel launches. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorPriorLaunchFailure = 53, + + /** + * This error indicates that a device runtime grid launch did not occur + * because the depth of the child grid would exceed the maximum supported + * number of nested grid launches. + */ + cudaErrorLaunchMaxDepthExceeded = 65, + + /** + * This error indicates that a grid launch did not occur because the kernel + * uses file-scoped textures which are unsupported by the device runtime. + * Kernels launched via the device runtime only support textures created with + * the Texture Object API's. + */ + cudaErrorLaunchFileScopedTex = 66, + + /** + * This error indicates that a grid launch did not occur because the kernel + * uses file-scoped surfaces which are unsupported by the device runtime. + * Kernels launched via the device runtime only support surfaces created with + * the Surface Object API's. 
+ */ + cudaErrorLaunchFileScopedSurf = 67, + + /** + * This error indicates that a call to ::cudaDeviceSynchronize made from + * the device runtime failed because the call was made at grid depth greater + * than either the default (2 levels of grids) or user specified device + * limit ::cudaLimitDevRuntimeSyncDepth. To be able to synchronize on + * launched grids at a greater depth successfully, the maximum nested + * depth at which ::cudaDeviceSynchronize will be called must be specified + * with the ::cudaLimitDevRuntimeSyncDepth limit to the ::cudaDeviceSetLimit + * API before the host-side launch of a kernel using the device runtime. + * Keep in mind that additional levels of sync depth require the runtime + * to reserve large amounts of device memory that cannot be used for + * user allocations. + */ + cudaErrorSyncDepthExceeded = 68, + + /** + * This error indicates that a device runtime grid launch failed because + * the launch would exceed the limit ::cudaLimitDevRuntimePendingLaunchCount. + * For this launch to proceed successfully, ::cudaDeviceSetLimit must be + * called to set the ::cudaLimitDevRuntimePendingLaunchCount to be higher + * than the upper bound of outstanding launches that can be issued to the + * device runtime. Keep in mind that raising the limit of pending device + * runtime launches will require the runtime to reserve device memory that + * cannot be used for user allocations. + */ + cudaErrorLaunchPendingCountExceeded = 69, + + /** + * The requested device function does not exist or is not compiled for the + * proper device architecture. + */ + cudaErrorInvalidDeviceFunction = 98, + + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + cudaErrorNoDevice = 100, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device or that the action requested is + * invalid for the specified device. 
+ */ + cudaErrorInvalidDevice = 101, + + /** + * This indicates that the device doesn't have a valid Grid License. + */ + cudaErrorDeviceNotLicensed = 102, + + /** + * By default, the CUDA runtime may perform a minimal set of self-tests, + * as well as CUDA driver tests, to establish the validity of both. + * Introduced in CUDA 11.2, this error return indicates that at least one + * of these tests has failed and the validity of either the runtime + * or the driver could not be established. + */ + cudaErrorSoftwareValidityNotEstablished = 103, + + /** + * This indicates an internal startup failure in the CUDA runtime. + */ + cudaErrorStartupFailure = 127, + + /** + * This indicates that the device kernel image is invalid. + */ + cudaErrorInvalidKernelImage = 200, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + cudaErrorDeviceUninitialized = 201, + + /** + * This indicates that the buffer object could not be mapped. + */ + cudaErrorMapBufferObjectFailed = 205, + + /** + * This indicates that the buffer object could not be unmapped. + */ + cudaErrorUnmapBufferObjectFailed = 206, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + cudaErrorArrayIsMapped = 207, + + /** + * This indicates that the resource is already mapped. + */ + cudaErrorAlreadyMapped = 208, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. 
+ */ + cudaErrorNoKernelImageForDevice = 209, + + /** + * This indicates that a resource has already been acquired. + */ + cudaErrorAlreadyAcquired = 210, + + /** + * This indicates that a resource is not mapped. + */ + cudaErrorNotMapped = 211, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + cudaErrorNotMappedAsArray = 212, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + cudaErrorNotMappedAsPointer = 213, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + cudaErrorECCUncorrectable = 214, + + /** + * This indicates that the ::cudaLimit passed to the API call is not + * supported by the active device. + */ + cudaErrorUnsupportedLimit = 215, + + /** + * This indicates that a call tried to access an exclusive-thread device that + * is already in use by a different thread. + */ + cudaErrorDeviceAlreadyInUse = 216, + + /** + * This error indicates that P2P access is not supported across the given + * devices. + */ + cudaErrorPeerAccessUnsupported = 217, + + /** + * A PTX compilation failed. The runtime may fall back to compiling PTX if + * an application does not contain a suitable binary for the current device. + */ + cudaErrorInvalidPtx = 218, + + /** + * This indicates an error with the OpenGL or DirectX context. + */ + cudaErrorInvalidGraphicsContext = 219, + + /** + * This indicates that an uncorrectable NVLink error was detected during the + * execution. + */ + cudaErrorNvlinkUncorrectable = 220, + + /** + * This indicates that the PTX JIT compiler library was not found. The JIT Compiler + * library is used for PTX compilation. The runtime may fall back to compiling PTX + * if an application does not contain a suitable binary for the current device. + */ + cudaErrorJitCompilerNotFound = 221, + + /** + * This indicates that the provided PTX was compiled with an unsupported toolchain. 
+ * The most common reason for this, is the PTX was generated by a compiler newer + * than what is supported by the CUDA driver and PTX JIT compiler. + */ + cudaErrorUnsupportedPtxVersion = 222, + + /** + * This indicates that the JIT compilation was disabled. The JIT compilation compiles + * PTX. The runtime may fall back to compiling PTX if an application does not contain + * a suitable binary for the current device. + */ + cudaErrorJitCompilationDisabled = 223, + + /** + * This indicates that the provided execution affinity is not supported by the device. + */ + cudaErrorUnsupportedExecAffinity = 224, + + /** + * This indicates that the device kernel source is invalid. + */ + cudaErrorInvalidSource = 300, + + /** + * This indicates that the file specified was not found. + */ + cudaErrorFileNotFound = 301, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + cudaErrorSharedObjectSymbolNotFound = 302, + + /** + * This indicates that initialization of a shared object failed. + */ + cudaErrorSharedObjectInitFailed = 303, + + /** + * This error indicates that an OS call failed. + */ + cudaErrorOperatingSystem = 304, + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::cudaStream_t and + * ::cudaEvent_t. + */ + cudaErrorInvalidResourceHandle = 400, + + /** + * This indicates that a resource required by the API call is not in a + * valid state to perform the requested operation. + */ + cudaErrorIllegalState = 401, + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, driver function names, texture names, + * and surface names. + */ + cudaErrorSymbolNotFound = 500, + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::cudaSuccess (which indicates completion). 
Calls that + * may return this value include ::cudaEventQuery() and ::cudaStreamQuery(). + */ + cudaErrorNotReady = 600, + + /** + * The device encountered a load or store instruction on an invalid memory address. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorIllegalAddress = 700, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. Although this error is similar to + * ::cudaErrorInvalidConfiguration, this error usually indicates that the + * user has attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register count. + */ + cudaErrorLaunchOutOfResources = 701, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device property + * \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled" + * for more information. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorLaunchTimeout = 702, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + cudaErrorLaunchIncompatibleTexturing = 703, + + /** + * This error indicates that a call to ::cudaDeviceEnablePeerAccess() is + * trying to re-enable peer addressing from a context which has already + * had peer addressing enabled. + */ + cudaErrorPeerAccessAlreadyEnabled = 704, + + /** + * This error indicates that ::cudaDeviceDisablePeerAccess() is trying to + * disable peer addressing which has not been enabled yet via + * ::cudaDeviceEnablePeerAccess(). 
+ */ + cudaErrorPeerAccessNotEnabled = 705, + + /** + * This indicates that the user has called ::cudaSetValidDevices(), + * ::cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(), + * ::cudaD3D10SetDirect3DDevice(), ::cudaD3D11SetDirect3DDevice(), or + * ::cudaVDPAUSetVDPAUDevice() after initializing the CUDA runtime by + * calling non-device management operations (allocating memory and + * launching kernels are examples of non-device management operations). + * This error can also be returned if using runtime/driver + * interoperability and there is an existing ::CUcontext active on the + * host thread. + */ + cudaErrorSetOnActiveProcess = 708, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy, or is a primary context which + * has not yet been initialized. + */ + cudaErrorContextIsDestroyed = 709, + + /** + * An assert triggered in device code during kernel execution. The device + * cannot be used again. All existing allocations are invalid. To continue + * using CUDA, the process must be terminated and relaunched. + */ + cudaErrorAssert = 710, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cudaDeviceEnablePeerAccess(). + */ + cudaErrorTooManyPeers = 711, + + /** + * This error indicates that the memory range passed to ::cudaHostRegister() + * has already been registered. + */ + cudaErrorHostMemoryAlreadyRegistered = 712, + + /** + * This error indicates that the pointer passed to ::cudaHostUnregister() + * does not correspond to any currently registered memory region. + */ + cudaErrorHostMemoryNotRegistered = 713, + + /** + * Device encountered an error in the call stack during kernel execution, + * possibly due to stack corruption or exceeding the stack size limit. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. 
To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorHardwareStackError = 714, + + /** + * The device encountered an illegal instruction during kernel execution. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorIllegalInstruction = 715, + + /** + * The device encountered a load or store instruction + * on a memory address which is not aligned. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorMisalignedAddress = 716, + + /** + * While executing a kernel, the device encountered an instruction + * which can only operate on memory locations in certain address spaces + * (global, shared, or local), but was supplied a memory address not + * belonging to an allowed address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorInvalidAddressSpace = 717, + + /** + * The device encountered an invalid program counter. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorInvalidPc = 718, + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. Less common cases can be system specific - more + * information about these cases can be found in the system specific user guide. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. 
To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorLaunchFailure = 719, + + /** + * This error indicates that the number of blocks launched per grid for a kernel that was + * launched via either ::cudaLaunchCooperativeKernel or ::cudaLaunchCooperativeKernelMultiDevice + * exceeds the maximum number of blocks as allowed by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * or ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors + * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. + */ + cudaErrorCooperativeLaunchTooLarge = 720, + + /** + * This error indicates the attempted operation is not permitted. + */ + cudaErrorNotPermitted = 800, + + /** + * This error indicates the attempted operation is not supported + * on the current system or device. + */ + cudaErrorNotSupported = 801, + + /** + * This error indicates that the system is not yet ready to start any CUDA + * work. To continue using CUDA, verify the system configuration is in a + * valid state and all required driver daemons are actively running. + * More information about this error can be found in the system specific + * user guide. + */ + cudaErrorSystemNotReady = 802, + + /** + * This error indicates that there is a mismatch between the versions of + * the display driver and the CUDA driver. Refer to the compatibility documentation + * for supported versions. + */ + cudaErrorSystemDriverMismatch = 803, + + /** + * This error indicates that the system was upgraded to run with forward compatibility + * but the visible hardware detected by CUDA does not support this configuration. + * Refer to the compatibility documentation for the supported hardware matrix or ensure + * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES + * environment variable. 
+ */ + cudaErrorCompatNotSupportedOnDevice = 804, + + /** + * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server. + */ + cudaErrorMpsConnectionFailed = 805, + + /** + * This error indicates that the remote procedure call between the MPS server and the MPS client failed. + */ + cudaErrorMpsRpcFailure = 806, + + /** + * This error indicates that the MPS server is not ready to accept new MPS client requests. + * This error can be returned when the MPS server is in the process of recovering from a fatal failure. + */ + cudaErrorMpsServerNotReady = 807, + + /** + * This error indicates that the hardware resources required to create an MPS client have been exhausted. + */ + cudaErrorMpsMaxClientsReached = 808, + + /** + * This error indicates that the hardware resources required for device connections have been exhausted. + */ + cudaErrorMpsMaxConnectionsReached = 809, + + /** + * This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched. + */ + cudaErrorMpsClientTerminated = 810, + + /** + * The operation is not permitted when the stream is capturing. + */ + cudaErrorStreamCaptureUnsupported = 900, + + /** + * The current capture sequence on the stream has been invalidated due to + * a previous error. + */ + cudaErrorStreamCaptureInvalidated = 901, + + /** + * The operation would have resulted in a merge of two independent capture + * sequences. + */ + cudaErrorStreamCaptureMerge = 902, + + /** + * The capture was not initiated in this stream. + */ + cudaErrorStreamCaptureUnmatched = 903, + + /** + * The capture sequence contains a fork that was not joined to the primary + * stream. + */ + cudaErrorStreamCaptureUnjoined = 904, + + /** + * A dependency would have been created which crosses the capture sequence + * boundary. Only implicit in-stream ordering dependencies are allowed to + * cross the boundary. 
+ */ + cudaErrorStreamCaptureIsolation = 905, + + /** + * The operation would have resulted in a disallowed implicit dependency on + * a current capture sequence from cudaStreamLegacy. + */ + cudaErrorStreamCaptureImplicit = 906, + + /** + * The operation is not permitted on an event which was last recorded in a + * capturing stream. + */ + cudaErrorCapturedEvent = 907, + + /** + * A stream capture sequence not initiated with the ::cudaStreamCaptureModeRelaxed + * argument to ::cudaStreamBeginCapture was passed to ::cudaStreamEndCapture in a + * different thread. + */ + cudaErrorStreamCaptureWrongThread = 908, + + /** + * This indicates that the wait operation has timed out. + */ + cudaErrorTimeout = 909, + + /** + * This error indicates that the graph update was not performed because it included + * changes which violated constraints specific to instantiated graph update. + */ + cudaErrorGraphExecUpdateFailure = 910, + + /** + * This indicates that an async error has occurred in a device outside of CUDA. + * If CUDA was waiting for an external device's signal before consuming shared data, + * the external device signaled an error indicating that the data is not valid for + * consumption. This leaves the process in an inconsistent state and any further CUDA + * work will return the same error. To continue using CUDA, the process must be + * terminated and relaunched. + */ + cudaErrorExternalDevice = 911, + + /** + * This indicates that a kernel launch error has occurred due to cluster + * misconfiguration. + */ + cudaErrorInvalidClusterSize = 912, + + /** + * This indicates that an unknown internal error has occurred. + */ + cudaErrorUnknown = 999, + + /** + * Any unhandled CUDA driver error is added to this value and returned via + * the runtime. Production releases of CUDA should not return such errors. + * \deprecated + * This error return is deprecated as of CUDA 4.1. 
+ */ + cudaErrorApiFailureBase = 10000 +}; + +/** + * Channel format kind + */ +enum __device_builtin__ cudaChannelFormatKind +{ + cudaChannelFormatKindSigned = 0, /**< Signed channel format */ + cudaChannelFormatKindUnsigned = 1, /**< Unsigned channel format */ + cudaChannelFormatKindFloat = 2, /**< Float channel format */ + cudaChannelFormatKindNone = 3, /**< No channel format */ + cudaChannelFormatKindNV12 = 4, /**< Unsigned 8-bit integers, planar 4:2:0 YUV format */ + cudaChannelFormatKindUnsignedNormalized8X1 = 5, /**< 1 channel unsigned 8-bit normalized integer */ + cudaChannelFormatKindUnsignedNormalized8X2 = 6, /**< 2 channel unsigned 8-bit normalized integer */ + cudaChannelFormatKindUnsignedNormalized8X4 = 7, /**< 4 channel unsigned 8-bit normalized integer */ + cudaChannelFormatKindUnsignedNormalized16X1 = 8, /**< 1 channel unsigned 16-bit normalized integer */ + cudaChannelFormatKindUnsignedNormalized16X2 = 9, /**< 2 channel unsigned 16-bit normalized integer */ + cudaChannelFormatKindUnsignedNormalized16X4 = 10, /**< 4 channel unsigned 16-bit normalized integer */ + cudaChannelFormatKindSignedNormalized8X1 = 11, /**< 1 channel signed 8-bit normalized integer */ + cudaChannelFormatKindSignedNormalized8X2 = 12, /**< 2 channel signed 8-bit normalized integer */ + cudaChannelFormatKindSignedNormalized8X4 = 13, /**< 4 channel signed 8-bit normalized integer */ + cudaChannelFormatKindSignedNormalized16X1 = 14, /**< 1 channel signed 16-bit normalized integer */ + cudaChannelFormatKindSignedNormalized16X2 = 15, /**< 2 channel signed 16-bit normalized integer */ + cudaChannelFormatKindSignedNormalized16X4 = 16, /**< 4 channel signed 16-bit normalized integer */ + cudaChannelFormatKindUnsignedBlockCompressed1 = 17, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed1SRGB = 18, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding */ + 
cudaChannelFormatKindUnsignedBlockCompressed2 = 19, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed2SRGB = 20, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding */ + cudaChannelFormatKindUnsignedBlockCompressed3 = 21, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed3SRGB = 22, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding */ + cudaChannelFormatKindUnsignedBlockCompressed4 = 23, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */ + cudaChannelFormatKindSignedBlockCompressed4 = 24, /**< 1 channel signed normalized block-compressed (BC4 compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed5 = 25, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */ + cudaChannelFormatKindSignedBlockCompressed5 = 26, /**< 2 channel signed normalized block-compressed (BC5 compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed6H = 27, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */ + cudaChannelFormatKindSignedBlockCompressed6H = 28, /**< 3 channel signed half-float block-compressed (BC6H compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed7 = 29, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed7SRGB = 30 /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */ +}; + +/** + * CUDA Channel format descriptor + */ +struct __device_builtin__ cudaChannelFormatDesc +{ + int x; /**< x */ + int y; /**< y */ + int z; /**< z */ + int w; /**< w */ + enum cudaChannelFormatKind f; /**< Channel format kind */ +}; + +/** + * CUDA array + */ +typedef struct cudaArray *cudaArray_t; + +/** + * CUDA array 
(as source copy argument) + */ +typedef const struct cudaArray *cudaArray_const_t; + +struct cudaArray; + +/** + * CUDA mipmapped array + */ +typedef struct cudaMipmappedArray *cudaMipmappedArray_t; + +/** + * CUDA mipmapped array (as source argument) + */ +typedef const struct cudaMipmappedArray *cudaMipmappedArray_const_t; + +struct cudaMipmappedArray; + +/** + * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers + */ +#define cudaArraySparsePropertiesSingleMipTail 0x1 + +/** + * Sparse CUDA array and CUDA mipmapped array properties + */ +struct __device_builtin__ cudaArraySparseProperties { + struct { + unsigned int width; /**< Tile width in elements */ + unsigned int height; /**< Tile height in elements */ + unsigned int depth; /**< Tile depth in elements */ + } tileExtent; + unsigned int miptailFirstLevel; /**< First mip level at which the mip tail begins */ + unsigned long long miptailSize; /**< Total size of the mip tail. */ + unsigned int flags; /**< Flags will either be zero or ::cudaArraySparsePropertiesSingleMipTail */ + unsigned int reserved[4]; +}; + +/** + * CUDA array and CUDA mipmapped array memory requirements + */ +struct __device_builtin__ cudaArrayMemoryRequirements { + size_t size; /**< Total size of the array. */ + size_t alignment; /**< Alignment necessary for mapping the array. 
*/ + unsigned int reserved[4]; +}; + +/** + * CUDA memory types + */ +enum __device_builtin__ cudaMemoryType +{ + cudaMemoryTypeUnregistered = 0, /**< Unregistered memory */ + cudaMemoryTypeHost = 1, /**< Host memory */ + cudaMemoryTypeDevice = 2, /**< Device memory */ + cudaMemoryTypeManaged = 3 /**< Managed memory */ +}; + +/** + * CUDA memory copy types + */ +enum __device_builtin__ cudaMemcpyKind +{ + cudaMemcpyHostToHost = 0, /**< Host -> Host */ + cudaMemcpyHostToDevice = 1, /**< Host -> Device */ + cudaMemcpyDeviceToHost = 2, /**< Device -> Host */ + cudaMemcpyDeviceToDevice = 3, /**< Device -> Device */ + cudaMemcpyDefault = 4 /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */ +}; + +/** + * CUDA Pitched memory pointer + * + * \sa ::make_cudaPitchedPtr + */ +struct __device_builtin__ cudaPitchedPtr +{ + void *ptr; /**< Pointer to allocated memory */ + size_t pitch; /**< Pitch of allocated memory in bytes */ + size_t xsize; /**< Logical width of allocation in elements */ + size_t ysize; /**< Logical height of allocation in elements */ +}; + +/** + * CUDA extent + * + * \sa ::make_cudaExtent + */ +struct __device_builtin__ cudaExtent +{ + size_t width; /**< Width in elements when referring to array memory, in bytes when referring to linear memory */ + size_t height; /**< Height in elements */ + size_t depth; /**< Depth in elements */ +}; + +/** + * CUDA 3D position + * + * \sa ::make_cudaPos + */ +struct __device_builtin__ cudaPos +{ + size_t x; /**< x */ + size_t y; /**< y */ + size_t z; /**< z */ +}; + +/** + * CUDA 3D memory copying parameters + */ +struct __device_builtin__ cudaMemcpy3DParms +{ + cudaArray_t srcArray; /**< Source memory address */ + struct cudaPos srcPos; /**< Source position offset */ + struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */ + + cudaArray_t dstArray; /**< Destination memory address */ + struct cudaPos dstPos; /**< Destination position offset */ + struct 
cudaPitchedPtr dstPtr; /**< Pitched destination memory address */ + + struct cudaExtent extent; /**< Requested memory copy size */ + enum cudaMemcpyKind kind; /**< Type of transfer */ +}; + +/** + * CUDA 3D cross-device memory copying parameters + */ +struct __device_builtin__ cudaMemcpy3DPeerParms +{ + cudaArray_t srcArray; /**< Source memory address */ + struct cudaPos srcPos; /**< Source position offset */ + struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */ + int srcDevice; /**< Source device */ + + cudaArray_t dstArray; /**< Destination memory address */ + struct cudaPos dstPos; /**< Destination position offset */ + struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */ + int dstDevice; /**< Destination device */ + + struct cudaExtent extent; /**< Requested memory copy size */ +}; + +/** + * CUDA Memset node parameters + */ +struct __device_builtin__ cudaMemsetParams { + void *dst; /**< Destination device pointer */ + size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ + unsigned int value; /**< Value to be set */ + unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. */ + size_t width; /**< Width of the row in elements */ + size_t height; /**< Number of rows */ +}; + +/** + * Specifies performance hint with ::cudaAccessPolicyWindow for hitProp and missProp members. + */ +enum __device_builtin__ cudaAccessProperty { + cudaAccessPropertyNormal = 0, /**< Normal cache persistence. */ + cudaAccessPropertyStreaming = 1, /**< Streaming access is less likely to persist in cache. */ + cudaAccessPropertyPersisting = 2 /**< Persisting access is more likely to persist in cache. */ +}; + +/** + * Specifies an access policy for a window, a contiguous extent of memory + * beginning at base_ptr and ending at base_ptr + num_bytes. + * Partition into many segments and assign segments such that: + * sum of "hit segments" / window == approx. ratio. 
+ * sum of "miss segments" / window == approx 1-ratio. + * Segments and ratio specifications are fitted to the capabilities of + * the architecture. + * Accesses in a hit segment apply the hitProp access policy. + * Accesses in a miss segment apply the missProp access policy. + */ +struct __device_builtin__ cudaAccessPolicyWindow { + void *base_ptr; /**< Starting address of the access policy window. CUDA driver may align it. */ + size_t num_bytes; /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */ + float hitRatio; /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */ + enum cudaAccessProperty hitProp; /**< ::cudaAccessProperty set for hit. */ + enum cudaAccessProperty missProp; /**< ::cudaAccessProperty set for miss. Must be either ::cudaAccessPropertyNormal or ::cudaAccessPropertyStreaming. */ +}; + +#ifdef _WIN32 +#define CUDART_CB __stdcall +#else +#define CUDART_CB +#endif + +/** + * CUDA host function + * \param userData Argument value passed to the function + */ +typedef void (CUDART_CB *cudaHostFn_t)(void *userData); + +/** + * CUDA host node parameters + */ +struct __device_builtin__ cudaHostNodeParams { + cudaHostFn_t fn; /**< The function to call when the node executes */ + void* userData; /**< Argument to pass to the function */ +}; + +/** + * Possible stream capture statuses returned by ::cudaStreamIsCapturing + */ +enum __device_builtin__ cudaStreamCaptureStatus { + cudaStreamCaptureStatusNone = 0, /**< Stream is not capturing */ + cudaStreamCaptureStatusActive = 1, /**< Stream is actively capturing */ + cudaStreamCaptureStatusInvalidated = 2 /**< Stream is part of a capture sequence that + has been invalidated, but not terminated */ +}; + +/** + * Possible modes for stream capture thread interactions. 
For more details see + * ::cudaStreamBeginCapture and ::cudaThreadExchangeStreamCaptureMode + */ +enum __device_builtin__ cudaStreamCaptureMode { + cudaStreamCaptureModeGlobal = 0, + cudaStreamCaptureModeThreadLocal = 1, + cudaStreamCaptureModeRelaxed = 2 +}; + +enum __device_builtin__ cudaSynchronizationPolicy { + cudaSyncPolicyAuto = 1, + cudaSyncPolicySpin = 2, + cudaSyncPolicyYield = 3, + cudaSyncPolicyBlockingSync = 4 +}; + +/** + * Cluster scheduling policies. These may be passed to ::cudaFuncSetAttribute + */ +enum __device_builtin__ cudaClusterSchedulingPolicy { + cudaClusterSchedulingPolicyDefault = 0, /**< the default policy */ + cudaClusterSchedulingPolicySpread = 1, /**< spread the blocks within a cluster to the SMs */ + cudaClusterSchedulingPolicyLoadBalancing = 2 /**< allow the hardware to load-balance the blocks in a cluster to the SMs */ +}; + +/** + * Flags for ::cudaStreamUpdateCaptureDependencies + */ +enum __device_builtin__ cudaStreamUpdateCaptureDependenciesFlags { + cudaStreamAddCaptureDependencies = 0x0, /**< Add new nodes to the dependency set */ + cudaStreamSetCaptureDependencies = 0x1 /**< Replace the dependency set with the new nodes */ +}; + +/** + * Flags for user objects for graphs + */ +enum __device_builtin__ cudaUserObjectFlags { + cudaUserObjectNoDestructorSync = 0x1 /**< Indicates the destructor execution is not synchronized by any CUDA handle. */ +}; + +/** + * Flags for retaining user object references for graphs + */ +enum __device_builtin__ cudaUserObjectRetainFlags { + cudaGraphUserObjectMove = 0x1 /**< Transfer references from the caller rather than creating new references. 
*/ +}; + +/** + * CUDA graphics interop resource + */ +struct cudaGraphicsResource; + +/** + * CUDA graphics interop register flags + */ +enum __device_builtin__ cudaGraphicsRegisterFlags +{ + cudaGraphicsRegisterFlagsNone = 0, /**< Default */ + cudaGraphicsRegisterFlagsReadOnly = 1, /**< CUDA will not write to this resource */ + cudaGraphicsRegisterFlagsWriteDiscard = 2, /**< CUDA will only write to and will not read from this resource */ + cudaGraphicsRegisterFlagsSurfaceLoadStore = 4, /**< CUDA will bind this resource to a surface reference */ + cudaGraphicsRegisterFlagsTextureGather = 8 /**< CUDA will perform texture gather operations on this resource */ +}; + +/** + * CUDA graphics interop map flags + */ +enum __device_builtin__ cudaGraphicsMapFlags +{ + cudaGraphicsMapFlagsNone = 0, /**< Default; Assume resource can be read/written */ + cudaGraphicsMapFlagsReadOnly = 1, /**< CUDA will not write to this resource */ + cudaGraphicsMapFlagsWriteDiscard = 2 /**< CUDA will only write to and will not read from this resource */ +}; + +/** + * CUDA graphics interop array indices for cube maps + */ +enum __device_builtin__ cudaGraphicsCubeFace +{ + cudaGraphicsCubeFacePositiveX = 0x00, /**< Positive X face of cubemap */ + cudaGraphicsCubeFaceNegativeX = 0x01, /**< Negative X face of cubemap */ + cudaGraphicsCubeFacePositiveY = 0x02, /**< Positive Y face of cubemap */ + cudaGraphicsCubeFaceNegativeY = 0x03, /**< Negative Y face of cubemap */ + cudaGraphicsCubeFacePositiveZ = 0x04, /**< Positive Z face of cubemap */ + cudaGraphicsCubeFaceNegativeZ = 0x05 /**< Negative Z face of cubemap */ +}; + +/** + * CUDA resource types + */ +enum __device_builtin__ cudaResourceType +{ + cudaResourceTypeArray = 0x00, /**< Array resource */ + cudaResourceTypeMipmappedArray = 0x01, /**< Mipmapped array resource */ + cudaResourceTypeLinear = 0x02, /**< Linear resource */ + cudaResourceTypePitch2D = 0x03 /**< Pitch 2D resource */ +}; + +/** + * CUDA texture resource view formats + */ 
+enum __device_builtin__ cudaResourceViewFormat +{ + cudaResViewFormatNone = 0x00, /**< No resource view format (use underlying resource format) */ + cudaResViewFormatUnsignedChar1 = 0x01, /**< 1 channel unsigned 8-bit integers */ + cudaResViewFormatUnsignedChar2 = 0x02, /**< 2 channel unsigned 8-bit integers */ + cudaResViewFormatUnsignedChar4 = 0x03, /**< 4 channel unsigned 8-bit integers */ + cudaResViewFormatSignedChar1 = 0x04, /**< 1 channel signed 8-bit integers */ + cudaResViewFormatSignedChar2 = 0x05, /**< 2 channel signed 8-bit integers */ + cudaResViewFormatSignedChar4 = 0x06, /**< 4 channel signed 8-bit integers */ + cudaResViewFormatUnsignedShort1 = 0x07, /**< 1 channel unsigned 16-bit integers */ + cudaResViewFormatUnsignedShort2 = 0x08, /**< 2 channel unsigned 16-bit integers */ + cudaResViewFormatUnsignedShort4 = 0x09, /**< 4 channel unsigned 16-bit integers */ + cudaResViewFormatSignedShort1 = 0x0a, /**< 1 channel signed 16-bit integers */ + cudaResViewFormatSignedShort2 = 0x0b, /**< 2 channel signed 16-bit integers */ + cudaResViewFormatSignedShort4 = 0x0c, /**< 4 channel signed 16-bit integers */ + cudaResViewFormatUnsignedInt1 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + cudaResViewFormatUnsignedInt2 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + cudaResViewFormatUnsignedInt4 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + cudaResViewFormatSignedInt1 = 0x10, /**< 1 channel signed 32-bit integers */ + cudaResViewFormatSignedInt2 = 0x11, /**< 2 channel signed 32-bit integers */ + cudaResViewFormatSignedInt4 = 0x12, /**< 4 channel signed 32-bit integers */ + cudaResViewFormatHalf1 = 0x13, /**< 1 channel 16-bit floating point */ + cudaResViewFormatHalf2 = 0x14, /**< 2 channel 16-bit floating point */ + cudaResViewFormatHalf4 = 0x15, /**< 4 channel 16-bit floating point */ + cudaResViewFormatFloat1 = 0x16, /**< 1 channel 32-bit floating point */ + cudaResViewFormatFloat2 = 0x17, /**< 2 channel 32-bit floating point */ + 
cudaResViewFormatFloat4 = 0x18, /**< 4 channel 32-bit floating point */ + cudaResViewFormatUnsignedBlockCompressed1 = 0x19, /**< Block compressed 1 */ + cudaResViewFormatUnsignedBlockCompressed2 = 0x1a, /**< Block compressed 2 */ + cudaResViewFormatUnsignedBlockCompressed3 = 0x1b, /**< Block compressed 3 */ + cudaResViewFormatUnsignedBlockCompressed4 = 0x1c, /**< Block compressed 4 unsigned */ + cudaResViewFormatSignedBlockCompressed4 = 0x1d, /**< Block compressed 4 signed */ + cudaResViewFormatUnsignedBlockCompressed5 = 0x1e, /**< Block compressed 5 unsigned */ + cudaResViewFormatSignedBlockCompressed5 = 0x1f, /**< Block compressed 5 signed */ + cudaResViewFormatUnsignedBlockCompressed6H = 0x20, /**< Block compressed 6 unsigned half-float */ + cudaResViewFormatSignedBlockCompressed6H = 0x21, /**< Block compressed 6 signed half-float */ + cudaResViewFormatUnsignedBlockCompressed7 = 0x22 /**< Block compressed 7 */ +}; + +/** + * CUDA resource descriptor + */ +struct __device_builtin__ cudaResourceDesc { + enum cudaResourceType resType; /**< Resource type */ + + union { + struct { + cudaArray_t array; /**< CUDA array */ + } array; + struct { + cudaMipmappedArray_t mipmap; /**< CUDA mipmapped array */ + } mipmap; + struct { + void *devPtr; /**< Device pointer */ + struct cudaChannelFormatDesc desc; /**< Channel descriptor */ + size_t sizeInBytes; /**< Size in bytes */ + } linear; + struct { + void *devPtr; /**< Device pointer */ + struct cudaChannelFormatDesc desc; /**< Channel descriptor */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ + } pitch2D; + } res; +}; + +/** + * CUDA resource view descriptor + */ +struct __device_builtin__ cudaResourceViewDesc +{ + enum cudaResourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of the resource view */ + size_t depth; 
/**< Depth of the resource view */ + unsigned int firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ +}; + +/** + * CUDA pointer attributes + */ +struct __device_builtin__ cudaPointerAttributes +{ + /** + * The type of memory - ::cudaMemoryTypeUnregistered, ::cudaMemoryTypeHost, + * ::cudaMemoryTypeDevice or ::cudaMemoryTypeManaged. + */ + enum cudaMemoryType type; + + /** + * The device against which the memory was allocated or registered. + * If the memory type is ::cudaMemoryTypeDevice then this identifies + * the device on which the memory referred physically resides. If + * the memory type is ::cudaMemoryTypeHost or::cudaMemoryTypeManaged then + * this identifies the device which was current when the memory was allocated + * or registered (and if that device is deinitialized then this allocation + * will vanish with that device's state). + */ + int device; + + /** + * The address which may be dereferenced on the current device to access + * the memory or NULL if no such address exists. + */ + void *devicePointer; + + /** + * The address which may be dereferenced on the host to access the + * memory or NULL if no such address exists. + * + * \note CUDA doesn't check if unregistered memory is allocated so this field + * may contain invalid pointer if an invalid pointer has been passed to CUDA. + */ + void *hostPointer; +}; + +/** + * CUDA function attributes + */ +struct __device_builtin__ cudaFuncAttributes +{ + /** + * The size in bytes of statically-allocated shared memory per block + * required by this function. This does not include dynamically-allocated + * shared memory requested by the user at runtime. + */ + size_t sharedSizeBytes; + + /** + * The size in bytes of user-allocated constant memory required by this + * function. 
+ */ + size_t constSizeBytes; + + /** + * The size in bytes of local memory used by each thread of this function. + */ + size_t localSizeBytes; + + /** + * The maximum number of threads per block, beyond which a launch of the + * function would fail. This number depends on both the function and the + * device on which the function is currently loaded. + */ + int maxThreadsPerBlock; + + /** + * The number of registers used by each thread of this function. + */ + int numRegs; + + /** + * The PTX virtual architecture version for which the function was + * compiled. This value is the major PTX version * 10 + the minor PTX + * version, so a PTX version 1.3 function would return the value 13. + */ + int ptxVersion; + + /** + * The binary architecture version for which the function was compiled. + * This value is the major binary version * 10 + the minor binary version, + * so a binary version 1.3 function would return the value 13. + */ + int binaryVersion; + + /** + * The attribute to indicate whether the function has been compiled with + * user specified option "-Xptxas --dlcm=ca" set. + */ + int cacheModeCA; + + /** + * The maximum size in bytes of dynamic shared memory per block for + * this function. Any launch must have a dynamic shared memory size + * smaller than this value. + */ + int maxDynamicSharedSizeBytes; + + /** + * On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the maximum shared memory. + * Refer to ::cudaDevAttrMaxSharedMemoryPerMultiprocessor. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. 
+ * See ::cudaFuncSetAttribute + */ + int preferredShmemCarveout; +}; + +/** + * CUDA function attributes that can be set using ::cudaFuncSetAttribute + */ +enum __device_builtin__ cudaFuncAttribute +{ + cudaFuncAttributeMaxDynamicSharedMemorySize = 8, /**< Maximum dynamic shared memory size */ + cudaFuncAttributePreferredSharedMemoryCarveout = 9, /**< Preferred shared memory-L1 cache split */ + cudaFuncAttributeClusterDimMustBeSet = 10, /**< Indicator to enforce valid cluster dimension specification on kernel launch */ + cudaFuncAttributeRequiredClusterWidth = 11, /**< Required cluster width */ + cudaFuncAttributeRequiredClusterHeight = 12, /**< Required cluster height */ + cudaFuncAttributeRequiredClusterDepth = 13, /**< Required cluster depth */ + cudaFuncAttributeNonPortableClusterSizeAllowed = 14, /**< Whether non-portable cluster scheduling policy is supported */ + cudaFuncAttributeClusterSchedulingPolicyPreference = 15, /**< Required cluster scheduling policy preference */ + cudaFuncAttributeMax +}; + +/** + * CUDA function cache configurations + */ +enum __device_builtin__ cudaFuncCache +{ + cudaFuncCachePreferNone = 0, /**< Default function cache configuration, no preference */ + cudaFuncCachePreferShared = 1, /**< Prefer larger shared memory and smaller L1 cache */ + cudaFuncCachePreferL1 = 2, /**< Prefer larger L1 cache and smaller shared memory */ + cudaFuncCachePreferEqual = 3 /**< Prefer equal size L1 cache and shared memory */ +}; + +/** + * CUDA shared memory configuration + */ + +enum __device_builtin__ cudaSharedMemConfig +{ + cudaSharedMemBankSizeDefault = 0, + cudaSharedMemBankSizeFourByte = 1, + cudaSharedMemBankSizeEightByte = 2 +}; + +/** + * Shared memory carveout configurations. 
These may be passed to cudaFuncSetAttribute + */ +enum __device_builtin__ cudaSharedCarveout { + cudaSharedmemCarveoutDefault = -1, /**< No preference for shared memory or L1 (default) */ + cudaSharedmemCarveoutMaxShared = 100, /**< Prefer maximum available shared memory, minimum L1 cache */ + cudaSharedmemCarveoutMaxL1 = 0 /**< Prefer maximum available L1 cache, minimum shared memory */ +}; + +/** + * CUDA device compute modes + */ +enum __device_builtin__ cudaComputeMode +{ + cudaComputeModeDefault = 0, /**< Default compute mode (Multiple threads can use ::cudaSetDevice() with this device) */ + cudaComputeModeExclusive = 1, /**< Compute-exclusive-thread mode (Only one thread in one process will be able to use ::cudaSetDevice() with this device) */ + cudaComputeModeProhibited = 2, /**< Compute-prohibited mode (No threads can use ::cudaSetDevice() with this device) */ + cudaComputeModeExclusiveProcess = 3 /**< Compute-exclusive-process mode (Many threads in one process will be able to use ::cudaSetDevice() with this device) */ +}; + +/** + * CUDA Limits + */ +enum __device_builtin__ cudaLimit +{ + cudaLimitStackSize = 0x00, /**< GPU thread stack size */ + cudaLimitPrintfFifoSize = 0x01, /**< GPU printf FIFO size */ + cudaLimitMallocHeapSize = 0x02, /**< GPU malloc heap size */ + cudaLimitDevRuntimeSyncDepth = 0x03, /**< GPU device runtime synchronize depth */ + cudaLimitDevRuntimePendingLaunchCount = 0x04, /**< GPU device runtime pending launch count */ + cudaLimitMaxL2FetchGranularity = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). 
This is a hint */ + cudaLimitPersistingL2CacheSize = 0x06 /**< A size in bytes for L2 persisting lines cache size */ +}; + +/** + * CUDA Memory Advise values + */ +enum __device_builtin__ cudaMemoryAdvise +{ + cudaMemAdviseSetReadMostly = 1, /**< Data will mostly be read and only occasionally be written to */ + cudaMemAdviseUnsetReadMostly = 2, /**< Undo the effect of ::cudaMemAdviseSetReadMostly */ + cudaMemAdviseSetPreferredLocation = 3, /**< Set the preferred location for the data as the specified device */ + cudaMemAdviseUnsetPreferredLocation = 4, /**< Clear the preferred location for the data */ + cudaMemAdviseSetAccessedBy = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */ + cudaMemAdviseUnsetAccessedBy = 6 /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ +}; + +/** + * CUDA range attributes + */ +enum __device_builtin__ cudaMemRangeAttribute +{ + cudaMemRangeAttributeReadMostly = 1, /**< Whether the range will mostly be read and only occasionally be written to */ + cudaMemRangeAttributePreferredLocation = 2, /**< The preferred location of the range */ + cudaMemRangeAttributeAccessedBy = 3, /**< Memory range has ::cudaMemAdviseSetAccessedBy set for specified device */ + cudaMemRangeAttributeLastPrefetchLocation = 4 /**< The last location to which the range was prefetched */ +}; + +/** + * CUDA Profiler Output modes + */ +enum __device_builtin__ cudaOutputMode +{ + cudaKeyValuePair = 0x00, /**< Output mode Key-Value pair format. */ + cudaCSV = 0x01 /**< Output mode Comma separated values format. */ +}; + +/** + * CUDA GPUDirect RDMA flush writes APIs supported on the device + */ +enum __device_builtin__ cudaFlushGPUDirectRDMAWritesOptions { + cudaFlushGPUDirectRDMAWritesOptionHost = 1<<0, /**< ::cudaDeviceFlushGPUDirectRDMAWrites() and its CUDA Driver API counterpart are supported on the device.
*/ + cudaFlushGPUDirectRDMAWritesOptionMemOps = 1<<1 /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the CUDA device. */ +}; + +/** + * CUDA GPUDirect RDMA flush writes ordering features of the device + */ +enum __device_builtin__ cudaGPUDirectRDMAWritesOrdering { + cudaGPUDirectRDMAWritesOrderingNone = 0, /**< The device does not natively support ordering of GPUDirect RDMA writes. ::cudaFlushGPUDirectRDMAWrites() can be leveraged if supported. */ + cudaGPUDirectRDMAWritesOrderingOwner = 100, /**< Natively, the device can consistently consume GPUDirect RDMA writes, although other CUDA devices may not. */ + cudaGPUDirectRDMAWritesOrderingAllDevices = 200 /**< Any CUDA device in the system can consistently consume GPUDirect RDMA writes to this device. */ +}; + +/** + * CUDA GPUDirect RDMA flush writes scopes + */ +enum __device_builtin__ cudaFlushGPUDirectRDMAWritesScope { + cudaFlushGPUDirectRDMAWritesToOwner = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */ + cudaFlushGPUDirectRDMAWritesToAllDevices = 200 /**< Blocks until remote writes are visible to all CUDA device contexts. */ +}; + +/** + * CUDA GPUDirect RDMA flush writes targets + */ +enum __device_builtin__ cudaFlushGPUDirectRDMAWritesTarget { + cudaFlushGPUDirectRDMAWritesTargetCurrentDevice /**< Sets the target for ::cudaDeviceFlushGPUDirectRDMAWrites() to the currently active CUDA device context. 
*/ +}; + + +/** + * CUDA device attributes + */ +enum __device_builtin__ cudaDeviceAttr +{ + cudaDevAttrMaxThreadsPerBlock = 1, /**< Maximum number of threads per block */ + cudaDevAttrMaxBlockDimX = 2, /**< Maximum block dimension X */ + cudaDevAttrMaxBlockDimY = 3, /**< Maximum block dimension Y */ + cudaDevAttrMaxBlockDimZ = 4, /**< Maximum block dimension Z */ + cudaDevAttrMaxGridDimX = 5, /**< Maximum grid dimension X */ + cudaDevAttrMaxGridDimY = 6, /**< Maximum grid dimension Y */ + cudaDevAttrMaxGridDimZ = 7, /**< Maximum grid dimension Z */ + cudaDevAttrMaxSharedMemoryPerBlock = 8, /**< Maximum shared memory available per block in bytes */ + cudaDevAttrTotalConstantMemory = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ + cudaDevAttrWarpSize = 10, /**< Warp size in threads */ + cudaDevAttrMaxPitch = 11, /**< Maximum pitch in bytes allowed by memory copies */ + cudaDevAttrMaxRegistersPerBlock = 12, /**< Maximum number of 32-bit registers available per block */ + cudaDevAttrClockRate = 13, /**< Peak clock frequency in kilohertz */ + cudaDevAttrTextureAlignment = 14, /**< Alignment requirement for textures */ + cudaDevAttrGpuOverlap = 15, /**< Device can possibly copy memory and execute a kernel concurrently */ + cudaDevAttrMultiProcessorCount = 16, /**< Number of multiprocessors on device */ + cudaDevAttrKernelExecTimeout = 17, /**< Specifies whether there is a run time limit on kernels */ + cudaDevAttrIntegrated = 18, /**< Device is integrated with host memory */ + cudaDevAttrCanMapHostMemory = 19, /**< Device can map host memory into CUDA address space */ + cudaDevAttrComputeMode = 20, /**< Compute mode (See ::cudaComputeMode for details) */ + cudaDevAttrMaxTexture1DWidth = 21, /**< Maximum 1D texture width */ + cudaDevAttrMaxTexture2DWidth = 22, /**< Maximum 2D texture width */ + cudaDevAttrMaxTexture2DHeight = 23, /**< Maximum 2D texture height */ + cudaDevAttrMaxTexture3DWidth = 24, /**< Maximum 3D texture 
width */ + cudaDevAttrMaxTexture3DHeight = 25, /**< Maximum 3D texture height */ + cudaDevAttrMaxTexture3DDepth = 26, /**< Maximum 3D texture depth */ + cudaDevAttrMaxTexture2DLayeredWidth = 27, /**< Maximum 2D layered texture width */ + cudaDevAttrMaxTexture2DLayeredHeight = 28, /**< Maximum 2D layered texture height */ + cudaDevAttrMaxTexture2DLayeredLayers = 29, /**< Maximum layers in a 2D layered texture */ + cudaDevAttrSurfaceAlignment = 30, /**< Alignment requirement for surfaces */ + cudaDevAttrConcurrentKernels = 31, /**< Device can possibly execute multiple kernels concurrently */ + cudaDevAttrEccEnabled = 32, /**< Device has ECC support enabled */ + cudaDevAttrPciBusId = 33, /**< PCI bus ID of the device */ + cudaDevAttrPciDeviceId = 34, /**< PCI device ID of the device */ + cudaDevAttrTccDriver = 35, /**< Device is using TCC driver model */ + cudaDevAttrMemoryClockRate = 36, /**< Peak memory clock frequency in kilohertz */ + cudaDevAttrGlobalMemoryBusWidth = 37, /**< Global memory bus width in bits */ + cudaDevAttrL2CacheSize = 38, /**< Size of L2 cache in bytes */ + cudaDevAttrMaxThreadsPerMultiProcessor = 39, /**< Maximum resident threads per multiprocessor */ + cudaDevAttrAsyncEngineCount = 40, /**< Number of asynchronous engines */ + cudaDevAttrUnifiedAddressing = 41, /**< Device shares a unified address space with the host */ + cudaDevAttrMaxTexture1DLayeredWidth = 42, /**< Maximum 1D layered texture width */ + cudaDevAttrMaxTexture1DLayeredLayers = 43, /**< Maximum layers in a 1D layered texture */ + cudaDevAttrMaxTexture2DGatherWidth = 45, /**< Maximum 2D texture width if cudaArrayTextureGather is set */ + cudaDevAttrMaxTexture2DGatherHeight = 46, /**< Maximum 2D texture height if cudaArrayTextureGather is set */ + cudaDevAttrMaxTexture3DWidthAlt = 47, /**< Alternate maximum 3D texture width */ + cudaDevAttrMaxTexture3DHeightAlt = 48, /**< Alternate maximum 3D texture height */ + cudaDevAttrMaxTexture3DDepthAlt = 49, /**< Alternate maximum 3D 
texture depth */ + cudaDevAttrPciDomainId = 50, /**< PCI domain ID of the device */ + cudaDevAttrTexturePitchAlignment = 51, /**< Pitch alignment requirement for textures */ + cudaDevAttrMaxTextureCubemapWidth = 52, /**< Maximum cubemap texture width/height */ + cudaDevAttrMaxTextureCubemapLayeredWidth = 53, /**< Maximum cubemap layered texture width/height */ + cudaDevAttrMaxTextureCubemapLayeredLayers = 54, /**< Maximum layers in a cubemap layered texture */ + cudaDevAttrMaxSurface1DWidth = 55, /**< Maximum 1D surface width */ + cudaDevAttrMaxSurface2DWidth = 56, /**< Maximum 2D surface width */ + cudaDevAttrMaxSurface2DHeight = 57, /**< Maximum 2D surface height */ + cudaDevAttrMaxSurface3DWidth = 58, /**< Maximum 3D surface width */ + cudaDevAttrMaxSurface3DHeight = 59, /**< Maximum 3D surface height */ + cudaDevAttrMaxSurface3DDepth = 60, /**< Maximum 3D surface depth */ + cudaDevAttrMaxSurface1DLayeredWidth = 61, /**< Maximum 1D layered surface width */ + cudaDevAttrMaxSurface1DLayeredLayers = 62, /**< Maximum layers in a 1D layered surface */ + cudaDevAttrMaxSurface2DLayeredWidth = 63, /**< Maximum 2D layered surface width */ + cudaDevAttrMaxSurface2DLayeredHeight = 64, /**< Maximum 2D layered surface height */ + cudaDevAttrMaxSurface2DLayeredLayers = 65, /**< Maximum layers in a 2D layered surface */ + cudaDevAttrMaxSurfaceCubemapWidth = 66, /**< Maximum cubemap surface width */ + cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67, /**< Maximum cubemap layered surface width */ + cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68, /**< Maximum layers in a cubemap layered surface */ + cudaDevAttrMaxTexture1DLinearWidth = 69, /**< Maximum 1D linear texture width */ + cudaDevAttrMaxTexture2DLinearWidth = 70, /**< Maximum 2D linear texture width */ + cudaDevAttrMaxTexture2DLinearHeight = 71, /**< Maximum 2D linear texture height */ + cudaDevAttrMaxTexture2DLinearPitch = 72, /**< Maximum 2D linear texture pitch in bytes */ + cudaDevAttrMaxTexture2DMipmappedWidth = 73, 
/**< Maximum mipmapped 2D texture width */ + cudaDevAttrMaxTexture2DMipmappedHeight = 74, /**< Maximum mipmapped 2D texture height */ + cudaDevAttrComputeCapabilityMajor = 75, /**< Major compute capability version number */ + cudaDevAttrComputeCapabilityMinor = 76, /**< Minor compute capability version number */ + cudaDevAttrMaxTexture1DMipmappedWidth = 77, /**< Maximum mipmapped 1D texture width */ + cudaDevAttrStreamPrioritiesSupported = 78, /**< Device supports stream priorities */ + cudaDevAttrGlobalL1CacheSupported = 79, /**< Device supports caching globals in L1 */ + cudaDevAttrLocalL1CacheSupported = 80, /**< Device supports caching locals in L1 */ + cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81, /**< Maximum shared memory available per multiprocessor in bytes */ + cudaDevAttrMaxRegistersPerMultiprocessor = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ + cudaDevAttrManagedMemory = 83, /**< Device can allocate managed memory on this system */ + cudaDevAttrIsMultiGpuBoard = 84, /**< Device is on a multi-GPU board */ + cudaDevAttrMultiGpuBoardGroupID = 85, /**< Unique identifier for a group of devices on the same multi-GPU board */ + cudaDevAttrHostNativeAtomicSupported = 86, /**< Link between the device and the host supports native atomic operations */ + cudaDevAttrSingleToDoublePrecisionPerfRatio = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + cudaDevAttrPageableMemoryAccess = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + cudaDevAttrConcurrentManagedAccess = 89, /**< Device can coherently access managed memory concurrently with the CPU */ + cudaDevAttrComputePreemptionSupported = 90, /**< Device supports Compute Preemption */ + cudaDevAttrCanUseHostPointerForRegisteredMem = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ + cudaDevAttrReserved92 = 
92, + cudaDevAttrReserved93 = 93, + cudaDevAttrReserved94 = 94, + cudaDevAttrCooperativeLaunch = 95, /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel*/ + cudaDevAttrCooperativeMultiDeviceLaunch = 96, /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */ + cudaDevAttrMaxSharedMemoryPerBlockOptin = 97, /**< The maximum optin shared memory per block. This value may vary by chip. See ::cudaFuncSetAttribute */ + cudaDevAttrCanFlushRemoteWrites = 98, /**< Device supports flushing of outstanding remote writes. */ + cudaDevAttrHostRegisterSupported = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ + cudaDevAttrPageableMemoryAccessUsesHostPageTables = 100, /**< Device accesses pageable memory via the host's page tables. */ + cudaDevAttrDirectManagedMemAccessFromHost = 101, /**< Host can directly access managed memory on the device without migration. */ + cudaDevAttrMaxBlocksPerMultiprocessor = 106, /**< Maximum number of blocks per multiprocessor */ + cudaDevAttrMaxPersistingL2CacheSize = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */ + cudaDevAttrMaxAccessPolicyWindowSize = 109, /**< Maximum value of cudaAccessPolicyWindow::num_bytes. 
*/ + cudaDevAttrReservedSharedMemoryPerBlock = 111, /**< Shared memory reserved by CUDA driver per block in bytes */ + cudaDevAttrSparseCudaArraySupported = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */ + cudaDevAttrHostRegisterReadOnlySupported = 113, /**< Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU */ + cudaDevAttrTimelineSemaphoreInteropSupported = 114, /**< External timeline semaphore interop is supported on the device */ + cudaDevAttrMaxTimelineSemaphoreInteropSupported = 114, /**< Deprecated, External timeline semaphore interop is supported on the device */ + cudaDevAttrMemoryPoolsSupported = 115, /**< Device supports using the ::cudaMallocAsync and ::cudaMemPool family of APIs */ + cudaDevAttrGPUDirectRDMASupported = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */ + cudaDevAttrGPUDirectRDMAFlushWritesOptions = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are listed in the ::cudaFlushGPUDirectRDMAWritesOptions enum */ + cudaDevAttrGPUDirectRDMAWritesOrdering = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::cudaGPUDirectRDMAWritesOrdering for the numerical values returned here. 
*/ + cudaDevAttrMemoryPoolSupportedHandleTypes = 119, /**< Handle types supported with mempool based IPC */ + cudaDevAttrClusterLaunch = 120, /**< Indicates device supports cluster launch */ + cudaDevAttrDeferredMappingCudaArraySupported = 121, /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */ + cudaDevAttrMax +}; + +/** + * CUDA memory pool attributes + */ +enum __device_builtin__ cudaMemPoolAttr +{ + /** + * (value type = int) + * Allow cuMemAllocAsync to use memory asynchronously freed + * in another stream as long as a stream ordering dependency + * of the allocating stream on the free action exists. + * CUDA events and null stream interactions can create the required + * stream ordered dependencies. (default enabled) + */ + cudaMemPoolReuseFollowEventDependencies = 0x1, + + /** + * (value type = int) + * Allow reuse of already completed frees when there is no dependency + * between the free and allocation. (default enabled) + */ + cudaMemPoolReuseAllowOpportunistic = 0x2, + + /** + * (value type = int) + * Allow cuMemAllocAsync to insert new stream dependencies + * in order to establish the stream ordering required to reuse + * a piece of memory released by cuFreeAsync (default enabled). + */ + cudaMemPoolReuseAllowInternalDependencies = 0x3, + + + /** + * (value type = cuuint64_t) + * Amount of reserved memory in bytes to hold onto before trying + * to release memory back to the OS. When more than the release + * threshold bytes of memory are held by the memory pool, the + * allocator will try to release memory back to the OS on the + * next call to stream, event or context synchronize. (default 0) + */ + cudaMemPoolAttrReleaseThreshold = 0x4, + + /** + * (value type = cuuint64_t) + * Amount of backing memory currently allocated for the mempool. + */ + cudaMemPoolAttrReservedMemCurrent = 0x5, + + /** + * (value type = cuuint64_t) + * High watermark of backing memory allocated for the mempool since the + * last time it was reset.
High watermark can only be reset to zero. + */ + cudaMemPoolAttrReservedMemHigh = 0x6, + + /** + * (value type = cuuint64_t) + * Amount of memory from the pool that is currently in use by the application. + */ + cudaMemPoolAttrUsedMemCurrent = 0x7, + + /** + * (value type = cuuint64_t) + * High watermark of the amount of memory from the pool that was in use by the application since + * the last time it was reset. High watermark can only be reset to zero. + */ + cudaMemPoolAttrUsedMemHigh = 0x8 +}; + +/** + * Specifies the type of location + */ +enum __device_builtin__ cudaMemLocationType { + cudaMemLocationTypeInvalid = 0, + cudaMemLocationTypeDevice = 1 /**< Location is a device location, thus id is a device ordinal */ +}; + +/** + * Specifies a memory location. + * + * To specify a gpu, set type = ::cudaMemLocationTypeDevice and set id = the gpu's device ordinal. + */ +struct __device_builtin__ cudaMemLocation { + enum cudaMemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */ + int id; /**< Identifier whose meaning depends on type; a device ordinal when type is ::cudaMemLocationTypeDevice. */ +}; + +/** + * Specifies the memory protection flags for mapping. + */ +enum __device_builtin__ cudaMemAccessFlags { + cudaMemAccessFlagsProtNone = 0, /**< Default, make the address range not accessible */ + cudaMemAccessFlagsProtRead = 1, /**< Make the address range read accessible */ + cudaMemAccessFlagsProtReadWrite = 3 /**< Make the address range read-write accessible */ +}; + +/** + * Memory access descriptor + */ +struct __device_builtin__ cudaMemAccessDesc { + struct cudaMemLocation location; /**< Location on which the request is to change its accessibility */ + enum cudaMemAccessFlags flags; /**< ::cudaMemAccessFlags accessibility flags to set on the request */ +}; + +/** + * Defines the allocation types available + */ +enum __device_builtin__ cudaMemAllocationType { + cudaMemAllocationTypeInvalid = 0x0, + /** This allocation type is 'pinned', i.e.
cannot migrate from its current + * location while the application is actively using it + */ + cudaMemAllocationTypePinned = 0x1, + cudaMemAllocationTypeMax = 0x7FFFFFFF +}; + +/** + * Flags for specifying particular handle types + */ +enum __device_builtin__ cudaMemAllocationHandleType { + cudaMemHandleTypeNone = 0x0, /**< Does not allow any export mechanism. */ + cudaMemHandleTypePosixFileDescriptor = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ + cudaMemHandleTypeWin32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ + cudaMemHandleTypeWin32Kmt = 0x4 /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ +}; + +/** + * Specifies the properties of allocations made from the pool. + */ +struct __device_builtin__ cudaMemPoolProps { + enum cudaMemAllocationType allocType; /**< Allocation type. Currently must be specified as cudaMemAllocationTypePinned */ + enum cudaMemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */ + struct cudaMemLocation location; /**< Location where allocations should reside. */ + /** + * Windows-specific LPSECURITYATTRIBUTES required when + * ::cudaMemHandleTypeWin32 is specified. This security attribute defines + * the scope of which exported allocations may be transferred to other + * processes. In all other cases, this field is required to be zero. + */ + void *win32SecurityAttributes; + unsigned char reserved[64]; /**< reserved for future use, must be 0 */ +}; + +/** + * Opaque data for exporting a pool allocation + */ +struct __device_builtin__ cudaMemPoolPtrExportData { + unsigned char reserved[64]; +}; + +/** + * Memory allocation node parameters + */ +struct __device_builtin__ cudaMemAllocNodeParams { + /** + * in: location where the allocation should reside (specified in ::location). + * ::handleTypes must be ::cudaMemHandleTypeNone. IPC is not supported. 
+ */ + struct cudaMemPoolProps poolProps; + const struct cudaMemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */ + size_t accessDescCount; /**< in: number of memory access descriptors in accessDescs. Must not exceed the number of GPUs. */ + size_t bytesize; /**< in: size in bytes of the requested allocation */ + void *dptr; /**< out: address of the allocation returned by CUDA */ +}; + +/** + * Graph memory attributes + */ +enum __device_builtin__ cudaGraphMemAttributeType { + /** + * (value type = cuuint64_t) + * Amount of memory, in bytes, currently associated with graphs. + */ + cudaGraphMemAttrUsedMemCurrent = 0x0, + + /** + * (value type = cuuint64_t) + * High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + */ + cudaGraphMemAttrUsedMemHigh = 0x1, + + /** + * (value type = cuuint64_t) + * Amount of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + */ + cudaGraphMemAttrReservedMemCurrent = 0x2, + + /** + * (value type = cuuint64_t) + * High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. 
+ */ + cudaGraphMemAttrReservedMemHigh = 0x3 +}; + +/** + * CUDA device P2P attributes + */ + +enum __device_builtin__ cudaDeviceP2PAttr { + cudaDevP2PAttrPerformanceRank = 1, /**< A relative value indicating the performance of the link between two devices */ + cudaDevP2PAttrAccessSupported = 2, /**< Peer access is enabled */ + cudaDevP2PAttrNativeAtomicSupported = 3, /**< Native atomic operation over the link supported */ + cudaDevP2PAttrCudaArrayAccessSupported = 4 /**< Accessing CUDA arrays over the link supported */ +}; + +/** + * CUDA UUID types + */ +#ifndef CU_UUID_HAS_BEEN_DEFINED +#define CU_UUID_HAS_BEEN_DEFINED +struct __device_builtin__ CUuuid_st { /**< CUDA definition of UUID */ + char bytes[16]; +}; +typedef __device_builtin__ struct CUuuid_st CUuuid; +#endif +typedef __device_builtin__ struct CUuuid_st cudaUUID_t; + +/** + * CUDA device properties + */ +struct __device_builtin__ cudaDeviceProp +{ + char name[256]; /**< ASCII string identifying device */ + cudaUUID_t uuid; /**< 16-byte unique identifier */ + char luid[8]; /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */ + unsigned int luidDeviceNodeMask; /**< LUID device node mask. 
Value is undefined on TCC and non-Windows platforms */ + size_t totalGlobalMem; /**< Global memory available on device in bytes */ + size_t sharedMemPerBlock; /**< Shared memory available per block in bytes */ + int regsPerBlock; /**< 32-bit registers available per block */ + int warpSize; /**< Warp size in threads */ + size_t memPitch; /**< Maximum pitch in bytes allowed by memory copies */ + int maxThreadsPerBlock; /**< Maximum number of threads per block */ + int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ + int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ + int clockRate; /**< Clock frequency in kilohertz */ + size_t totalConstMem; /**< Constant memory available on device in bytes */ + int major; /**< Major compute capability */ + int minor; /**< Minor compute capability */ + size_t textureAlignment; /**< Alignment requirement for textures */ + size_t texturePitchAlignment; /**< Pitch alignment requirement for texture references bound to pitched memory */ + int deviceOverlap; /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */ + int multiProcessorCount; /**< Number of multiprocessors on device */ + int kernelExecTimeoutEnabled; /**< Specified whether there is a run time limit on kernels */ + int integrated; /**< Device is integrated as opposed to discrete */ + int canMapHostMemory; /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */ + int computeMode; /**< Compute mode (See ::cudaComputeMode) */ + int maxTexture1D; /**< Maximum 1D texture size */ + int maxTexture1DMipmap; /**< Maximum 1D mipmapped texture size */ + int maxTexture1DLinear; /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. 
*/ + int maxTexture2D[2]; /**< Maximum 2D texture dimensions */ + int maxTexture2DMipmap[2]; /**< Maximum 2D mipmapped texture dimensions */ + int maxTexture2DLinear[3]; /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */ + int maxTexture2DGather[2]; /**< Maximum 2D texture dimensions if texture gather operations have to be performed */ + int maxTexture3D[3]; /**< Maximum 3D texture dimensions */ + int maxTexture3DAlt[3]; /**< Maximum alternate 3D texture dimensions */ + int maxTextureCubemap; /**< Maximum Cubemap texture dimensions */ + int maxTexture1DLayered[2]; /**< Maximum 1D layered texture dimensions */ + int maxTexture2DLayered[3]; /**< Maximum 2D layered texture dimensions */ + int maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */ + int maxSurface1D; /**< Maximum 1D surface size */ + int maxSurface2D[2]; /**< Maximum 2D surface dimensions */ + int maxSurface3D[3]; /**< Maximum 3D surface dimensions */ + int maxSurface1DLayered[2]; /**< Maximum 1D layered surface dimensions */ + int maxSurface2DLayered[3]; /**< Maximum 2D layered surface dimensions */ + int maxSurfaceCubemap; /**< Maximum Cubemap surface dimensions */ + int maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */ + size_t surfaceAlignment; /**< Alignment requirements for surfaces */ + int concurrentKernels; /**< Device can possibly execute multiple kernels concurrently */ + int ECCEnabled; /**< Device has ECC support enabled */ + int pciBusID; /**< PCI bus ID of the device */ + int pciDeviceID; /**< PCI device ID of the device */ + int pciDomainID; /**< PCI domain ID of the device */ + int tccDriver; /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */ + int asyncEngineCount; /**< Number of asynchronous engines */ + int unifiedAddressing; /**< Device shares a unified address space with the host */ + int memoryClockRate; /**< Peak memory clock frequency in kilohertz */ + int 
memoryBusWidth; /**< Global memory bus width in bits */ + int l2CacheSize; /**< Size of L2 cache in bytes */ + int persistingL2CacheMaxSize; /**< Device's maximum l2 persisting lines capacity setting in bytes */ + int maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */ + int streamPrioritiesSupported; /**< Device supports stream priorities */ + int globalL1CacheSupported; /**< Device supports caching globals in L1 */ + int localL1CacheSupported; /**< Device supports caching locals in L1 */ + size_t sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */ + int regsPerMultiprocessor; /**< 32-bit registers available per multiprocessor */ + int managedMemory; /**< Device supports allocating managed memory on this system */ + int isMultiGpuBoard; /**< Device is on a multi-GPU board */ + int multiGpuBoardGroupID; /**< Unique identifier for a group of devices on the same multi-GPU board */ + int hostNativeAtomicSupported; /**< Link between the device and the host supports native atomic operations */ + int singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + int pageableMemoryAccess; /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + int concurrentManagedAccess; /**< Device can coherently access managed memory concurrently with the CPU */ + int computePreemptionSupported; /**< Device supports Compute Preemption */ + int canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */ + int cooperativeLaunch; /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */ + int cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. 
*/ + size_t sharedMemPerBlockOptin; /**< Per device maximum shared memory per block usable by special opt in */ + int pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */ + int directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */ + int maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */ + int accessPolicyMaxWindowSize; /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */ + size_t reservedSharedMemPerBlock; /**< Shared memory reserved by CUDA driver per block in bytes */ +}; + +#define cudaDevicePropDontCare \ + { \ + {'\0'}, /* char name[256]; */ \ + {{0}}, /* cudaUUID_t uuid; */ \ + {'\0'}, /* char luid[8]; */ \ + 0, /* unsigned int luidDeviceNodeMask */ \ + 0, /* size_t totalGlobalMem; */ \ + 0, /* size_t sharedMemPerBlock; */ \ + 0, /* int regsPerBlock; */ \ + 0, /* int warpSize; */ \ + 0, /* size_t memPitch; */ \ + 0, /* int maxThreadsPerBlock; */ \ + {0, 0, 0}, /* int maxThreadsDim[3]; */ \ + {0, 0, 0}, /* int maxGridSize[3]; */ \ + 0, /* int clockRate; */ \ + 0, /* size_t totalConstMem; */ \ + -1, /* int major; */ \ + -1, /* int minor; */ \ + 0, /* size_t textureAlignment; */ \ + 0, /* size_t texturePitchAlignment */ \ + -1, /* int deviceOverlap; */ \ + 0, /* int multiProcessorCount; */ \ + 0, /* int kernelExecTimeoutEnabled */ \ + 0, /* int integrated */ \ + 0, /* int canMapHostMemory */ \ + 0, /* int computeMode */ \ + 0, /* int maxTexture1D */ \ + 0, /* int maxTexture1DMipmap */ \ + 0, /* int maxTexture1DLinear */ \ + {0, 0}, /* int maxTexture2D[2] */ \ + {0, 0}, /* int maxTexture2DMipmap[2] */ \ + {0, 0, 0}, /* int maxTexture2DLinear[3] */ \ + {0, 0}, /* int maxTexture2DGather[2] */ \ + {0, 0, 0}, /* int maxTexture3D[3] */ \ + {0, 0, 0}, /* int maxTexture3DAlt[3] */ \ + 0, /* int maxTextureCubemap */ \ + {0, 0}, /* int maxTexture1DLayered[2] */ \ + {0, 0, 0}, /* int maxTexture2DLayered[3] */ \ 
+ {0, 0}, /* int maxTextureCubemapLayered[2] */ \ + 0, /* int maxSurface1D */ \ + {0, 0}, /* int maxSurface2D[2] */ \ + {0, 0, 0}, /* int maxSurface3D[3] */ \ + {0, 0}, /* int maxSurface1DLayered[2] */ \ + {0, 0, 0}, /* int maxSurface2DLayered[3] */ \ + 0, /* int maxSurfaceCubemap */ \ + {0, 0}, /* int maxSurfaceCubemapLayered[2] */ \ + 0, /* size_t surfaceAlignment */ \ + 0, /* int concurrentKernels */ \ + 0, /* int ECCEnabled */ \ + 0, /* int pciBusID */ \ + 0, /* int pciDeviceID */ \ + 0, /* int pciDomainID */ \ + 0, /* int tccDriver */ \ + 0, /* int asyncEngineCount */ \ + 0, /* int unifiedAddressing */ \ + 0, /* int memoryClockRate */ \ + 0, /* int memoryBusWidth */ \ + 0, /* int l2CacheSize */ \ + 0, /* int persistingL2CacheMaxSize */ \ + 0, /* int maxThreadsPerMultiProcessor */ \ + 0, /* int streamPrioritiesSupported */ \ + 0, /* int globalL1CacheSupported */ \ + 0, /* int localL1CacheSupported */ \ + 0, /* size_t sharedMemPerMultiprocessor; */ \ + 0, /* int regsPerMultiprocessor; */ \ + 0, /* int managedMemory */ \ + 0, /* int isMultiGpuBoard */ \ + 0, /* int multiGpuBoardGroupID */ \ + 0, /* int hostNativeAtomicSupported */ \ + 0, /* int singleToDoublePrecisionPerfRatio */ \ + 0, /* int pageableMemoryAccess */ \ + 0, /* int concurrentManagedAccess */ \ + 0, /* int computePreemptionSupported */ \ + 0, /* int canUseHostPointerForRegisteredMem */ \ + 0, /* int cooperativeLaunch */ \ + 0, /* int cooperativeMultiDeviceLaunch */ \ + 0, /* size_t sharedMemPerBlockOptin */ \ + 0, /* int pageableMemoryAccessUsesHostPageTables */ \ + 0, /* int directManagedMemAccessFromHost */ \ + 0, /* int maxBlocksPerMultiProcessor */ \ + 0, /* int accessPolicyMaxWindowSize; size_t reservedSharedMemPerBlock has no explicit initializer and is implicitly zero-initialized */ \ + } /**< Empty device properties */ + +/** + * CUDA IPC Handle Size + */ +#define CUDA_IPC_HANDLE_SIZE 64 + +/** + * CUDA IPC event handle + */ +typedef __device_builtin__ struct __device_builtin__ cudaIpcEventHandle_st +{ + char reserved[CUDA_IPC_HANDLE_SIZE]; +}cudaIpcEventHandle_t; + +/** + * CUDA IPC 
memory handle + */ +typedef __device_builtin__ struct __device_builtin__ cudaIpcMemHandle_st +{ + char reserved[CUDA_IPC_HANDLE_SIZE]; +}cudaIpcMemHandle_t; + +/** + * External memory handle types + */ +enum __device_builtin__ cudaExternalMemoryHandleType { + /** + * Handle is an opaque file descriptor + */ + cudaExternalMemoryHandleTypeOpaqueFd = 1, + /** + * Handle is an opaque shared NT handle + */ + cudaExternalMemoryHandleTypeOpaqueWin32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + cudaExternalMemoryHandleTypeOpaqueWin32Kmt = 3, + /** + * Handle is a D3D12 heap object + */ + cudaExternalMemoryHandleTypeD3D12Heap = 4, + /** + * Handle is a D3D12 committed resource + */ + cudaExternalMemoryHandleTypeD3D12Resource = 5, + /** + * Handle is a shared NT handle to a D3D11 resource + */ + cudaExternalMemoryHandleTypeD3D11Resource = 6, + /** + * Handle is a globally shared handle to a D3D11 resource + */ + cudaExternalMemoryHandleTypeD3D11ResourceKmt = 7, + /** + * Handle is an NvSciBuf object + */ + cudaExternalMemoryHandleTypeNvSciBuf = 8 +}; + +/** + * Indicates that the external memory object is a dedicated resource + */ +#define cudaExternalMemoryDedicated 0x1 + +/** When the /p flags parameter of ::cudaExternalSemaphoreSignalParams + * contains this flag, it indicates that signaling an external semaphore object + * should skip performing appropriate memory synchronization operations over all + * the external memory objects that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf, + * which otherwise are performed by default to ensure data coherency with other + * importers of the same NvSciBuf memory objects. 
+ */ +#define cudaExternalSemaphoreSignalSkipNvSciBufMemSync 0x01 + +/** When the \p flags parameter of ::cudaExternalSemaphoreWaitParams + * contains this flag, it indicates that waiting on an external semaphore object + * should skip performing appropriate memory synchronization operations over all + * the external memory objects that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf, + * which otherwise are performed by default to ensure data coherency with other + * importers of the same NvSciBuf memory objects. + */ +#define cudaExternalSemaphoreWaitSkipNvSciBufMemSync 0x02 + +/** + * When \p flags of ::cudaDeviceGetNvSciSyncAttributes is set to this, + * it indicates that the application needs signaler specific NvSciSyncAttr + * to be filled by ::cudaDeviceGetNvSciSyncAttributes. + */ +#define cudaNvSciSyncAttrSignal 0x1 + +/** + * When \p flags of ::cudaDeviceGetNvSciSyncAttributes is set to this, + * it indicates that the application needs waiter specific NvSciSyncAttr + * to be filled by ::cudaDeviceGetNvSciSyncAttributes. + */ +#define cudaNvSciSyncAttrWait 0x2 + +/** + * External memory handle descriptor + */ +struct __device_builtin__ cudaExternalMemoryHandleDesc { + /** + * Type of the handle + */ + enum cudaExternalMemoryHandleType type; + union { + /** + * File descriptor referencing the memory object. Valid + * when type is + * ::cudaExternalMemoryHandleTypeOpaqueFd + */ + int fd; + /** + * Win32 handle referencing the memory object. Valid when + * type is one of the following: + * - ::cudaExternalMemoryHandleTypeOpaqueWin32 + * - ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt + * - ::cudaExternalMemoryHandleTypeD3D12Heap + * - ::cudaExternalMemoryHandleTypeD3D12Resource + * - ::cudaExternalMemoryHandleTypeD3D11Resource + * - ::cudaExternalMemoryHandleTypeD3D11ResourceKmt + * Exactly one of 'handle' and 'name' must be non-NULL. 
If + * type is one of the following: + * ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt + * ::cudaExternalMemoryHandleTypeD3D11ResourceKmt + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid memory object. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + /** + * A handle representing NvSciBuf Object. Valid when type + * is ::cudaExternalMemoryHandleTypeNvSciBuf + */ + const void *nvSciBufObject; + } handle; + /** + * Size of the memory allocation + */ + unsigned long long size; + /** + * Flags must either be zero or ::cudaExternalMemoryDedicated + */ + unsigned int flags; +}; + +/** + * External memory buffer descriptor + */ +struct __device_builtin__ cudaExternalMemoryBufferDesc { + /** + * Offset into the memory object where the buffer's base is + */ + unsigned long long offset; + /** + * Size of the buffer + */ + unsigned long long size; + /** + * Flags reserved for future use. Must be zero. + */ + unsigned int flags; +}; + +/** + * External memory mipmap descriptor + */ +struct __device_builtin__ cudaExternalMemoryMipmappedArrayDesc { + /** + * Offset into the memory object where the base level of the + * mipmap chain is. + */ + unsigned long long offset; + /** + * Format of base level of the mipmap chain + */ + struct cudaChannelFormatDesc formatDesc; + /** + * Dimensions of base level of the mipmap chain + */ + struct cudaExtent extent; + /** + * Flags associated with CUDA mipmapped arrays. 
+ * See ::cudaMallocMipmappedArray + */ + unsigned int flags; + /** + * Total number of levels in the mipmap chain + */ + unsigned int numLevels; +}; + +/** + * External semaphore handle types + */ +enum __device_builtin__ cudaExternalSemaphoreHandleType { + /** + * Handle is an opaque file descriptor + */ + cudaExternalSemaphoreHandleTypeOpaqueFd = 1, + /** + * Handle is an opaque shared NT handle + */ + cudaExternalSemaphoreHandleTypeOpaqueWin32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt = 3, + /** + * Handle is a shared NT handle referencing a D3D12 fence object + */ + cudaExternalSemaphoreHandleTypeD3D12Fence = 4, + /** + * Handle is a shared NT handle referencing a D3D11 fence object + */ + cudaExternalSemaphoreHandleTypeD3D11Fence = 5, + /** + * Opaque handle to NvSciSync Object + */ + cudaExternalSemaphoreHandleTypeNvSciSync = 6, + /** + * Handle is a shared NT handle referencing a D3D11 keyed mutex object + */ + cudaExternalSemaphoreHandleTypeKeyedMutex = 7, + /** + * Handle is a shared KMT handle referencing a D3D11 keyed mutex object + */ + cudaExternalSemaphoreHandleTypeKeyedMutexKmt = 8, + /** + * Handle is an opaque file descriptor referencing a timeline semaphore + */ + cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd = 9, + /** + * Handle is an opaque shared NT handle referencing a timeline semaphore + */ + cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32 = 10 +}; + +/** + * External semaphore handle descriptor + */ +struct __device_builtin__ cudaExternalSemaphoreHandleDesc { + /** + * Type of the handle + */ + enum cudaExternalSemaphoreHandleType type; + union { + /** + * File descriptor referencing the semaphore object. Valid when + * type is one of the following: + * - ::cudaExternalSemaphoreHandleTypeOpaqueFd + * - ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd + */ + int fd; + /** + * Win32 handle referencing the semaphore object. 
Valid when + * type is one of the following: + * - ::cudaExternalSemaphoreHandleTypeOpaqueWin32 + * - ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt + * - ::cudaExternalSemaphoreHandleTypeD3D12Fence + * - ::cudaExternalSemaphoreHandleTypeD3D11Fence + * - ::cudaExternalSemaphoreHandleTypeKeyedMutex + * - ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32 + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is one of the following: + * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt + * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid synchronization primitive. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + /** + * Valid NvSciSyncObj. Must be non NULL + */ + const void* nvSciSyncObj; + } handle; + /** + * Flags reserved for the future. Must be zero. + */ + unsigned int flags; +}; + +/** + * External semaphore signal parameters(deprecated) + */ +struct __device_builtin__ cudaExternalSemaphoreSignalParams_v1 { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be signaled + */ + unsigned long long value; + } fence; + union { + /** + * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType + * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync. 
+ */ + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /* + * Value of key to release the mutex with + */ + unsigned long long key; + } keyedMutex; + } params; + /** + * Only when ::cudaExternalSemaphoreSignalParams is used to + * signal a ::cudaExternalSemaphore_t of type + * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is + * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates + * that while signaling the ::cudaExternalSemaphore_t, no memory + * synchronization operations should be performed for any external memory + * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf. + * For all other types of ::cudaExternalSemaphore_t, flags must be zero. + */ + unsigned int flags; +}; + +/** +* External semaphore wait parameters(deprecated) +*/ +struct __device_builtin__ cudaExternalSemaphoreWaitParams_v1 { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be waited on + */ + unsigned long long value; + } fence; + union { + /** + * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType + * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync. 
+ */ + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /** + * Value of key to acquire the mutex with + */ + unsigned long long key; + /** + * Timeout in milliseconds to wait to acquire the mutex + */ + unsigned int timeoutMs; + } keyedMutex; + } params; + /** + * Only when ::cudaExternalSemaphoreWaitParams is used to + * wait on a ::cudaExternalSemaphore_t of type + * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is + * ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync: which indicates + * that while waiting for the ::cudaExternalSemaphore_t, no memory + * synchronization operations should be performed for any external memory + * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf. + * For all other types of ::cudaExternalSemaphore_t, flags must be zero. + */ + unsigned int flags; +}; + +/** + * External semaphore signal parameters, compatible with driver type + */ +struct __device_builtin__ cudaExternalSemaphoreSignalParams{ + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be signaled + */ + unsigned long long value; + } fence; + union { + /** + * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType + * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync. 
+ */ + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /* + * Value of key to release the mutex with + */ + unsigned long long key; + } keyedMutex; + unsigned int reserved[12]; + } params; + /** + * Only when ::cudaExternalSemaphoreSignalParams is used to + * signal a ::cudaExternalSemaphore_t of type + * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is + * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates + * that while signaling the ::cudaExternalSemaphore_t, no memory + * synchronization operations should be performed for any external memory + * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf. + * For all other types of ::cudaExternalSemaphore_t, flags must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +}; + +/** + * External semaphore wait parameters, compatible with driver type + */ +struct __device_builtin__ cudaExternalSemaphoreWaitParams { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be waited on + */ + unsigned long long value; + } fence; + union { + /** + * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType + * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync. 
+ */ + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /** + * Value of key to acquire the mutex with + */ + unsigned long long key; + /** + * Timeout in milliseconds to wait to acquire the mutex + */ + unsigned int timeoutMs; + } keyedMutex; + unsigned int reserved[10]; + } params; + /** + * Only when ::cudaExternalSemaphoreWaitParams is used to + * wait on a ::cudaExternalSemaphore_t of type + * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is + * ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync: which indicates + * that while waiting for the ::cudaExternalSemaphore_t, no memory + * synchronization operations should be performed for any external memory + * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf. + * For all other types of ::cudaExternalSemaphore_t, flags must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +}; + +/******************************************************************************* +* * +* SHORTHAND TYPE DEFINITION USED BY RUNTIME API * +* * +*******************************************************************************/ + +/** + * CUDA Error types + */ +typedef __device_builtin__ enum cudaError cudaError_t; + +/** + * CUDA stream + */ +typedef __device_builtin__ struct CUstream_st *cudaStream_t; + +/** + * CUDA event types + */ +typedef __device_builtin__ struct CUevent_st *cudaEvent_t; + +/** + * CUDA graphics resource types + */ +typedef __device_builtin__ struct cudaGraphicsResource *cudaGraphicsResource_t; + +/** + * CUDA output file modes + */ +typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t; + +/** + * CUDA external memory + */ +typedef __device_builtin__ struct CUexternalMemory_st *cudaExternalMemory_t; + +/** + * CUDA external semaphore + */ +typedef __device_builtin__ struct CUexternalSemaphore_st *cudaExternalSemaphore_t; + +/** + * CUDA graph + */ +typedef __device_builtin__ struct CUgraph_st 
*cudaGraph_t; + +/** + * CUDA graph node. + */ +typedef __device_builtin__ struct CUgraphNode_st *cudaGraphNode_t; + +/** + * CUDA user object for graphs + */ +typedef __device_builtin__ struct CUuserObject_st *cudaUserObject_t; + +/** + * CUDA function + */ +typedef __device_builtin__ struct CUfunc_st *cudaFunction_t; + +/** + * CUDA memory pool + */ +typedef __device_builtin__ struct CUmemPoolHandle_st *cudaMemPool_t; + +/** + * CUDA cooperative group scope + */ +enum __device_builtin__ cudaCGScope { + cudaCGScopeInvalid = 0, /**< Invalid cooperative group scope */ + cudaCGScopeGrid = 1, /**< Scope represented by a grid_group */ + cudaCGScopeMultiGrid = 2 /**< Scope represented by a multi_grid_group */ +}; + +/** + * CUDA launch parameters + */ +struct __device_builtin__ cudaLaunchParams +{ + void *func; /**< Device function symbol */ + dim3 gridDim; /**< Grid dimensions */ + dim3 blockDim; /**< Block dimensions */ + void **args; /**< Arguments */ + size_t sharedMem; /**< Shared memory */ + cudaStream_t stream; /**< Stream identifier */ +}; + +/** + * CUDA GPU kernel node parameters + */ +struct __device_builtin__ cudaKernelNodeParams { + void* func; /**< Kernel to launch */ + dim3 gridDim; /**< Grid dimensions */ + dim3 blockDim; /**< Block dimensions */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + void **kernelParams; /**< Array of pointers to individual kernel arguments*/ + void **extra; /**< Pointer to kernel arguments in the "extra" format */ +}; + +/** + * External semaphore signal node parameters + */ +struct __device_builtin__ cudaExternalSemaphoreSignalNodeParams { + cudaExternalSemaphore_t* extSemArray; /**< Array of external semaphore handles. */ + const struct cudaExternalSemaphoreSignalParams* paramsArray; /**< Array of external semaphore signal parameters. */ + unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. 
*/ +}; + +/** + * External semaphore wait node parameters + */ +struct __device_builtin__ cudaExternalSemaphoreWaitNodeParams { + cudaExternalSemaphore_t* extSemArray; /**< Array of external semaphore handles. */ + const struct cudaExternalSemaphoreWaitParams* paramsArray; /**< Array of external semaphore wait parameters. */ + unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ +}; + +/** +* CUDA Graph node types +*/ +enum __device_builtin__ cudaGraphNodeType { + cudaGraphNodeTypeKernel = 0x00, /**< GPU kernel node */ + cudaGraphNodeTypeMemcpy = 0x01, /**< Memcpy node */ + cudaGraphNodeTypeMemset = 0x02, /**< Memset node */ + cudaGraphNodeTypeHost = 0x03, /**< Host (executable) node */ + cudaGraphNodeTypeGraph = 0x04, /**< Node which executes an embedded graph */ + cudaGraphNodeTypeEmpty = 0x05, /**< Empty (no-op) node */ + cudaGraphNodeTypeWaitEvent = 0x06, /**< External event wait node */ + cudaGraphNodeTypeEventRecord = 0x07, /**< External event record node */ + cudaGraphNodeTypeExtSemaphoreSignal = 0x08, /**< External semaphore signal node */ + cudaGraphNodeTypeExtSemaphoreWait = 0x09, /**< External semaphore wait node */ + cudaGraphNodeTypeMemAlloc = 0x0a, /**< Memory allocation node */ + cudaGraphNodeTypeMemFree = 0x0b, /**< Memory free node */ + cudaGraphNodeTypeCount +}; + +/** + * CUDA executable (launchable) graph + */ +typedef struct CUgraphExec_st* cudaGraphExec_t; + +/** +* CUDA Graph Update error types +*/ +enum __device_builtin__ cudaGraphExecUpdateResult { + cudaGraphExecUpdateSuccess = 0x0, /**< The update succeeded */ + cudaGraphExecUpdateError = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */ + cudaGraphExecUpdateErrorTopologyChanged = 0x2, /**< The update failed because the topology changed */ + cudaGraphExecUpdateErrorNodeTypeChanged = 0x3, /**< The update failed because a node type changed */ + 
cudaGraphExecUpdateErrorFunctionChanged = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */ + cudaGraphExecUpdateErrorParametersChanged = 0x5, /**< The update failed because the parameters changed in a way that is not supported */ + cudaGraphExecUpdateErrorNotSupported = 0x6, /**< The update failed because something about the node is not supported */ + cudaGraphExecUpdateErrorUnsupportedFunctionChange = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */ + cudaGraphExecUpdateErrorAttributesChanged = 0x8 /**< The update failed because the node attributes changed in a way that is not supported */ +}; + +/** + * Flags to specify search options to be used with ::cudaGetDriverEntryPoint + * For more details see ::cuGetProcAddress + */ +enum __device_builtin__ cudaGetDriverEntryPointFlags { + cudaEnableDefault = 0x0, /**< Default search mode for driver symbols. */ + cudaEnableLegacyStream = 0x1, /**< Search for legacy versions of driver symbols. */ + cudaEnablePerThreadDefaultStream = 0x2 /**< Search for per-thread versions of driver symbols. 
*/ +}; + +/** + * CUDA Graph debug write options + */ +enum __device_builtin__ cudaGraphDebugDotFlags { + cudaGraphDebugDotFlagsVerbose = 1<<0, /**< Output all debug data as if every debug flag is enabled */ + cudaGraphDebugDotFlagsKernelNodeParams = 1<<2, /**< Adds cudaKernelNodeParams to output */ + cudaGraphDebugDotFlagsMemcpyNodeParams = 1<<3, /**< Adds cudaMemcpy3DParms to output */ + cudaGraphDebugDotFlagsMemsetNodeParams = 1<<4, /**< Adds cudaMemsetParams to output */ + cudaGraphDebugDotFlagsHostNodeParams = 1<<5, /**< Adds cudaHostNodeParams to output */ + cudaGraphDebugDotFlagsEventNodeParams = 1<<6, /**< Adds cudaEvent_t handle from record and wait nodes to output */ + cudaGraphDebugDotFlagsExtSemasSignalNodeParams = 1<<7, /**< Adds cudaExternalSemaphoreSignalNodeParams values to output */ + cudaGraphDebugDotFlagsExtSemasWaitNodeParams = 1<<8, /**< Adds cudaExternalSemaphoreWaitNodeParams to output */ + cudaGraphDebugDotFlagsKernelNodeAttributes = 1<<9, /**< Adds cudaKernelNodeAttrID values to output */ + cudaGraphDebugDotFlagsHandles = 1<<10 /**< Adds node handles and every kernel function handle to output */ +}; + +/** + * Flags for instantiating a graph + */ +enum __device_builtin__ cudaGraphInstantiateFlags { + cudaGraphInstantiateFlagAutoFreeOnLaunch = 1 /**< Automatically free memory allocated in a graph before relaunching. */ + , cudaGraphInstantiateFlagUseNodePriority = 8 /**< Run the graph using the per-node priority attributes rather than the + priority of the stream it is launched into. */ +}; + +/** + * Launch attributes enum; used as id field of ::cudaLaunchAttribute + */ +typedef __device_builtin__ enum cudaLaunchAttributeID { + cudaLaunchAttributeIgnore = 0 /**< Ignored entry, for convenient composition */ + , cudaLaunchAttributeAccessPolicyWindow = 1 /**< Valid for streams, graph nodes, launches. */ + , cudaLaunchAttributeCooperative = 2 /**< Valid for graph nodes, launches. 
*/ + , cudaLaunchAttributeSynchronizationPolicy = 3 /**< Valid for streams. */ + , cudaLaunchAttributeClusterDimension = 4 /**< Valid for graph nodes, launches. */ + , cudaLaunchAttributeClusterSchedulingPolicyPreference = 5 /**< Valid for graph nodes, launches. */ + , cudaLaunchAttributeProgrammaticStreamSerialization = 6 /**< Valid for launches. Setting + programmaticStreamSerializationAllowed to non-0 + signals that the kernel will use programmatic + means to resolve its stream dependency, so that + the CUDA runtime should opportunistically allow + the grid's execution to overlap with the previous + kernel in the stream, if that kernel requests the + overlap. */ + , cudaLaunchAttributeProgrammaticEvent = 7 /**< Valid for launches. Event recorded through this launch + attribute is guaranteed to only trigger after all + block in the associated kernel trigger the event. A + block can trigger the event through PTX + griddepcontrol.launch_dependents. A trigger can also + be inserted at the beginning of each block's execution + if triggerAtBlockStart is set to non-0. Note that + dependents (including the CPU thread calling + cudaEventSynchronize()) are not guaranteed to observe + the release precisely when it is released. For + example, cudaEventSynchronize() may only observe the + event trigger long after the associated kernel has + completed. This recording type is primarily meant for + establishing programmatic dependency between device + tasks. The event supplied must not be an interprocess + or interop event. The event must disable timing + (i.e. created with ::cudaEventDisableTiming flag + set). */ + , cudaLaunchAttributePriority = 8 /**< Valid for graph nodes. 
*/ +} cudaLaunchAttributeID; + +/** + * Launch attributes union; used as value field of ::cudaLaunchAttribute + */ +typedef __device_builtin__ union cudaLaunchAttributeValue { + char pad[64]; /* Pad to 64 bytes */ + struct cudaAccessPolicyWindow accessPolicyWindow; + int cooperative; + enum cudaSynchronizationPolicy syncPolicy; + struct { + unsigned int x; + unsigned int y; + unsigned int z; + } clusterDim; + enum cudaClusterSchedulingPolicy clusterSchedulingPolicyPreference; + int programmaticStreamSerializationAllowed; + struct { + cudaEvent_t event; + int flags; + int triggerAtBlockStart; + } programmaticEvent; + int priority; +} cudaLaunchAttributeValue; + +/** + * Launch attribute + */ +typedef __device_builtin__ struct cudaLaunchAttribute_st { + cudaLaunchAttributeID id; + char pad[8 - sizeof(cudaLaunchAttributeID)]; + cudaLaunchAttributeValue val; +} cudaLaunchAttribute; + +/** + * CUDA extensible launch configuration + */ +typedef __device_builtin__ struct cudaLaunchConfig_st { + dim3 gridDim; /**< Grid dimentions */ + dim3 blockDim; /**< Block dimentions */ + size_t dynamicSmemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + cudaStream_t stream; /**< Stream identifier */ + cudaLaunchAttribute *attrs; /**< nullable if numAttrs == 0 */ + unsigned int numAttrs; /**< Number of attributes populated in attrs */ +} cudaLaunchConfig_t; + +/** + * Stream Attributes + */ +#define cudaStreamAttrID cudaLaunchAttributeID +#define cudaStreamAttributeAccessPolicyWindow cudaLaunchAttributeAccessPolicyWindow +#define cudaStreamAttributeSynchronizationPolicy cudaLaunchAttributeSynchronizationPolicy + +/** + * Stream attributes union used with ::cudaStreamSetAttribute/::cudaStreamGetAttribute + */ +#define cudaStreamAttrValue cudaLaunchAttributeValue + +/** + * Graph kernel node Attributes + */ +#define cudaKernelNodeAttrID cudaLaunchAttributeID +#define cudaKernelNodeAttributeAccessPolicyWindow cudaLaunchAttributeAccessPolicyWindow +#define 
cudaKernelNodeAttributeCooperative cudaLaunchAttributeCooperative +#define cudaKernelNodeAttributePriority cudaLaunchAttributePriority +#define cudaKernelNodeAttributeClusterDimension cudaLaunchAttributeClusterDimension +#define cudaKernelNodeAttributeClusterSchedulingPolicyPreference cudaLaunchAttributeClusterSchedulingPolicyPreference + +/** + * Graph kernel node attributes union, used with ::cudaGraphKernelNodeSetAttribute/::cudaGraphKernelNodeGetAttribute + */ +#define cudaKernelNodeAttrValue cudaLaunchAttributeValue + +/** @} */ +/** @} */ /* END CUDART_TYPES */ + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__ +#endif + +#undef __CUDA_DEPRECATED + +#endif /* !__DRIVER_TYPES_H__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_config.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_config.h new file mode 100644 index 0000000000000000000000000000000000000000..785bec4e5c0652f9605ccf9341b7f761a85471ab --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_config.h @@ -0,0 +1,65 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#if defined(_MSC_VER) +#pragma message("host_config.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.") +#else +#warning "host_config.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead." +#endif +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__ +#endif + +#include "crt/host_config.h" + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__ +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/library_types.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/library_types.h new file mode 100644 index 0000000000000000000000000000000000000000..4a7e42c6b89ba4b446d4cf3d52c8bacd74e73b0d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/library_types.h @@ -0,0 +1,103 @@ +/* + * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__LIBRARY_TYPES_H__) +#define __LIBRARY_TYPES_H__ + + + +typedef enum cudaDataType_t +{ + CUDA_R_16F = 2, /* real as a half */ + CUDA_C_16F = 6, /* complex as a pair of half numbers */ + CUDA_R_16BF = 14, /* real as a nv_bfloat16 */ + CUDA_C_16BF = 15, /* complex as a pair of nv_bfloat16 numbers */ + CUDA_R_32F = 0, /* real as a float */ + CUDA_C_32F = 4, /* complex as a pair of float numbers */ + CUDA_R_64F = 1, /* real as a double */ + CUDA_C_64F = 5, /* complex as a pair of double numbers */ + CUDA_R_4I = 16, /* real as a signed 4-bit int */ + CUDA_C_4I = 17, /* complex as a pair of signed 4-bit int numbers */ + CUDA_R_4U = 18, /* real as a unsigned 4-bit int */ + CUDA_C_4U = 19, /* complex as a pair of unsigned 4-bit int numbers */ + CUDA_R_8I = 3, /* real as a signed 8-bit int */ + CUDA_C_8I = 7, /* complex as a pair of signed 8-bit int numbers */ + CUDA_R_8U = 8, /* real as a unsigned 8-bit int */ + CUDA_C_8U = 9, /* complex as a pair of unsigned 8-bit int numbers */ + CUDA_R_16I = 20, /* real as a signed 16-bit int */ + CUDA_C_16I = 21, /* complex as a pair of signed 16-bit int numbers */ + CUDA_R_16U = 22, /* real as a unsigned 16-bit int */ + CUDA_C_16U = 23, /* complex as a pair of unsigned 16-bit int numbers */ + CUDA_R_32I = 10, /* real as a signed 32-bit int */ + CUDA_C_32I = 11, /* complex as a pair of signed 32-bit int numbers */ + CUDA_R_32U = 12, /* real as a unsigned 32-bit int */ + CUDA_C_32U = 13, /* complex as a pair of unsigned 32-bit int numbers */ + CUDA_R_64I = 24, /* real as a signed 64-bit int */ + CUDA_C_64I = 25, /* complex as a pair of signed 64-bit int numbers */ + CUDA_R_64U = 26, /* real as a unsigned 64-bit int */ + CUDA_C_64U = 27, /* complex as a pair of unsigned 64-bit int numbers */ + CUDA_R_8F_E4M3 = 28, /* real as a nv_fp8_e4m3 */ + CUDA_R_8F_E5M2 = 29, /* real as a nv_fp8_e5m2 */ +} cudaDataType; + + +typedef enum libraryPropertyType_t +{ + MAJOR_VERSION, + MINOR_VERSION, + PATCH_LEVEL +} 
libraryPropertyType; + + +#ifndef __cplusplus +typedef enum cudaDataType_t cudaDataType_t; +typedef enum libraryPropertyType_t libraryPropertyType_t; +#endif + +#endif /* !__LIBRARY_TYPES_H__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..bc806976784e494edc905d8b8bd9ad138054bbea --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_functions.h @@ -0,0 +1,65 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#if defined(_MSC_VER) +#pragma message("math_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.") +#else +#warning "math_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead." 
+#endif +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__ +#endif + +#include "crt/math_functions.h" + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__ +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_32_atomic_functions.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_32_atomic_functions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ebe60b8ca83666f07464b06ff04e6fc432c31b7b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_32_atomic_functions.hpp @@ -0,0 +1,134 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.35.235 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__SM_32_ATOMIC_FUNCTIONS_HPP__) +#define __SM_32_ATOMIC_FUNCTIONS_HPP__ + +#if defined(__CUDACC_RTC__) +#define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val) +{ + return __illAtomicMin(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val) +{ + return __illAtomicMax(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val) +{ + return __llAtomicAnd(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val) +{ + return __llAtomicOr(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val) +{ + return __llAtomicXor(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicMin(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicMax(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicAnd(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ 
unsigned long long atomicOr(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicOr(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicXor(address, val); +} + +#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */ + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __SM_32_ATOMIC_FUNCTIONS_DECL__ + +#endif /* !__SM_32_ATOMIC_FUNCTIONS_HPP__ */ + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_60_atomic_functions.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_60_atomic_functions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b4d5227023221116868e8446fdac23efb96e94ae --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_60_atomic_functions.hpp @@ -0,0 +1,527 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. 
+ * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__SM_60_ATOMIC_FUNCTIONS_HPP__) +#define __SM_60_ATOMIC_FUNCTIONS_HPP__ + +#if defined(__CUDACC_RTC__) +#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* __CUDACC_RTC__ */ +#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val) +{ + return __dAtomicAdd(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicAdd_block(int *address, int val) +{ + return __iAtomicAdd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicAdd_system(int *address, int val) +{ + return __iAtomicAdd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicAdd_block(unsigned int *address, unsigned int val) +{ + return __uAtomicAdd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicAdd_system(unsigned int *address, unsigned int val) +{ + return __uAtomicAdd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicAdd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicAdd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +float atomicAdd_block(float *address, float val) +{ + return __fAtomicAdd_block(address, val); +} + 
+__SM_60_ATOMIC_FUNCTIONS_DECL__ +float atomicAdd_system(float *address, float val) +{ + return __fAtomicAdd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +double atomicAdd_block(double *address, double val) +{ + return __dAtomicAdd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +double atomicAdd_system(double *address, double val) +{ + return __dAtomicAdd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicSub_block(int *address, int val) +{ + return __iAtomicAdd_block(address, (unsigned int)-(int)val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicSub_system(int *address, int val) +{ + return __iAtomicAdd_system(address, (unsigned int)-(int)val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicSub_block(unsigned int *address, unsigned int val) +{ + return __uAtomicAdd_block(address, (unsigned int)-(int)val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicSub_system(unsigned int *address, unsigned int val) +{ + return __uAtomicAdd_system(address, (unsigned int)-(int)val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicExch_block(int *address, int val) +{ + return __iAtomicExch_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicExch_system(int *address, int val) +{ + return __iAtomicExch_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicExch_block(unsigned int *address, unsigned int val) +{ + return __uAtomicExch_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicExch_system(unsigned int *address, unsigned int val) +{ + return __uAtomicExch_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicExch_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicExch_system(address, val); +} + 
+__SM_60_ATOMIC_FUNCTIONS_DECL__ +float atomicExch_block(float *address, float val) +{ + return __fAtomicExch_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +float atomicExch_system(float *address, float val) +{ + return __fAtomicExch_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicMin_block(int *address, int val) +{ + return __iAtomicMin_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicMin_system(int *address, int val) +{ + return __iAtomicMin_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicMin_block(long long *address, long long val) +{ + return __illAtomicMin_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicMin_system(long long *address, long long val) +{ + return __illAtomicMin_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicMin_block(unsigned int *address, unsigned int val) +{ + return __uAtomicMin_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicMin_system(unsigned int *address, unsigned int val) +{ + return __uAtomicMin_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicMin_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicMin_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicMax_block(int *address, int val) +{ + return __iAtomicMax_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicMax_system(int *address, int val) +{ + return __iAtomicMax_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicMax_block(long long *address, long long val) +{ + return __illAtomicMax_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicMax_system(long long *address, long long val) +{ 
+ return __illAtomicMax_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicMax_block(unsigned int *address, unsigned int val) +{ + return __uAtomicMax_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicMax_system(unsigned int *address, unsigned int val) +{ + return __uAtomicMax_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicMax_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicMax_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicInc_block(unsigned int *address, unsigned int val) +{ + return __uAtomicInc_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicInc_system(unsigned int *address, unsigned int val) +{ + return __uAtomicInc_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicDec_block(unsigned int *address, unsigned int val) +{ + return __uAtomicDec_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicDec_system(unsigned int *address, unsigned int val) +{ + return __uAtomicDec_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicCAS_block(int *address, int compare, int val) +{ + return __iAtomicCAS_block(address, compare, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicCAS_system(int *address, int compare, int val) +{ + return __iAtomicCAS_system(address, compare, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicCAS_block(unsigned int *address, unsigned int compare, + unsigned int val) +{ + return __uAtomicCAS_block(address, compare, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicCAS_system(unsigned int *address, unsigned int compare, + unsigned int val) +{ + return __uAtomicCAS_system(address, 
compare, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long int atomicCAS_block(unsigned long long int *address, + unsigned long long int compare, + unsigned long long int val) +{ + return __ullAtomicCAS_block(address, compare, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long int atomicCAS_system(unsigned long long int *address, + unsigned long long int compare, + unsigned long long int val) +{ + return __ullAtomicCAS_system(address, compare, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicAnd_block(int *address, int val) +{ + return __iAtomicAnd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicAnd_system(int *address, int val) +{ + return __iAtomicAnd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicAnd_block(long long *address, long long val) +{ + return __llAtomicAnd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicAnd_system(long long *address, long long val) +{ + return __llAtomicAnd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicAnd_block(unsigned int *address, unsigned int val) +{ + return __uAtomicAnd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicAnd_system(unsigned int *address, unsigned int val) +{ + return __uAtomicAnd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicAnd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicAnd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicOr_block(int *address, int val) +{ + return __iAtomicOr_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicOr_system(int *address, int val) +{ + return __iAtomicOr_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long 
long atomicOr_block(long long *address, long long val) +{ + return __llAtomicOr_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicOr_system(long long *address, long long val) +{ + return __llAtomicOr_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicOr_block(unsigned int *address, unsigned int val) +{ + return __uAtomicOr_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicOr_system(unsigned int *address, unsigned int val) +{ + return __uAtomicOr_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicOr_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicOr_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicXor_block(int *address, int val) +{ + return __iAtomicXor_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicXor_system(int *address, int val) +{ + return __iAtomicXor_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicXor_block(long long *address, long long val) +{ + return __llAtomicXor_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicXor_system(long long *address, long long val) +{ + return __llAtomicXor_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicXor_block(unsigned int *address, unsigned int val) +{ + return __uAtomicXor_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicXor_system(unsigned int *address, unsigned int val) +{ + return __uAtomicXor_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicXor_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long 
long atomicXor_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicXor_system(address, val); +} + +#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */ + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __SM_60_ATOMIC_FUNCTIONS_DECL__ + +#endif /* !__SM_60_ATOMIC_FUNCTIONS_HPP__ */ + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..587a995d0ea8a697b028706e824f9437276401e6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_functions.h @@ -0,0 +1,439 @@ +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__SURFACE_FUNCTIONS_H__) +#define __SURFACE_FUNCTIONS_H__ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" +#include "cuda_surface_types.h" + +#if defined(_WIN32) +# define __DEPRECATED__ __declspec(deprecated) +#else +# define __DEPRECATED__ __attribute__((deprecated)) +#endif + + + +#ifdef __CUDA_ARCH__ +template struct __nv_surf_trait { typedef void * cast_type; }; + +template<> struct __nv_surf_trait { typedef char * cast_type; }; +template<> struct __nv_surf_trait { typedef signed char * cast_type; }; +template<> struct __nv_surf_trait { typedef unsigned char * cast_type; }; +template<> struct __nv_surf_trait { typedef char1 * cast_type; }; +template<> struct __nv_surf_trait { typedef uchar1 * cast_type; }; +template<> struct __nv_surf_trait { typedef char2 * cast_type; }; +template<> struct __nv_surf_trait { typedef uchar2 * cast_type; }; +template<> struct __nv_surf_trait { typedef char4 * cast_type; }; +template<> struct __nv_surf_trait { typedef uchar4 * cast_type; }; +template<> struct __nv_surf_trait { typedef short * cast_type; }; +template<> struct __nv_surf_trait { typedef unsigned short * cast_type; }; +template<> struct __nv_surf_trait { typedef short1 * cast_type; }; +template<> struct __nv_surf_trait { typedef ushort1 * cast_type; }; +template<> struct __nv_surf_trait { typedef short2 * cast_type; }; +template<> struct __nv_surf_trait { typedef ushort2 * cast_type; }; +template<> struct __nv_surf_trait { typedef short4 * cast_type; }; +template<> struct __nv_surf_trait { typedef ushort4 * cast_type; }; +template<> struct __nv_surf_trait { typedef int * cast_type; }; +template<> struct __nv_surf_trait { typedef unsigned int * cast_type; }; +template<> struct __nv_surf_trait { typedef int1 * cast_type; }; +template<> 
struct __nv_surf_trait { typedef uint1 * cast_type; }; +template<> struct __nv_surf_trait { typedef int2 * cast_type; }; +template<> struct __nv_surf_trait { typedef uint2 * cast_type; }; +template<> struct __nv_surf_trait { typedef int4 * cast_type; }; +template<> struct __nv_surf_trait { typedef uint4 * cast_type; }; +template<> struct __nv_surf_trait { typedef long long * cast_type; }; +template<> struct __nv_surf_trait { typedef unsigned long long * cast_type; }; +template<> struct __nv_surf_trait { typedef longlong1 * cast_type; }; +template<> struct __nv_surf_trait { typedef ulonglong1 * cast_type; }; +template<> struct __nv_surf_trait { typedef longlong2 * cast_type; }; +template<> struct __nv_surf_trait { typedef ulonglong2 * cast_type; }; +#if !defined(__LP64__) +template<> struct __nv_surf_trait { typedef int * cast_type; }; +template<> struct __nv_surf_trait { typedef unsigned int * cast_type; }; +template<> struct __nv_surf_trait { typedef int1 * cast_type; }; +template<> struct __nv_surf_trait { typedef uint1 * cast_type; }; +template<> struct __nv_surf_trait { typedef int2 * cast_type; }; +template<> struct __nv_surf_trait { typedef uint2 * cast_type; }; +template<> struct __nv_surf_trait { typedef uint4 * cast_type; }; +template<> struct __nv_surf_trait { typedef int4 * cast_type; }; +#endif +template<> struct __nv_surf_trait { typedef float * cast_type; }; +template<> struct __nv_surf_trait { typedef float1 * cast_type; }; +template<> struct __nv_surf_trait { typedef float2 * cast_type; }; +template<> struct __nv_surf_trait { typedef float4 * cast_type; }; +#endif /* defined(__CUDA_ARCH__) */ + +template +static __DEPRECATED__ __device__ __forceinline__ void surf1Dread(T *res, surface surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf1Dread_v2", (void *)res, s, surf, x, mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ T 
surf1Dread(surface surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + __nv_tex_surf_handler("__surf1Dread_v2", (typename __nv_surf_trait::cast_type)&temp, (int)sizeof(T), surf, x, mode); + return temp; +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ void surf1Dread(T *res, surface surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surf1Dread(surf, x, mode); +#endif /* __CUDA_ARCH__ */ +} + + +template +static __DEPRECATED__ __device__ __forceinline__ void surf2Dread(T *res, surface surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf2Dread_v2", (void *)res, s, surf, x, y, mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ T surf2Dread(surface surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + __nv_tex_surf_handler("__surf2Dread_v2", (typename __nv_surf_trait::cast_type)&temp, (int)sizeof(T), surf, x, y, mode); + return temp; +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ void surf2Dread(T *res, surface surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surf2Dread(surf, x, y, mode); +#endif /* __CUDA_ARCH__ */ +} + + +template +static __DEPRECATED__ __device__ __forceinline__ void surf3Dread(T *res, surface surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf3Dread_v2", (void *)res, s, surf, x, y, z, mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ T surf3Dread(surface surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + __nv_tex_surf_handler("__surf3Dread_v2", (typename 
__nv_surf_trait::cast_type)&temp, (int)sizeof(T), surf, x, y, z, mode); + return temp; +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ void surf3Dread(T *res, surface surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surf3Dread(surf, x, y, z, mode); +#endif /* __CUDA_ARCH__ */ +} + + + +template +static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredread(T *res, surface surf, int x, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf1DLayeredread_v2", (void *)res, s, surf, x, layer, mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ T surf1DLayeredread(surface surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + __nv_tex_surf_handler("__surf1DLayeredread_v2", (typename __nv_surf_trait::cast_type)&temp, (int)sizeof(T), surf, x, layer, mode); + return temp; +#endif +} + + +template +static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredread(T *res, surface surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surf1DLayeredread(surf, x, layer, mode); +#endif /* __CUDA_ARCH__ */ +} + + +template +static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredread(T *res, surface surf, int x, int y, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf2DLayeredread_v2", (void *)res, s, surf, x, y, layer, mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ T surf2DLayeredread(surface surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + __nv_tex_surf_handler("__surf2DLayeredread_v2", (typename __nv_surf_trait::cast_type)&temp, 
(int)sizeof(T), surf, x, y, layer, mode); + return temp; +#endif +} + + +template +static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredread(T *res, surface surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surf2DLayeredread(surf, x, y, layer, mode); +#endif /* __CUDA_ARCH__ */ +} + + +template +static __device__ __forceinline__ void surfCubemapread(T *res, surface surf, int x, int y, int face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surfCubemapread_v2", (void *)res, s, surf, x, y, face, mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ T surfCubemapread(surface surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + + __nv_tex_surf_handler("__surfCubemapread_v2", (typename __nv_surf_trait::cast_type)&temp, (int)sizeof(T), surf, x, y, face, mode); + return temp; +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapread(T *res, surface surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surfCubemapread(surf, x, y, face, mode); +#endif /* __CUDA_ARCH__ */ +} + + +template +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredread(T *res, surface surf, int x, int y, int layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surfCubemapLayeredread_v2", (void *)res, s, surf, x, y, layerFace, mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ T surfCubemapLayeredread(surface surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + __nv_tex_surf_handler("__surfCubemapLayeredread_v2", (typename 
__nv_surf_trait::cast_type)&temp, (int)sizeof(T), surf, x, y, layerFace, mode); + return temp; +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredread(T *res, surface surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surfCubemapLayeredread(surf, x, y, layerFace, mode); +#endif /* __CUDA_ARCH__ */ +} + +//surf1Dwrite +template +static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf1Dwrite_v2", (void *)&val, s, surf, x, mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf1Dwrite_v2", (typename __nv_surf_trait::cast_type)&val, (int)sizeof(T), surf, x, mode); +#endif /* __CUDA_ARCH__ */ +} + + +//surf2Dwrite +template +static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf2Dwrite_v2", (void *)&val, s, surf, x, y, mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf2Dwrite_v2", (typename __nv_surf_trait::cast_type)&val, (int)sizeof(T), surf, x, y, mode); +#endif /* __CUDA_ARCH__ */ +} + +//surf3Dwrite +template +static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf3Dwrite_v2", 
(void *)&val, s, surf, x, y, z,mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf3Dwrite_v2", (typename __nv_surf_trait::cast_type)&val, (int)sizeof(T), surf, x, y, z, mode); +#endif /* __CUDA_ARCH__ */ +} + +//surf1DLayeredwrite +template +static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface surf, int x, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf1DLayeredwrite_v2", (void *)&val, s, surf, x, layer,mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf1DLayeredwrite_v2", (typename __nv_surf_trait::cast_type)&val, (int)sizeof(T), surf, x, layer, mode); +#endif /* __CUDA_ARCH__ */ +} + +//surf2DLayeredwrite +template +static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface surf, int x, int y, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf2DLayeredwrite_v2", (void *)&val, s, surf, x, y, layer,mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf2DLayeredwrite_v2", (typename __nv_surf_trait::cast_type)&val, (int)sizeof(T), surf, x, y, layer, mode); +#endif /* __CUDA_ARCH__ */ +} + +//surfCubemapwrite +template +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface surf, int x, int y, int face, 
int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surfCubemapwrite_v2", (void *)&val, s, surf, x, y, face, mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surfCubemapwrite_v2", (typename __nv_surf_trait::cast_type)&val, (int)sizeof(T), surf, x, y, face, mode); +#endif /* __CUDA_ARCH__ */ +} + + +//surfCubemapLayeredwrite +template +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface surf, int x, int y, int layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (void *)&val, s, surf, x, y, layerFace, mode); +#endif +} + +template +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (typename __nv_surf_trait::cast_type)&val, (int)sizeof(T), surf, x, y, layerFace, mode); +#endif /* __CUDA_ARCH__ */ +} + +#undef __DEPRECATED__ + + +#endif /* __cplusplus && __CUDACC__ */ +#endif /* !__SURFACE_FUNCTIONS_H__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_fetch_functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_fetch_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..ad970aea7a04023822ba01515a10c6e83c4d7def --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_fetch_functions.h @@ -0,0 +1,739 @@ +/* + * Copyright 1993-2017 NVIDIA 
Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 
12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__TEXTURE_FETCH_FUNCTIONS_H__) +#define __TEXTURE_FETCH_FUNCTIONS_H__ + + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" +#include "cuda_texture_types.h" + +#if defined(_WIN32) +# define __DEPRECATED__ __declspec(deprecated) +#else +# define __DEPRECATED__ __attribute__((deprecated)) +#endif + + +template +struct __nv_tex_rmet_ret { }; + +template<> struct __nv_tex_rmet_ret { typedef char type; }; +template<> struct __nv_tex_rmet_ret { typedef signed char type; }; +template<> struct __nv_tex_rmet_ret { typedef unsigned char type; }; +template<> struct __nv_tex_rmet_ret { typedef char1 type; }; +template<> struct __nv_tex_rmet_ret { typedef uchar1 type; }; +template<> struct __nv_tex_rmet_ret { typedef char2 type; }; +template<> struct __nv_tex_rmet_ret { typedef uchar2 type; }; +template<> struct __nv_tex_rmet_ret { typedef char4 type; }; +template<> struct __nv_tex_rmet_ret { typedef uchar4 type; }; + +template<> struct __nv_tex_rmet_ret { typedef short type; }; +template<> struct __nv_tex_rmet_ret { typedef unsigned short type; }; +template<> struct __nv_tex_rmet_ret { typedef short1 type; }; +template<> struct __nv_tex_rmet_ret { typedef ushort1 type; }; +template<> struct __nv_tex_rmet_ret { typedef short2 
type; }; +template<> struct __nv_tex_rmet_ret { typedef ushort2 type; }; +template<> struct __nv_tex_rmet_ret { typedef short4 type; }; +template<> struct __nv_tex_rmet_ret { typedef ushort4 type; }; + +template<> struct __nv_tex_rmet_ret { typedef int type; }; +template<> struct __nv_tex_rmet_ret { typedef unsigned int type; }; +template<> struct __nv_tex_rmet_ret { typedef int1 type; }; +template<> struct __nv_tex_rmet_ret { typedef uint1 type; }; +template<> struct __nv_tex_rmet_ret { typedef int2 type; }; +template<> struct __nv_tex_rmet_ret { typedef uint2 type; }; +template<> struct __nv_tex_rmet_ret { typedef int4 type; }; +template<> struct __nv_tex_rmet_ret { typedef uint4 type; }; + +#if !defined(__LP64__) +template<> struct __nv_tex_rmet_ret { typedef long type; }; +template<> struct __nv_tex_rmet_ret { typedef unsigned long type; }; +template<> struct __nv_tex_rmet_ret { typedef long1 type; }; +template<> struct __nv_tex_rmet_ret { typedef ulong1 type; }; +template<> struct __nv_tex_rmet_ret { typedef long2 type; }; +template<> struct __nv_tex_rmet_ret { typedef ulong2 type; }; +template<> struct __nv_tex_rmet_ret { typedef long4 type; }; +template<> struct __nv_tex_rmet_ret { typedef ulong4 type; }; +#endif /* !__LP64__ */ +template<> struct __nv_tex_rmet_ret { typedef float type; }; +template<> struct __nv_tex_rmet_ret { typedef float1 type; }; +template<> struct __nv_tex_rmet_ret { typedef float2 type; }; +template<> struct __nv_tex_rmet_ret { typedef float4 type; }; + + +template struct __nv_tex_rmet_cast { typedef T* type; }; +#if !defined(__LP64__) +template<> struct __nv_tex_rmet_cast { typedef int *type; }; +template<> struct __nv_tex_rmet_cast { typedef unsigned int *type; }; +template<> struct __nv_tex_rmet_cast { typedef int1 *type; }; +template<> struct __nv_tex_rmet_cast { typedef uint1 *type; }; +template<> struct __nv_tex_rmet_cast { typedef int2 *type; }; +template<> struct __nv_tex_rmet_cast { typedef uint2 *type; }; +template<> struct 
__nv_tex_rmet_cast { typedef int4 *type; }; +template<> struct __nv_tex_rmet_cast { typedef uint4 *type; }; +#endif /* !__LP64__ */ + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex1Dfetch(texture t, int x) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex1Dfetch_v2", (typename __nv_tex_rmet_cast::type)&temp, t, x); + return temp; +#endif +} + +template +struct __nv_tex_rmnf_ret { }; + +template <> struct __nv_tex_rmnf_ret { typedef float type; }; +template <> struct __nv_tex_rmnf_ret { typedef float type; }; +template <> struct __nv_tex_rmnf_ret { typedef float type; }; +template <> struct __nv_tex_rmnf_ret { typedef float type; }; +template <> struct __nv_tex_rmnf_ret { typedef float type; }; +template <> struct __nv_tex_rmnf_ret { typedef float1 type; }; +template <> struct __nv_tex_rmnf_ret { typedef float1 type; }; +template <> struct __nv_tex_rmnf_ret { typedef float1 type; }; +template <> struct __nv_tex_rmnf_ret { typedef float1 type; }; +template <> struct __nv_tex_rmnf_ret { typedef float2 type; }; +template <> struct __nv_tex_rmnf_ret { typedef float2 type; }; +template <> struct __nv_tex_rmnf_ret { typedef float2 type; }; +template <> struct __nv_tex_rmnf_ret { typedef float2 type; }; +template <> struct __nv_tex_rmnf_ret { typedef float4 type; }; +template <> struct __nv_tex_rmnf_ret { typedef float4 type; }; +template <> struct __nv_tex_rmnf_ret { typedef float4 type; }; +template <> struct __nv_tex_rmnf_ret { typedef float4 type; }; + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex1Dfetch(texture t, int x) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex1Dfetch_rmnf_v2", &type_dummy, &retval, t, x); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex1D +template +static __DEPRECATED__ __forceinline__ __device__ typename 
__nv_tex_rmet_ret::type tex1D(texture t, float x) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex1D_v2", (typename __nv_tex_rmet_cast::type) &temp, t, x); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex1D(texture t, float x) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex1D_rmnf_v2", &type_dummy, &retval, t, x); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +//tex2D +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex2D(texture t, float x, float y) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + + __nv_tex_surf_handler("__tex2D_v2", (typename __nv_tex_rmet_cast::type) &temp, t, x, y); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex2D(texture t, float x, float y) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex2D_rmnf_v2", &type_dummy, &retval, t, x, y); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +//tex1DLayered +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex1DLayered(texture t, float x, int layer) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex1DLayered_v2", (typename __nv_tex_rmet_cast::type) &temp, t, x, layer); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex1DLayered(texture t, float x, int layer) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex1DLayered_rmnf_v2", &type_dummy, &retval, t, x, layer); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +//tex2DLayered +template +static __DEPRECATED__ __forceinline__ 
__device__ typename __nv_tex_rmet_ret::type tex2DLayered(texture t, float x, float y, int layer) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex2DLayered_v2", (typename __nv_tex_rmet_cast::type) &temp, t, x, y, layer); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex2DLayered(texture t, float x, float y, int layer) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex2DLayered_rmnf_v2", &type_dummy, &retval, t, x, y, layer); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex3D +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex3D(texture t, float x, float y, float z) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex3D_v2", (typename __nv_tex_rmet_cast::type) &temp, t, x, y, z); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex3D(texture t, float x, float y, float z) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex3D_rmnf_v2", &type_dummy, &retval, t, x, y, z); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// texCubemap +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type texCubemap(texture t, float x, float y, float z) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__texCubemap_v2", (typename __nv_tex_rmet_cast::type) &temp, t, x, y, z); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type texCubemap(texture t, float x, float y, float z) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__texCubemap_rmnf_v2", &type_dummy, &retval, t, 
x, y, z); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +template +struct __nv_tex2dgather_ret { }; +template <> struct __nv_tex2dgather_ret { typedef char4 type; }; +template <> struct __nv_tex2dgather_ret { typedef char4 type; }; +template <> struct __nv_tex2dgather_ret { typedef char4 type; }; +template <> struct __nv_tex2dgather_ret { typedef char4 type; }; +template <> struct __nv_tex2dgather_ret { typedef char4 type; }; +template <> struct __nv_tex2dgather_ret { typedef char4 type; }; +template <> struct __nv_tex2dgather_ret { typedef uchar4 type; }; +template <> struct __nv_tex2dgather_ret { typedef uchar4 type; }; +template <> struct __nv_tex2dgather_ret { typedef uchar4 type; }; +template <> struct __nv_tex2dgather_ret { typedef uchar4 type; }; +template <> struct __nv_tex2dgather_ret { typedef uchar4 type; }; + +template <> struct __nv_tex2dgather_ret { typedef short4 type; }; +template <> struct __nv_tex2dgather_ret { typedef short4 type; }; +template <> struct __nv_tex2dgather_ret { typedef short4 type; }; +template <> struct __nv_tex2dgather_ret { typedef short4 type; }; +template <> struct __nv_tex2dgather_ret { typedef short4 type; }; +template <> struct __nv_tex2dgather_ret { typedef ushort4 type; }; +template <> struct __nv_tex2dgather_ret { typedef ushort4 type; }; +template <> struct __nv_tex2dgather_ret { typedef ushort4 type; }; +template <> struct __nv_tex2dgather_ret { typedef ushort4 type; }; +template <> struct __nv_tex2dgather_ret { typedef ushort4 type; }; + +template <> struct __nv_tex2dgather_ret { typedef int4 type; }; +template <> struct __nv_tex2dgather_ret { typedef int4 type; }; +template <> struct __nv_tex2dgather_ret { typedef int4 type; }; +template <> struct __nv_tex2dgather_ret { typedef int4 type; }; +template <> struct __nv_tex2dgather_ret { typedef int4 type; }; +template <> struct __nv_tex2dgather_ret { typedef uint4 type; }; +template <> struct __nv_tex2dgather_ret { typedef uint4 type; }; +template <> struct 
__nv_tex2dgather_ret { typedef uint4 type; }; +template <> struct __nv_tex2dgather_ret { typedef uint4 type; }; +template <> struct __nv_tex2dgather_ret { typedef uint4 type; }; + +template <> struct __nv_tex2dgather_ret { typedef float4 type; }; +template <> struct __nv_tex2dgather_ret { typedef float4 type; }; +template <> struct __nv_tex2dgather_ret { typedef float4 type; }; +template <> struct __nv_tex2dgather_ret { typedef float4 type; }; +template <> struct __nv_tex2dgather_ret { typedef float4 type; }; + +template +static __device__ __forceinline__ typename __nv_tex2dgather_ret::type tex2Dgather(texture t, float x, float y, int comp=0) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex2dgather_ret::type retval; + __nv_tex_surf_handler("__tex2Dgather_v2", &type_dummy, &retval, t, x, y, comp); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +template struct __nv_tex2dgather_rmnf_ret { }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> 
struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret { typedef float4 type; }; + +template +static __device__ __forceinline__ typename __nv_tex2dgather_rmnf_ret::type tex2Dgather(texture t, float x, float y, int comp = 0) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex2dgather_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex2Dgather_rmnf_v2", &type_dummy, &retval, t, x, y, comp); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// tex1DLod +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex1DLod(texture t, float x, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex1DLod_v2", (typename __nv_tex_rmet_cast::type)&temp, t, x, level); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex1DLod(texture t, float x, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex1DLod_rmnf_v2", &type_dummy, &retval, t, x, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex2DLod +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex2DLod(texture t, float x, float y, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex2DLod_v2", (typename __nv_tex_rmet_cast::type)&temp, t, x, y, level); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex2DLod(texture t, float x, float y, float level) +{ +#ifdef __CUDA_ARCH__ + 
T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex2DLod_rmnf_v2", &type_dummy, &retval, t, x, y, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex1DLayeredLod +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex1DLayeredLod(texture t, float x, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex1DLayeredLod_v2", (typename __nv_tex_rmet_cast::type)&temp, t, x, layer, level); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex1DLayeredLod(texture t, float x, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex1DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, layer, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex2DLayeredLod +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex2DLayeredLod(texture t, float x, float y, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex2DLayeredLod_v2", (typename __nv_tex_rmet_cast::type)&temp, t, x, y, layer, level); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex2DLayeredLod(texture t, float x, float y, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex2DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, layer, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex3DLod +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex3DLod(texture t, float x, float y, float z, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + 
__nv_tex_surf_handler("__tex3DLod_v2",(typename __nv_tex_rmet_cast::type)&temp, t, x, y, z, level); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex3DLod(texture t, float x, float y, float z, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex3DLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// texCubemapLod +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type texCubemapLod(texture t, float x, float y, float z, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__texCubemapLod_v2",(typename __nv_tex_rmet_cast::type)&temp, t, x, y, z, level); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type texCubemapLod(texture t, float x, float y, float z, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__texCubemapLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// texCubemapLayered +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type texCubemapLayered(texture t, float x, float y, float z, int layer) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__texCubemapLayered_v2",(typename __nv_tex_rmet_cast::type)&temp, t, x, y, z, layer); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type texCubemapLayered(texture t, float x, float y, float z, int layer) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__texCubemapLayered_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer); + 
return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// texCubemapLayeredLod +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type texCubemapLayeredLod(texture t, float x, float y, float z, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__texCubemapLayeredLod_v2", (typename __nv_tex_rmet_cast::type)&temp, t, x, y, z, layer, level); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type texCubemapLayeredLod(texture t, float x, float y, float z, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__texCubemapLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// texCubemapGrad +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type texCubemapGrad(texture t, float x, float y, float z, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__texCubemapGrad_v2", (typename __nv_tex_rmet_cast::type)&temp, t, x, y, z, &dPdx, &dPdy); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type texCubemapGrad(texture t, float x, float y, float z, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__texCubemapGrad_rmnf_v2", &type_dummy, &retval, t, x, y, z, &dPdx, &dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// texCubemapLayeredGrad +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type texCubemapLayeredGrad(texture t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + 
__nv_tex_surf_handler("__texCubemapLayeredGrad_v2", (typename __nv_tex_rmet_cast::type)&temp, t, x, y, z, layer, &dPdx, &dPdy); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type texCubemapLayeredGrad(texture t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__texCubemapLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, layer, &dPdx, &dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// tex1DGrad +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex1DGrad(texture t, float x, float dPdx, float dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex1DGrad_v2", (typename __nv_tex_rmet_cast::type)&temp, t, x, dPdx, dPdy); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex1DGrad(texture t, float x, float dPdx, float dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex1DGrad_rmnf_v2", &type_dummy, &retval,t, x,dPdx, dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// tex2DGrad +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex2DGrad(texture t, float x, float y, float2 dPdx, float2 dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex2DGrad_v2", (typename __nv_tex_rmet_cast::type)&temp, t, x, y, &dPdx, &dPdy); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex2DGrad(texture t, float x, float y, float2 dPdx, float2 dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex2DGrad_rmnf_v2", &type_dummy, 
&retval,t, x, y, &dPdx, &dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex1DLayeredGrad +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex1DLayeredGrad(texture t, float x, int layer, float dPdx, float dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex1DLayeredGrad_v2",(typename __nv_tex_rmet_cast::type)&temp, t, x, layer, dPdx, dPdy); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex1DLayeredGrad(texture t, float x, int layer, float dPdx, float dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex1DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, layer, dPdx, dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex2DLayeredGrad +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex2DLayeredGrad(texture t, float x, float y, int layer, float2 dPdx, float2 dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex2DLayeredGrad_v2",(typename __nv_tex_rmet_cast::type)&temp, t, x, y, layer, &dPdx, &dPdy); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex2DLayeredGrad(texture t, float x, float y, int layer, float2 dPdx, float2 dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex2DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, layer, &dPdx, &dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex3DGrad +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret::type tex3DGrad(texture t, float x, float y, float z, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret::type temp; + __nv_tex_surf_handler("__tex3DGrad_v2", 
(typename __nv_tex_rmet_cast::type)&temp, t, x, y, z, &dPdx, &dPdy); + return temp; +#endif +} + +template +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret::type tex3DGrad(texture t, float x, float y, float z, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret::type retval; + __nv_tex_surf_handler("__tex3DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, &dPdx, &dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +#undef __DEPRECATED__ + +#endif /* __cplusplus && __CUDACC__ */ + +#endif /* !__TEXTURE_FETCH_FUNCTIONS_H__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0fceb1208fe38192addd97bed1f2c19b0eebd61 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h new file mode 100644 index 0000000000000000000000000000000000000000..8d3e341c8ac1735ad16deb34995ac4c0902da053 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h @@ -0,0 +1,78 @@ +/* + * Copyright 2017-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. 
Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* cudnn : Neural Networks Library + +*/ + +#if !defined(CUDNN_H_) +#define CUDNN_H_ + +#include +#include + +#include "cudnn_version.h" +#include "cudnn_ops_infer.h" +#include "cudnn_ops_train.h" +#include "cudnn_adv_infer.h" +#include "cudnn_adv_train.h" +#include "cudnn_cnn_infer.h" +#include "cudnn_cnn_train.h" + +#include "cudnn_backend.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer.h new file mode 100644 index 0000000000000000000000000000000000000000..1aa47bbc71d664de3af742f1c5223b149ee5d3f3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer.h @@ -0,0 +1,658 @@ +/* + * Copyright 2017-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +/* cudnn_adv_infer : cuDNN's advanced and experimental features. + +*/ + +#if !defined(CUDNN_ADV_INFER_H_) +#define CUDNN_ADV_INFER_H_ + +#include +#include + +#include "cudnn_version.h" +#include "cudnn_ops_infer.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_ADV_INFER_MAJOR 8 +#define CUDNN_ADV_INFER_MINOR 7 +#define CUDNN_ADV_INFER_PATCH 0 + +#if (CUDNN_ADV_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_INFER_MINOR != CUDNN_MINOR) || \ + (CUDNN_ADV_INFER_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN ADV INFER!!! +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/* BASIC RNN API */ + +typedef enum { + CUDNN_FWD_MODE_INFERENCE = 0, + CUDNN_FWD_MODE_TRAINING = 1, +} cudnnForwardMode_t; + +typedef enum { + CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */ + CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */ + CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */ + CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */ +} cudnnRNNMode_t; + +typedef enum { + CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */ + CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */ + CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */ + CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */ +} cudnnRNNBiasMode_t; + +typedef enum { + CUDNN_UNIDIRECTIONAL = 0, /* single direction network */ + CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */ +} cudnnDirectionMode_t; + +typedef enum { + CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */ + CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */ +} cudnnRNNInputMode_t; + +typedef enum { + CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */ + CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell 
clipping */ +} cudnnRNNClipMode_t; + +typedef enum { + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */ + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */ + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */ +} cudnnRNNDataLayout_t; + +/* Legacy type for backward compatibility */ +typedef unsigned cudnnRNNPaddingMode_t; + +/* For auxFlags in cudnnSetRNNDescriptor_v8() and cudnnSetRNNPaddingMode() */ +#define CUDNN_RNN_PADDED_IO_DISABLED 0 +#define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0) + +struct cudnnRNNStruct; +typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t; + +struct cudnnPersistentRNNPlan; +typedef struct cudnnPersistentRNNPlan *cudnnPersistentRNNPlan_t; + +struct cudnnRNNDataStruct; +typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNAlgo_t algo, + cudnnRNNMode_t cellMode, + cudnnRNNBiasMode_t biasMode, + cudnnDirectionMode_t dirMode, + cudnnRNNInputMode_t inputMode, + cudnnDataType_t dataType, + cudnnDataType_t mathPrec, + cudnnMathType_t mathType, + int32_t inputSize, + int32_t hiddenSize, + int32_t projSize, + int32_t numLayers, + cudnnDropoutDescriptor_t dropoutDesc, + uint32_t auxFlags); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNAlgo_t *algo, + cudnnRNNMode_t *cellMode, + cudnnRNNBiasMode_t *biasMode, + cudnnDirectionMode_t *dirMode, + cudnnRNNInputMode_t *inputMode, + cudnnDataType_t *dataType, + cudnnDataType_t *mathPrec, + cudnnMathType_t *mathType, + int32_t *inputSize, + int32_t *hiddenSize, + int32_t *projSize, + int32_t *numLayers, + cudnnDropoutDescriptor_t 
*dropoutDesc, + uint32_t *auxFlags); + +/* + * mathPrec in cudnnSetRNNDescriptor_v6() specifies compute precision + * compute precision is further modified by cudnnSetRNNMatrixMathType() + * dataType in cudnnGetRNNParamsSize() and wDesc specify weight storage + * dropout is between RNN layers, not between recurrent steps + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDescriptor_v6(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + const int hiddenSize, + const int numLayers, + cudnnDropoutDescriptor_t dropoutDesc, + cudnnRNNInputMode_t inputMode, + cudnnDirectionMode_t direction, + cudnnRNNMode_t cellMode, + cudnnRNNAlgo_t algo, + cudnnDataType_t mathPrec); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDescriptor_v6(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + int *hiddenSize, + int *numLayers, + cudnnDropoutDescriptor_t *dropoutDesc, + cudnnRNNInputMode_t *inputMode, + cudnnDirectionMode_t *direction, + cudnnRNNMode_t *cellMode, + cudnnRNNAlgo_t *algo, + cudnnDataType_t *mathPrec); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t biasMode); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t *biasMode); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t clipMode, + cudnnNanPropagation_t clipNanOpt, + double lclip, + double rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t *clipMode, + cudnnNanPropagation_t *clipNanOpt, + double *lclip, + double *rclip); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNSetClip(cudnnHandle_t handle, + 
cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t clipMode, + cudnnNanPropagation_t clipNanOpt, + double lclip, + double rclip); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNGetClip(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t *clipMode, + cudnnNanPropagation_t *clipNanOpt, + double *lclip, + double *rclip); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetRNNProjectionLayers(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + const int recProjSize, + const int outProjSize); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNProjectionLayers(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + int *recProjSize, + int *outProjSize); + +/* Expensive. Creates the plan for the specific settings. */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, + const int minibatch, + const cudnnDataType_t dataType, + cudnnPersistentRNNPlan_t *plan); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan); + +cudnnStatus_t CUDNNWINAPI +cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch); + +/* dataType in weight descriptors and input descriptors is used to describe storage */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNWorkspaceSize(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNTrainingReserveSize(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnForwardMode_t fMode, 
+ cudnnRNNDataDescriptor_t xDesc, + size_t *workSpaceSize, + size_t *reserveSpaceSize); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNParamsSize(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes, + cudnnDataType_t dataType); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNLinLayerMatrixParams(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int pseudoLayer, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const int linLayerID, + cudnnFilterDescriptor_t linLayerMatDesc, + void **linLayerMat); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNLinLayerBiasParams(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int pseudoLayer, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const int linLayerID, + cudnnFilterDescriptor_t linLayerBiasDesc, + void **linLayerBias); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNWeightParams(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + int32_t pseudoLayer, + size_t weightSpaceSize, + const void *weightSpace, + int32_t linLayerID, + cudnnTensorDescriptor_t mDesc, + void **mAddr, + cudnnTensorDescriptor_t bDesc, + void **bAddr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNForwardInference(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t *yDesc, + void *y, + const cudnnTensorDescriptor_t hyDesc, + void *hy, + const cudnnTensorDescriptor_t cyDesc, + void *cy, + void 
*workSpace, + size_t workSpaceSizeInBytes); + +/* RNN EX API */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned paddingMode); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned *paddingMode); + +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, + cudnnDataType_t dataType, + cudnnRNNDataLayout_t layout, + int maxSeqLength, + int batchSize, + int vectorSize, + const int seqLengthArray[], /* length of each sequence in the batch */ + void *paddingFill); /* symbol for filling padding position in output */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, + cudnnDataType_t *dataType, + cudnnRNNDataLayout_t *layout, + int *maxSeqLength, + int *batchSize, + int *vectorSize, + int arrayLengthRequested, + int seqLengthArray[], + void *paddingFill); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNForwardInferenceEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnRNNDataDescriptor_t yDesc, + void *y, + const cudnnTensorDescriptor_t hyDesc, + void *hy, + const cudnnTensorDescriptor_t cyDesc, + void *cy, + const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ + const void *keys, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ + void *cAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ + void 
*iAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ + void *queries, /* reserved, should pass NULL */ + void *workSpace, + size_t workSpaceSizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNForward(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnForwardMode_t fwdMode, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t xDesc, + const void *x, + cudnnRNNDataDescriptor_t yDesc, + void *y, + cudnnTensorDescriptor_t hDesc, + const void *hx, + void *hy, + cudnnTensorDescriptor_t cDesc, + const void *cx, + void *cy, + size_t weightSpaceSize, + const void *weightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +/* RNN FIND API */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, cudnnAlgorithmDescriptor_t algoDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNForwardInferenceAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindRNNForwardInferenceAlgorithmEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t *yDesc, + void *y, + const cudnnTensorDescriptor_t hyDesc, + void *hy, + const cudnnTensorDescriptor_t cyDesc, + void *cy, + const float findIntensity, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, + void *workspace, + size_t workSpaceSizeInBytes); + +/* Sequence data descriptor */ + +typedef enum { + CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */ + CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */ + 
CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */ + CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */ +} cudnnSeqDataAxis_t; + +struct cudnnSeqDataStruct; +typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t; + +#define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */ + +cudnnStatus_t CUDNNWINAPI +cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const cudnnSeqDataAxis_t axes[], + size_t seqLengthArraySize, + const int seqLengthArray[], + void *paddingFill); + +cudnnStatus_t CUDNNWINAPI +cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc, + cudnnDataType_t *dataType, + int *nbDims, + int nbDimsRequested, + int dimA[], + cudnnSeqDataAxis_t axes[], + size_t *seqLengthArraySize, + size_t seqLengthSizeRequested, + int seqLengthArray[], + void *paddingFill); + +/* Multihead Attention */ + +/* Legacy type for backward compatibility */ +typedef unsigned cudnnAttnQueryMap_t; + +/* + * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor(). + * Use the bitwise OR operator to combine several settings listed below. Additional + * minor options can be added here w/o changing or introducing new API functions. 
+ */ +#define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */ +#define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */ +#define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */ +#define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */ + +struct cudnnAttnStruct; +typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, + unsigned attnMode, + int nHeads, + double smScaler, + cudnnDataType_t dataType, + cudnnDataType_t computePrec, + cudnnMathType_t mathType, + cudnnDropoutDescriptor_t attnDropoutDesc, + cudnnDropoutDescriptor_t postDropoutDesc, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoMaxSeqLength, + int kvMaxSeqLength, + int maxBatchSize, + int maxBeamSize); + +cudnnStatus_t CUDNNWINAPI +cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, + unsigned *attnMode, + int *nHeads, + double *smScaler, + cudnnDataType_t *dataType, + cudnnDataType_t *computePrec, + cudnnMathType_t *mathType, + cudnnDropoutDescriptor_t *attnDropoutDesc, + cudnnDropoutDescriptor_t *postDropoutDesc, + int *qSize, + int *kSize, + int *vSize, + int *qProjSize, + int *kProjSize, + int *vProjSize, + int *oProjSize, + int *qoMaxSeqLength, + int *kvMaxSeqLength, + int *maxBatchSize, + int *maxBeamSize); + +cudnnStatus_t CUDNNWINAPI +cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + size_t *weightSizeInBytes, + size_t *workSpaceSizeInBytes, + size_t *reserveSpaceSizeInBytes); + +typedef enum { + CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* 
input projection weights for 'queries' */ + CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */ + CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */ + CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */ + CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */ + CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */ + CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */ + CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */ +} cudnnMultiHeadAttnWeightKind_t; + +#define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + cudnnMultiHeadAttnWeightKind_t wKind, + size_t weightSizeInBytes, + const void *weights, + cudnnTensorDescriptor_t wDesc, + void **wAddr); + +cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnForward(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + int currIdx, + const int loWinIdx[], + const int hiWinIdx[], + const int devSeqLengthsQO[], + const int devSeqLengthsKV[], + const cudnnSeqDataDescriptor_t qDesc, + const void *queries, + const void *residuals, + const cudnnSeqDataDescriptor_t kDesc, + const void *keys, + const cudnnSeqDataDescriptor_t vDesc, + const void *values, + const cudnnSeqDataDescriptor_t oDesc, + void *out, + size_t weightSizeInBytes, + const void *weights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent. 
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnAdvInferVersionCheck(void); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_ADV_INFER_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h new file mode 100644 index 0000000000000000000000000000000000000000..7b0197794116f93c1304639cf1de950e1db686e4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h @@ -0,0 +1,6840 @@ +/* + * Copyright 1993-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ +#if !defined(CUSPARSE_H_) +#define CUSPARSE_H_ + +#include +#include +#include +#include +#include +#include + +//############################################################################## +//# CUSPARSE VERSION INFORMATION +//############################################################################## + +#define CUSPARSE_VER_MAJOR 11 +#define CUSPARSE_VER_MINOR 7 +#define CUSPARSE_VER_PATCH 5 +#define CUSPARSE_VER_BUILD 86 +#define CUSPARSE_VERSION (CUSPARSE_VER_MAJOR * 1000 + \ + CUSPARSE_VER_MINOR * 100 + \ + CUSPARSE_VER_PATCH) + +// ############################################################################# +// # BASIC MACROS +// ############################################################################# + +#if !defined(CUSPARSEAPI) +# if defined(_WIN32) +# define CUSPARSEAPI __stdcall +# else +# define CUSPARSEAPI +# endif +#endif + +//------------------------------------------------------------------------------ + +#if !defined(_MSC_VER) +# define CUSPARSE_CPP_VERSION __cplusplus +#elif _MSC_FULL_VER >= 190024210 // Visual Studio 2015 Update 3 +# define CUSPARSE_CPP_VERSION _MSVC_LANG +#else +# define CUSPARSE_CPP_VERSION 0 +#endif + +// ############################################################################# +// # CUSPARSE_DEPRECATED MACRO +// ############################################################################# + +#if !defined(DISABLE_CUSPARSE_DEPRECATED) + +# if CUSPARSE_CPP_VERSION >= 201402L + +# define CUSPARSE_DEPRECATED(new_func) \ + [[deprecated("please use " #new_func " instead")]] + +# elif defined(_MSC_VER) + +# define CUSPARSE_DEPRECATED(new_func) \ + __declspec(deprecated("please use " #new_func " instead")) + +# elif defined(__INTEL_COMPILER) || defined(__clang__) || \ + (defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) + +# define CUSPARSE_DEPRECATED(new_func) \ + __attribute__((deprecated("please use " #new_func " instead"))) + +# elif defined(__GNUC__) || defined(__xlc__) + +# 
define CUSPARSE_DEPRECATED(new_func) \ + __attribute__((deprecated)) + +# else + +# define CUSPARSE_DEPRECATED(new_func) + +# endif // defined(__cplusplus) && __cplusplus >= 201402L +//------------------------------------------------------------------------------ + +# if CUSPARSE_CPP_VERSION >= 201703L + +# define CUSPARSE_DEPRECATED_ENUM(new_enum) \ + [[deprecated("please use " #new_enum " instead")]] + +# elif defined(__clang__) || \ + (defined(__GNUC__) && __GNUC__ >= 6 && !defined(__PGI)) + +# define CUSPARSE_DEPRECATED_ENUM(new_enum) \ + __attribute__((deprecated("please use " #new_enum " instead"))) + +# else + +# define CUSPARSE_DEPRECATED_ENUM(new_enum) + +# endif // defined(__cplusplus) && __cplusplus >= 201402L + +#else // defined(DISABLE_CUSPARSE_DEPRECATED) + +# define CUSPARSE_DEPRECATED(new_func) +# define CUSPARSE_DEPRECATED_ENUM(new_enum) + +#endif // !defined(DISABLE_CUSPARSE_DEPRECATED) + +#undef CUSPARSE_CPP_VERSION + +//------------------------------------------------------------------------------ + +#if defined(__cplusplus) +extern "C" { +#endif // defined(__cplusplus) + +//############################################################################## +//# OPAQUE DATA STRUCTURES +//############################################################################## + +struct cusparseContext; +typedef struct cusparseContext* cusparseHandle_t; + +struct cusparseMatDescr; +typedef struct cusparseMatDescr* cusparseMatDescr_t; + +struct csrsv2Info; +typedef struct csrsv2Info* csrsv2Info_t; + +struct csrsm2Info; +typedef struct csrsm2Info* csrsm2Info_t; + +struct bsrsv2Info; +typedef struct bsrsv2Info* bsrsv2Info_t; + +struct bsrsm2Info; +typedef struct bsrsm2Info* bsrsm2Info_t; + +struct csric02Info; +typedef struct csric02Info* csric02Info_t; + +struct bsric02Info; +typedef struct bsric02Info* bsric02Info_t; + +struct csrilu02Info; +typedef struct csrilu02Info* csrilu02Info_t; + +struct bsrilu02Info; +typedef struct bsrilu02Info* bsrilu02Info_t; + 
+struct csrgemm2Info; +typedef struct csrgemm2Info* csrgemm2Info_t; + +struct csru2csrInfo; +typedef struct csru2csrInfo* csru2csrInfo_t; + +struct cusparseColorInfo; +typedef struct cusparseColorInfo* cusparseColorInfo_t; + +struct pruneInfo; +typedef struct pruneInfo* pruneInfo_t; + +//############################################################################## +//# ENUMERATORS +//############################################################################## + +typedef enum { + CUSPARSE_STATUS_SUCCESS = 0, + CUSPARSE_STATUS_NOT_INITIALIZED = 1, + CUSPARSE_STATUS_ALLOC_FAILED = 2, + CUSPARSE_STATUS_INVALID_VALUE = 3, + CUSPARSE_STATUS_ARCH_MISMATCH = 4, + CUSPARSE_STATUS_MAPPING_ERROR = 5, + CUSPARSE_STATUS_EXECUTION_FAILED = 6, + CUSPARSE_STATUS_INTERNAL_ERROR = 7, + CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8, + CUSPARSE_STATUS_ZERO_PIVOT = 9, + CUSPARSE_STATUS_NOT_SUPPORTED = 10, + CUSPARSE_STATUS_INSUFFICIENT_RESOURCES = 11 +} cusparseStatus_t; + +typedef enum { + CUSPARSE_POINTER_MODE_HOST = 0, + CUSPARSE_POINTER_MODE_DEVICE = 1 +} cusparsePointerMode_t; + +typedef enum { + CUSPARSE_ACTION_SYMBOLIC = 0, + CUSPARSE_ACTION_NUMERIC = 1 +} cusparseAction_t; + +typedef enum { + CUSPARSE_MATRIX_TYPE_GENERAL = 0, + CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1, + CUSPARSE_MATRIX_TYPE_HERMITIAN = 2, + CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 +} cusparseMatrixType_t; + +typedef enum { + CUSPARSE_FILL_MODE_LOWER = 0, + CUSPARSE_FILL_MODE_UPPER = 1 +} cusparseFillMode_t; + +typedef enum { + CUSPARSE_DIAG_TYPE_NON_UNIT = 0, + CUSPARSE_DIAG_TYPE_UNIT = 1 +} cusparseDiagType_t; + +typedef enum { + CUSPARSE_INDEX_BASE_ZERO = 0, + CUSPARSE_INDEX_BASE_ONE = 1 +} cusparseIndexBase_t; + +typedef enum { + CUSPARSE_OPERATION_NON_TRANSPOSE = 0, + CUSPARSE_OPERATION_TRANSPOSE = 1, + CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 +} cusparseOperation_t; + +typedef enum { + CUSPARSE_DIRECTION_ROW = 0, + CUSPARSE_DIRECTION_COLUMN = 1 +} cusparseDirection_t; + +typedef enum { + 
CUSPARSE_SOLVE_POLICY_NO_LEVEL = 0, + CUSPARSE_SOLVE_POLICY_USE_LEVEL = 1 +} cusparseSolvePolicy_t; + +typedef enum { + CUSPARSE_COLOR_ALG0 = 0, // default + CUSPARSE_COLOR_ALG1 = 1 +} cusparseColorAlg_t; + +typedef enum { + CUSPARSE_ALG_MERGE_PATH // merge path alias +} cusparseAlgMode_t; + +//############################################################################## +//# INITIALIZATION AND MANAGEMENT ROUTINES +//############################################################################## + +cusparseStatus_t CUSPARSEAPI +cusparseCreate(cusparseHandle_t* handle); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroy(cusparseHandle_t handle); + +cusparseStatus_t CUSPARSEAPI +cusparseGetVersion(cusparseHandle_t handle, + int* version); + +cusparseStatus_t CUSPARSEAPI +cusparseGetProperty(libraryPropertyType type, + int* value); + +const char* CUSPARSEAPI +cusparseGetErrorName(cusparseStatus_t status); + +const char* CUSPARSEAPI +cusparseGetErrorString(cusparseStatus_t status); + +cusparseStatus_t CUSPARSEAPI +cusparseSetStream(cusparseHandle_t handle, + cudaStream_t streamId); + +cusparseStatus_t CUSPARSEAPI +cusparseGetStream(cusparseHandle_t handle, + cudaStream_t* streamId); + +cusparseStatus_t CUSPARSEAPI +cusparseGetPointerMode(cusparseHandle_t handle, + cusparsePointerMode_t* mode); + +cusparseStatus_t CUSPARSEAPI +cusparseSetPointerMode(cusparseHandle_t handle, + cusparsePointerMode_t mode); + +//############################################################################## +//# LOGGING APIs +//############################################################################## + +typedef void (*cusparseLoggerCallback_t)(int logLevel, + const char* functionName, + const char* message); + +cusparseStatus_t CUSPARSEAPI +cusparseLoggerSetCallback(cusparseLoggerCallback_t callback); + +cusparseStatus_t CUSPARSEAPI +cusparseLoggerSetFile(FILE* file); + +cusparseStatus_t CUSPARSEAPI +cusparseLoggerOpenFile(const char* logFile); + +cusparseStatus_t CUSPARSEAPI 
+cusparseLoggerSetLevel(int level); + +cusparseStatus_t CUSPARSEAPI +cusparseLoggerSetMask(int mask); + +cusparseStatus_t CUSPARSEAPI +cusparseLoggerForceDisable(void); + +//############################################################################## +//# HELPER ROUTINES +//############################################################################## + +cusparseStatus_t CUSPARSEAPI +cusparseCreateMatDescr(cusparseMatDescr_t* descrA); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyMatDescr(cusparseMatDescr_t descrA); + +cusparseStatus_t CUSPARSEAPI +cusparseCopyMatDescr(cusparseMatDescr_t dest, + const cusparseMatDescr_t src); + +cusparseStatus_t CUSPARSEAPI +cusparseSetMatType(cusparseMatDescr_t descrA, + cusparseMatrixType_t type); + +cusparseMatrixType_t CUSPARSEAPI +cusparseGetMatType(const cusparseMatDescr_t descrA); + +cusparseStatus_t CUSPARSEAPI +cusparseSetMatFillMode(cusparseMatDescr_t descrA, + cusparseFillMode_t fillMode); + +cusparseFillMode_t CUSPARSEAPI +cusparseGetMatFillMode(const cusparseMatDescr_t descrA); + +cusparseStatus_t CUSPARSEAPI +cusparseSetMatDiagType(cusparseMatDescr_t descrA, + cusparseDiagType_t diagType); + +cusparseDiagType_t CUSPARSEAPI +cusparseGetMatDiagType(const cusparseMatDescr_t descrA); + +cusparseStatus_t CUSPARSEAPI +cusparseSetMatIndexBase(cusparseMatDescr_t descrA, + cusparseIndexBase_t base); + +cusparseIndexBase_t CUSPARSEAPI +cusparseGetMatIndexBase(const cusparseMatDescr_t descrA); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsrsv2Info(csrsv2Info_t* info); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseDestroyCsrsv2Info(csrsv2Info_t info); + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsric02Info(csric02Info_t* info); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyCsric02Info(csric02Info_t info); + +cusparseStatus_t CUSPARSEAPI +cusparseCreateBsric02Info(bsric02Info_t* info); + +cusparseStatus_t CUSPARSEAPI 
+cusparseDestroyBsric02Info(bsric02Info_t info); + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsrilu02Info(csrilu02Info_t* info); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyCsrilu02Info(csrilu02Info_t info); + +cusparseStatus_t CUSPARSEAPI +cusparseCreateBsrilu02Info(bsrilu02Info_t* info); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyBsrilu02Info(bsrilu02Info_t info); + +cusparseStatus_t CUSPARSEAPI +cusparseCreateBsrsv2Info(bsrsv2Info_t* info); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyBsrsv2Info(bsrsv2Info_t info); + +cusparseStatus_t CUSPARSEAPI +cusparseCreateBsrsm2Info(bsrsm2Info_t* info); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyBsrsm2Info(bsrsm2Info_t info); + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsru2csrInfo(csru2csrInfo_t* info); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyCsru2csrInfo(csru2csrInfo_t info); + +cusparseStatus_t CUSPARSEAPI +cusparseCreateColorInfo(cusparseColorInfo_t* info); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyColorInfo(cusparseColorInfo_t info); + +cusparseStatus_t CUSPARSEAPI +cusparseSetColorAlgs(cusparseColorInfo_t info, + cusparseColorAlg_t alg); + +cusparseStatus_t CUSPARSEAPI +cusparseGetColorAlgs(cusparseColorInfo_t info, + cusparseColorAlg_t* alg); + +cusparseStatus_t CUSPARSEAPI +cusparseCreatePruneInfo(pruneInfo_t* info); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyPruneInfo(pruneInfo_t info); + +//############################################################################## +//# SPARSE LEVEL 1 ROUTINES +//############################################################################## + +CUSPARSE_DEPRECATED(cusparseAxpby) +cusparseStatus_t CUSPARSEAPI +cusparseSaxpyi(cusparseHandle_t handle, + int nnz, + const float* alpha, + const float* xVal, + const int* xInd, + float* y, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseAxpby) +cusparseStatus_t CUSPARSEAPI +cusparseDaxpyi(cusparseHandle_t handle, + int nnz, + const double* alpha, + const double* xVal, + const 
int* xInd, + double* y, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseAxpby) +cusparseStatus_t CUSPARSEAPI +cusparseCaxpyi(cusparseHandle_t handle, + int nnz, + const cuComplex* alpha, + const cuComplex* xVal, + const int* xInd, + cuComplex* y, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseAxpby) +cusparseStatus_t CUSPARSEAPI +cusparseZaxpyi(cusparseHandle_t handle, + int nnz, + const cuDoubleComplex* alpha, + const cuDoubleComplex* xVal, + const int* xInd, + cuDoubleComplex* y, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseGather) +cusparseStatus_t CUSPARSEAPI +cusparseSgthr(cusparseHandle_t handle, + int nnz, + const float* y, + float* xVal, + const int* xInd, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseGather) +cusparseStatus_t CUSPARSEAPI +cusparseDgthr(cusparseHandle_t handle, + int nnz, + const double* y, + double* xVal, + const int* xInd, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseGather) +cusparseStatus_t CUSPARSEAPI +cusparseCgthr(cusparseHandle_t handle, + int nnz, + const cuComplex* y, + cuComplex* xVal, + const int* xInd, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseGather) +cusparseStatus_t CUSPARSEAPI +cusparseZgthr(cusparseHandle_t handle, + int nnz, + const cuDoubleComplex* y, + cuDoubleComplex* xVal, + const int* xInd, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseGather) +cusparseStatus_t CUSPARSEAPI +cusparseSgthrz(cusparseHandle_t handle, + int nnz, + float* y, + float* xVal, + const int* xInd, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseGather) +cusparseStatus_t CUSPARSEAPI +cusparseDgthrz(cusparseHandle_t handle, + int nnz, + double* y, + double* xVal, + const int* xInd, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseGather) +cusparseStatus_t CUSPARSEAPI +cusparseCgthrz(cusparseHandle_t handle, + int nnz, + cuComplex* y, + cuComplex* xVal, + const int* xInd, + cusparseIndexBase_t 
idxBase); + +CUSPARSE_DEPRECATED(cusparseGather) +cusparseStatus_t CUSPARSEAPI +cusparseZgthrz(cusparseHandle_t handle, + int nnz, + cuDoubleComplex* y, + cuDoubleComplex* xVal, + const int* xInd, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseScatter) +cusparseStatus_t CUSPARSEAPI +cusparseSsctr(cusparseHandle_t handle, + int nnz, + const float* xVal, + const int* xInd, + float* y, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseScatter) +cusparseStatus_t CUSPARSEAPI +cusparseDsctr(cusparseHandle_t handle, + int nnz, + const double* xVal, + const int* xInd, + double* y, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseScatter) +cusparseStatus_t CUSPARSEAPI +cusparseCsctr(cusparseHandle_t handle, + int nnz, + const cuComplex* xVal, + const int* xInd, + cuComplex* y, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseScatter) +cusparseStatus_t CUSPARSEAPI +cusparseZsctr(cusparseHandle_t handle, + int nnz, + const cuDoubleComplex* xVal, + const int* xInd, + cuDoubleComplex* y, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseRot) +cusparseStatus_t CUSPARSEAPI +cusparseSroti(cusparseHandle_t handle, + int nnz, + float* xVal, + const int* xInd, + float* y, + const float* c, + const float* s, + cusparseIndexBase_t idxBase); + +CUSPARSE_DEPRECATED(cusparseRot) +cusparseStatus_t CUSPARSEAPI +cusparseDroti(cusparseHandle_t handle, + int nnz, + double* xVal, + const int* xInd, + double* y, + const double* c, + const double* s, + cusparseIndexBase_t idxBase); + +//############################################################################## +//# SPARSE LEVEL 2 ROUTINES +//############################################################################## + +cusparseStatus_t CUSPARSEAPI +cusparseSgemvi(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const float* alpha, + const float* A, + int lda, + int nnz, + const float* xVal, + const int* xInd, + const float* beta, + float* y, + 
cusparseIndexBase_t idxBase, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSgemvi_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + int* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseDgemvi(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const double* alpha, + const double* A, + int lda, + int nnz, + const double* xVal, + const int* xInd, + const double* beta, + double* y, + cusparseIndexBase_t idxBase, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDgemvi_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + int* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseCgemvi(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const cuComplex* alpha, + const cuComplex* A, + int lda, + int nnz, + const cuComplex* xVal, + const int* xInd, + const cuComplex* beta, + cuComplex* y, + cusparseIndexBase_t idxBase, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCgemvi_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + int* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseZgemvi(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const cuDoubleComplex* alpha, + const cuDoubleComplex* A, + int lda, + int nnz, + const cuDoubleComplex* xVal, + const int* xInd, + const cuDoubleComplex* beta, + cuDoubleComplex* y, + cusparseIndexBase_t idxBase, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZgemvi_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + int* pBufferSize); + +CUSPARSE_DEPRECATED(cusparseSpMV) +cusparseStatus_t CUSPARSEAPI +cusparseCsrmvEx_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const void* alpha, + cudaDataType alphatype, + const cusparseMatDescr_t descrA, + const void* 
csrValA, + cudaDataType csrValAtype, + const int* csrRowPtrA, + const int* csrColIndA, + const void* x, + cudaDataType xtype, + const void* beta, + cudaDataType betatype, + void* y, + cudaDataType ytype, + cudaDataType executiontype, + size_t* bufferSizeInBytes); + +CUSPARSE_DEPRECATED(cusparseSpMV) +cusparseStatus_t CUSPARSEAPI +cusparseCsrmvEx(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const void* alpha, + cudaDataType alphatype, + const cusparseMatDescr_t descrA, + const void* csrValA, + cudaDataType csrValAtype, + const int* csrRowPtrA, + const int* csrColIndA, + const void* x, + cudaDataType xtype, + const void* beta, + cudaDataType betatype, + void* y, + cudaDataType ytype, + cudaDataType executiontype, + void* buffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nb, + int nnzb, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const float* x, + const float* beta, + float* y); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nb, + int nnzb, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const double* x, + const double* beta, + double* y); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nb, + int nnzb, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cuComplex* x, + const cuComplex* beta, + cuComplex* y); + +cusparseStatus_t 
CUSPARSEAPI +cusparseZbsrmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nb, + int nnzb, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cuDoubleComplex* x, + const cuDoubleComplex* beta, + cuDoubleComplex* y); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrxmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int sizeOfMask, + int mb, + int nb, + int nnzb, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedMaskPtrA, + const int* bsrSortedRowPtrA, + const int* bsrSortedEndPtrA, + const int* bsrSortedColIndA, + int blockDim, + const float* x, + const float* beta, + float* y); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrxmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int sizeOfMask, + int mb, + int nb, + int nnzb, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedMaskPtrA, + const int* bsrSortedRowPtrA, + const int* bsrSortedEndPtrA, + const int* bsrSortedColIndA, + int blockDim, + const double* x, + const double* beta, + double* y); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrxmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int sizeOfMask, + int mb, + int nb, + int nnzb, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedMaskPtrA, + const int* bsrSortedRowPtrA, + const int* bsrSortedEndPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cuComplex* x, + const cuComplex* beta, + cuComplex* y); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrxmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int sizeOfMask, + int mb, + int nb, + int 
nnzb, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedMaskPtrA, + const int* bsrSortedRowPtrA, + const int* bsrSortedEndPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cuDoubleComplex* x, + const cuDoubleComplex* beta, + cuDoubleComplex* y); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseXcsrsv2_zeroPivot(cusparseHandle_t handle, + csrsv2Info_t info, + int* position); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseScsrsv2_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + int* pBufferSizeInBytes); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsv2_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + int* pBufferSizeInBytes); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsv2_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + int* pBufferSizeInBytes); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsv2_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + int* pBufferSizeInBytes); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseScsrsv2_bufferSizeExt(cusparseHandle_t handle, + 
cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + size_t* pBufferSize); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + size_t* pBufferSize); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + size_t* pBufferSize); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + size_t* pBufferSize); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseScsrsv2_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsv2_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + 
+CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsv2_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsv2_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseScsrsv2_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + const float* f, + float* x, + cusparseSolvePolicy_t policy, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsv2_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + const double* f, + double* x, + cusparseSolvePolicy_t policy, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsv2_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + const cuComplex* f, + cuComplex* x, + cusparseSolvePolicy_t policy, + void* 
pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpSV) +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsv2_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + const cuDoubleComplex* f, + cuDoubleComplex* x, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle, + bsrsv2Info_t info, + int* position); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsv2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsv2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsv2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsv2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + int* 
pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockSize, + bsrsv2Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockSize, + bsrsv2Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockSize, + bsrsv2Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockSize, + bsrsv2Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsv2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsv2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const 
cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsv2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsv2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsv2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + const float* f, + float* x, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsv2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + const double* f, + double* x, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsv2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const 
cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + const cuComplex* f, + cuComplex* x, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsv2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + const cuDoubleComplex* f, + cuDoubleComplex* x, + cusparseSolvePolicy_t policy, + void* pBuffer); + +//############################################################################## +//# SPARSE LEVEL 3 ROUTINES +//############################################################################## + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrmm(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int kb, + int nnzb, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + const int blockSize, + const float* B, + const int ldb, + const float* beta, + float* C, + int ldc); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrmm(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int kb, + int nnzb, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + const int blockSize, + const double* B, + const int ldb, + const double* beta, + double* C, + int ldc); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrmm(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + 
cusparseOperation_t transB, + int mb, + int n, + int kb, + int nnzb, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + const int blockSize, + const cuComplex* B, + const int ldb, + const cuComplex* beta, + cuComplex* C, + int ldc); + +cusparseStatus_t CUSPARSEAPI + cusparseZbsrmm(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int kb, + int nnzb, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + const int blockSize, + const cuDoubleComplex* B, + const int ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* C, + int ldc); + +CUSPARSE_DEPRECATED(cusparseSpMM) +cusparseStatus_t CUSPARSEAPI +cusparseSgemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const float* alpha, + const float* A, + int lda, + const float* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const float* beta, + float* C, + int ldc); + +CUSPARSE_DEPRECATED(cusparseSpMM) +cusparseStatus_t CUSPARSEAPI +cusparseDgemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const double* alpha, + const double* A, + int lda, + const double* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const double* beta, + double* C, + int ldc); + +CUSPARSE_DEPRECATED(cusparseSpMM) +cusparseStatus_t CUSPARSEAPI +cusparseCgemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const cuComplex* alpha, + const cuComplex* A, + int lda, + const cuComplex* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const cuComplex* beta, + cuComplex* C, + int ldc); + +CUSPARSE_DEPRECATED(cusparseSpMM) +cusparseStatus_t CUSPARSEAPI +cusparseZgemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const cuDoubleComplex* alpha, + 
const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const cuDoubleComplex* beta, + cuDoubleComplex* C, + int ldc); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsrsm2Info(csrsm2Info_t* info); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseDestroyCsrsm2Info(csrsm2Info_t info); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseXcsrsm2_zeroPivot(cusparseHandle_t handle, + csrsm2Info_t info, + int* position); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseScsrsm2_bufferSizeExt(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + size_t* pBufferSize); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsm2_bufferSizeExt(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + size_t* pBufferSize); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsm2_bufferSizeExt(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuComplex* B, + int ldb, + csrsm2Info_t info, + 
cusparseSolvePolicy_t policy, + size_t* pBufferSize); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsm2_bufferSizeExt(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuDoubleComplex* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + size_t* pBufferSize); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseScsrsm2_analysis(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsm2_analysis(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsm2_analysis(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuComplex* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + 
+CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsm2_analysis(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuDoubleComplex* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseScsrsm2_solve(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsm2_solve(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + double* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsm2_solve(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + cuComplex* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpSM) +cusparseStatus_t CUSPARSEAPI 
+cusparseZcsrsm2_solve(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + cuDoubleComplex* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle, + bsrsm2Info_t info, + int* position); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsm2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsm2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsm2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsm2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + 
const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsm2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsm2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsm2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsm2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsm2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + const float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* 
bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsm2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + const double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsm2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsm2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsm2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + const float* B, + int ldb, + float* X, + int ldx, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsm2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + 
cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + const double* B, + int ldb, + double* X, + int ldx, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsm2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + const cuComplex* B, + int ldb, + cuComplex* X, + int ldx, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsm2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + const cuDoubleComplex* B, + int ldb, + cuDoubleComplex* X, + int ldx, + cusparseSolvePolicy_t policy, + void* pBuffer); + +//############################################################################## +//# PRECONDITIONERS +//############################################################################## + +cusparseStatus_t CUSPARSEAPI +cusparseScsrilu02_numericBoost(cusparseHandle_t handle, + csrilu02Info_t info, + int enable_boost, + double* tol, + float* boost_val); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrilu02_numericBoost(cusparseHandle_t handle, + csrilu02Info_t info, + int enable_boost, + double* tol, + double* boost_val); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrilu02_numericBoost(cusparseHandle_t handle, + 
csrilu02Info_t info, + int enable_boost, + double* tol, + cuComplex* boost_val); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrilu02_numericBoost(cusparseHandle_t handle, + csrilu02Info_t info, + int enable_boost, + double* tol, + cuDoubleComplex* boost_val); + +cusparseStatus_t CUSPARSEAPI +cusparseXcsrilu02_zeroPivot(cusparseHandle_t handle, + csrilu02Info_t info, + int* position); + +cusparseStatus_t CUSPARSEAPI +cusparseScsrilu02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrilu02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrilu02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrilu02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseScsrilu02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csrilu02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrilu02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedVal, + const int* csrSortedRowPtr, + const 
int* csrSortedColInd, + csrilu02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrilu02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csrilu02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrilu02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csrilu02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseScsrilu02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrilu02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrilu02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrilu02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseScsrilu02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, 
+ float* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrilu02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrilu02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrilu02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrilu02_numericBoost(cusparseHandle_t handle, + bsrilu02Info_t info, + int enable_boost, + double* tol, + float* boost_val); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrilu02_numericBoost(cusparseHandle_t handle, + bsrilu02Info_t info, + int enable_boost, + double* tol, + double* boost_val); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrilu02_numericBoost(cusparseHandle_t handle, + bsrilu02Info_t info, + int enable_boost, + double* tol, + cuComplex* boost_val); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrilu02_numericBoost(cusparseHandle_t handle, + bsrilu02Info_t info, + int enable_boost, + double* tol, + cuDoubleComplex* boost_val); + +cusparseStatus_t CUSPARSEAPI +cusparseXbsrilu02_zeroPivot(cusparseHandle_t handle, + bsrilu02Info_t info, + int* position); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrilu02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int 
mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrilu02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrilu02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrilu02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrilu02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrilu02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrilu02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrilu02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrilu02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t 
descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrilu02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrilu02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrilu02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrilu02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrilu02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrilu02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrilu02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrilu02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, 
+ const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrilu02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrilu02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrilu02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseXcsric02_zeroPivot(cusparseHandle_t handle, + csric02Info_t info, + int* position); + +cusparseStatus_t CUSPARSEAPI +cusparseScsric02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsric02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsric02_bufferSize(cusparseHandle_t handle, + int m, + 
int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsric02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseScsric02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csric02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsric02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csric02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsric02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csric02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsric02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csric02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseScsric02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsric02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const 
cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsric02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsric02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseScsric02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsric02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsric02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsric02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); 
+ +cusparseStatus_t CUSPARSEAPI +cusparseXbsric02_zeroPivot(cusparseHandle_t handle, + bsric02Info_t info, + int* position); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsric02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsric02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsric02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsric02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsric02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsric02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsric02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + 
int blockSize, + bsric02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsric02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsric02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsric02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsric02Info_t info, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsric02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pInputBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsric02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pInputBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsric02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pInputBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsric02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedVal, + const int* 
bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pInputBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsric02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsric02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsric02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* + bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsric02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsv2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const float* dl, + const float* d, + const float* du, + const float* B, + int ldb, + size_t* bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const double* dl, + const double* d, + const double* du, + const double* B, + int ldb, + size_t* bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI 
+cusparseCgtsv2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + const cuComplex* B, + int ldb, + size_t* bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsv2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + const cuDoubleComplex* B, + int ldb, + size_t* bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsv2(cusparseHandle_t handle, + int m, + int n, + const float* dl, + const float* d, + const float* du, + float* B, + int ldb, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2(cusparseHandle_t handle, + int m, + int n, + const double* dl, + const double* d, + const double* du, + double* B, + int ldb, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsv2(cusparseHandle_t handle, + int m, + int n, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + cuComplex* B, + int ldb, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsv2(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + cuDoubleComplex* B, + int ldb, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const float* dl, + const float* d, + const float* du, + const float* B, + int ldb, + size_t* bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const double* dl, + const double* d, + const double* du, + const double* B, + int ldb, + size_t* bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + const cuComplex* B, + int ldb, + size_t* bufferSizeInBytes); + +cusparseStatus_t 
CUSPARSEAPI +cusparseZgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + const cuDoubleComplex* B, + int ldb, + size_t* bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsv2_nopivot(cusparseHandle_t handle, + int m, + int n, + const float* dl, + const float* d, + const float* du, + float* B, + int ldb, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2_nopivot(cusparseHandle_t handle, + int m, + int n, + const double* dl, + const double* d, + const double* du, + double* B, + int ldb, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsv2_nopivot(cusparseHandle_t handle, + int m, + int n, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + cuComplex* B, + int ldb, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsv2_nopivot(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + cuDoubleComplex* B, + int ldb, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle, + int m, + const float* dl, + const float* d, + const float* du, + const float* x, + int batchCount, + int batchStride, + size_t* bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle, + int m, + const double* dl, + const double* d, + const double* du, + const double* x, + int batchCount, + int batchStride, + size_t* bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle, + int m, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + const cuComplex* x, + int batchCount, + int batchStride, + size_t* bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle, + int m, + const cuDoubleComplex* dl, + 
const cuDoubleComplex* d, + const cuDoubleComplex* du, + const cuDoubleComplex* x, + int batchCount, + int batchStride, + size_t* bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsv2StridedBatch(cusparseHandle_t handle, + int m, + const float* dl, + const float* d, + const float* du, + float* x, + int batchCount, + int batchStride, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2StridedBatch(cusparseHandle_t handle, + int m, + const double* dl, + const double* d, + const double* du, + double* x, + int batchCount, + int batchStride, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsv2StridedBatch(cusparseHandle_t handle, + int m, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + cuComplex* x, + int batchCount, + int batchStride, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsv2StridedBatch(cusparseHandle_t handle, + int m, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + cuDoubleComplex* x, + int batchCount, + int batchStride, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const float* dl, + const float* d, + const float* du, + const float* x, + int batchCount, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const double* dl, + const double* d, + const double* du, + const double* x, + int batchCount, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + const cuComplex* x, + int batchCount, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const cuDoubleComplex* dl, + const 
cuDoubleComplex* d, + const cuDoubleComplex* du, + const cuDoubleComplex* x, + int batchCount, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + float* dl, + float* d, + float* du, + float* x, + int batchCount, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + double* dl, + double* d, + double* du, + double* x, + int batchCount, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + cuComplex* dl, + cuComplex* d, + cuComplex* du, + cuComplex* x, + int batchCount, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + cuDoubleComplex* dl, + cuDoubleComplex* d, + cuDoubleComplex* du, + cuDoubleComplex* x, + int batchCount, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const float* ds, + const float* dl, + const float* d, + const float* du, + const float* dw, + const float* x, + int batchCount, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const double* ds, + const double* dl, + const double* d, + const double* du, + const double* dw, + const double* x, + int batchCount, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const cuComplex* ds, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + const cuComplex* dw, + const cuComplex* x, + int batchCount, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const cuDoubleComplex* ds, 
+ const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + const cuDoubleComplex* dw, + const cuDoubleComplex* x, + int batchCount, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseSgpsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + float* ds, + float* dl, + float* d, + float* du, + float* dw, + float* x, + int batchCount, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDgpsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + double* ds, + double* dl, + double* d, + double* du, + double* dw, + double* x, + int batchCount, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCgpsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + cuComplex* ds, + cuComplex* dl, + cuComplex* d, + cuComplex* du, + cuComplex* dw, + cuComplex* x, + int batchCount, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZgpsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + cuDoubleComplex* ds, + cuDoubleComplex* dl, + cuDoubleComplex* d, + cuDoubleComplex* du, + cuDoubleComplex* dw, + cuDoubleComplex* x, + int batchCount, + void* pBuffer); + +//############################################################################## +//# EXTRA ROUTINES +//############################################################################## + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsrgemm2Info(csrgemm2Info_t* info); + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI +cusparseDestroyCsrgemm2Info(csrgemm2Info_t info); + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI +cusparseScsrgemm2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int k, + const float* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const 
float* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes); + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI +cusparseDcsrgemm2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int k, + const double* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const double* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes); + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI +cusparseCcsrgemm2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int k, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cuComplex* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes); + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI +cusparseZcsrgemm2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int k, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cuDoubleComplex* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes); + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t 
CUSPARSEAPI +cusparseXcsrgemm2Nnz(cusparseHandle_t handle, + int m, + int n, + int k, + const cusparseMatDescr_t descrA, + int nnzA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrD, + int nnzD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + const csrgemm2Info_t info, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI +cusparseScsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const float* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const float* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const float* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const float* csrSortedValD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + const csrgemm2Info_t info, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI +cusparseDcsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const double* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const double* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const double* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const double* csrSortedValD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + const int* 
csrSortedRowPtrC, + int* csrSortedColIndC, + const csrgemm2Info_t info, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI +cusparseCcsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const cuComplex* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cuComplex* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const cuComplex* csrSortedValD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + const cusparseMatDescr_t descrC, + cuComplex* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + const csrgemm2Info_t info, + void* pBuffer); + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI +cusparseZcsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const cuDoubleComplex* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cuDoubleComplex* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const cuDoubleComplex* csrSortedValD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + const cusparseMatDescr_t descrC, + cuDoubleComplex* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + const csrgemm2Info_t info, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseScsrgeam2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const float* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* beta, + const 
cusparseMatDescr_t descrB, + int nnzB, + const float* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + const float* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrgeam2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const double* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const double* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + const double* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrgeam2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuComplex* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const cuComplex* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + const cuComplex* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrgeam2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuDoubleComplex* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const cuDoubleComplex* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + const 
cuDoubleComplex* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseXcsrgeam2Nnz(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + int nnzA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + void* workspace); + +cusparseStatus_t CUSPARSEAPI +cusparseScsrgeam2(cusparseHandle_t handle, + int m, + int n, + const float* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const float* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrgeam2(cusparseHandle_t handle, + int m, + int n, + const double* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const double* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrgeam2(cusparseHandle_t handle, + int m, + int n, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuComplex* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const cuComplex* 
csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + cuComplex* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrgeam2(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuDoubleComplex* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const cuDoubleComplex* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + cuDoubleComplex* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer); + +//############################################################################## +//# SPARSE MATRIX REORDERING +//############################################################################## + +cusparseStatus_t CUSPARSEAPI +cusparseScsrcolor(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* fractionToColor, + int* ncolors, + int* coloring, + int* reordering, + const cusparseColorInfo_t info); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrcolor(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* fractionToColor, + int* ncolors, + int* coloring, + int* reordering, + const cusparseColorInfo_t info); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrcolor(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* fractionToColor, + int* ncolors, + int* coloring, + int* reordering, + 
const cusparseColorInfo_t info); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrcolor(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* fractionToColor, + int* ncolors, + int* coloring, + int* reordering, + const cusparseColorInfo_t info); + +//############################################################################## +//# SPARSE FORMAT CONVERSION +//############################################################################## + +cusparseStatus_t CUSPARSEAPI +cusparseSnnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* A, + int lda, + int* nnzPerRowCol, + int* nnzTotalDevHostPtr); + +cusparseStatus_t CUSPARSEAPI +cusparseDnnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* A, + int lda, + int* nnzPerRowCol, + int* nnzTotalDevHostPtr); + +cusparseStatus_t CUSPARSEAPI +cusparseCnnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* A, + int lda, + int* nnzPerRowCol, + int* nnzTotalDevHostPtr); + +cusparseStatus_t CUSPARSEAPI +cusparseZnnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* A, + int lda, + int* nnzPerRowCol, + int* nnzTotalDevHostPtr); + +//############################################################################## +//# SPARSE FORMAT CONVERSION +//############################################################################## + +cusparseStatus_t CUSPARSEAPI +cusparseSnnz_compress(cusparseHandle_t handle, + int m, + const cusparseMatDescr_t descr, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + int* nnzPerRow, + int* nnzC, + float tol); + +cusparseStatus_t CUSPARSEAPI 
+cusparseDnnz_compress(cusparseHandle_t handle, + int m, + const cusparseMatDescr_t descr, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + int* nnzPerRow, + int* nnzC, + double tol); + +cusparseStatus_t CUSPARSEAPI +cusparseCnnz_compress(cusparseHandle_t handle, + int m, + const cusparseMatDescr_t descr, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + int* nnzPerRow, + int* nnzC, + cuComplex tol); + +cusparseStatus_t CUSPARSEAPI +cusparseZnnz_compress(cusparseHandle_t handle, + int m, + const cusparseMatDescr_t descr, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + int* nnzPerRow, + int* nnzC, + cuDoubleComplex tol); + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2csr_compress(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedColIndA, + const int* csrSortedRowPtrA, + int nnzA, + const int* nnzPerRow, + float* csrSortedValC, + int* csrSortedColIndC, + int* csrSortedRowPtrC, + float tol); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2csr_compress(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedColIndA, + const int* csrSortedRowPtrA, + int nnzA, + const int* nnzPerRow, + double* csrSortedValC, + int* csrSortedColIndC, + int* csrSortedRowPtrC, + double tol); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2csr_compress(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedColIndA, + const int* csrSortedRowPtrA, + int nnzA, + const int* nnzPerRow, + cuComplex* csrSortedValC, + int* csrSortedColIndC, + int* csrSortedRowPtrC, + cuComplex tol); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2csr_compress(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedColIndA, + const int* csrSortedRowPtrA, + 
int nnzA, + const int* nnzPerRow, + cuDoubleComplex* csrSortedValC, + int* csrSortedColIndC, + int* csrSortedRowPtrC, + cuDoubleComplex tol); + +CUSPARSE_DEPRECATED(cusparseDenseToSparse) +cusparseStatus_t CUSPARSEAPI +cusparseSdense2csr(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* A, + int lda, + const int* nnzPerRow, + float* csrSortedValA, + int* csrSortedRowPtrA, + int* csrSortedColIndA); + +CUSPARSE_DEPRECATED(cusparseDenseToSparse) +cusparseStatus_t CUSPARSEAPI +cusparseDdense2csr(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* A, + int lda, + const int* nnzPerRow, + double* csrSortedValA, + int* csrSortedRowPtrA, + int* csrSortedColIndA); + +CUSPARSE_DEPRECATED(cusparseDenseToSparse) +cusparseStatus_t CUSPARSEAPI +cusparseCdense2csr(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* A, + int lda, + const int* nnzPerRow, + cuComplex* csrSortedValA, + int* csrSortedRowPtrA, + int* csrSortedColIndA); + +CUSPARSE_DEPRECATED(cusparseDenseToSparse) +cusparseStatus_t CUSPARSEAPI +cusparseZdense2csr(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* A, + int lda, + const int* nnzPerRow, + cuDoubleComplex* csrSortedValA, + int* csrSortedRowPtrA, + int* csrSortedColIndA); + +CUSPARSE_DEPRECATED(cusparseSparseToDense) +cusparseStatus_t CUSPARSEAPI +cusparseScsr2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float* A, + int lda); + +CUSPARSE_DEPRECATED(cusparseSparseToDense) +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + double* A, + int lda); + 
+CUSPARSE_DEPRECATED(cusparseSparseToDense) +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + cuComplex* A, + int lda); + +CUSPARSE_DEPRECATED(cusparseSparseToDense) +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + cuDoubleComplex* A, + int lda); + +CUSPARSE_DEPRECATED(cusparseDenseToSparse) +cusparseStatus_t CUSPARSEAPI +cusparseSdense2csc(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* A, + int lda, + const int* nnzPerCol, + float* cscSortedValA, + int* cscSortedRowIndA, + int* cscSortedColPtrA); + +CUSPARSE_DEPRECATED(cusparseDenseToSparse) +cusparseStatus_t CUSPARSEAPI +cusparseDdense2csc(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* A, + int lda, + const int* nnzPerCol, + double* cscSortedValA, + int* cscSortedRowIndA, + int* cscSortedColPtrA); + +CUSPARSE_DEPRECATED(cusparseDenseToSparse) +cusparseStatus_t CUSPARSEAPI +cusparseCdense2csc(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* A, + int lda, + const int* nnzPerCol, + cuComplex* cscSortedValA, + int* cscSortedRowIndA, + int* cscSortedColPtrA); + +CUSPARSE_DEPRECATED(cusparseDenseToSparse) +cusparseStatus_t CUSPARSEAPI +cusparseZdense2csc(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* A, + int lda, + const int* nnzPerCol, + cuDoubleComplex* cscSortedValA, + int* cscSortedRowIndA, + int* cscSortedColPtrA); + +CUSPARSE_DEPRECATED(cusparseSparseToDense) +cusparseStatus_t CUSPARSEAPI +cusparseScsc2dense(cusparseHandle_t handle, + int m, + int n, + const 
cusparseMatDescr_t descrA, + const float* cscSortedValA, + const int* cscSortedRowIndA, + const int* cscSortedColPtrA, + float* A, + int lda); + +CUSPARSE_DEPRECATED(cusparseSparseToDense) +cusparseStatus_t CUSPARSEAPI +cusparseDcsc2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* cscSortedValA, + const int* cscSortedRowIndA, + const int* cscSortedColPtrA, + double* A, + int lda); + +CUSPARSE_DEPRECATED(cusparseSparseToDense) +cusparseStatus_t CUSPARSEAPI +cusparseCcsc2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* cscSortedValA, + const int* cscSortedRowIndA, + const int* cscSortedColPtrA, + cuComplex* A, + int lda); + +CUSPARSE_DEPRECATED(cusparseSparseToDense) +cusparseStatus_t CUSPARSEAPI +cusparseZcsc2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* cscSortedValA, + const int* cscSortedRowIndA, + const int* cscSortedColPtrA, + cuDoubleComplex* A, + int lda); + +cusparseStatus_t CUSPARSEAPI +cusparseXcoo2csr(cusparseHandle_t handle, + const int* cooRowInd, + int nnz, + int m, + int* csrSortedRowPtr, + cusparseIndexBase_t idxBase); + +cusparseStatus_t CUSPARSEAPI +cusparseXcsr2coo(cusparseHandle_t handle, + const int* csrSortedRowPtr, + int nnz, + int m, + int* cooRowInd, + cusparseIndexBase_t idxBase); + +cusparseStatus_t CUSPARSEAPI +cusparseXcsr2bsrNnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + int* bsrSortedRowPtrC, + int* nnzTotalDevHostPtr); + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2bsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int blockDim, + const 
cusparseMatDescr_t descrC, + float* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2bsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + double* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2bsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + cuComplex* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2bsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + cuDoubleComplex* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseSbsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseDbsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + int* csrSortedRowPtrC, + int* 
csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseCbsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + cuComplex* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseZbsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + cuDoubleComplex* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2gebsc_bufferSize(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDgebsr2gebsc_bufferSize(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2gebsc_bufferSize(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2gebsc_bufferSize(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI 
+cusparseSgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseDgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2gebsc(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + float* bscVal, + int* bscRowInd, + int* bscColPtr, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDgebsr2gebsc(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + double* bscVal, + int* bscRowInd, + int* bscColPtr, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2gebsc(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuComplex* bsrSortedVal, + const int* 
bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + cuComplex* bscVal, + int* bscRowInd, + int* bscColPtr, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2gebsc(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + cuDoubleComplex* bscVal, + int* bscRowInd, + int* bscColPtr, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseXgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseDgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* 
bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + cuComplex* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + cuDoubleComplex* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2gebsr_bufferSizeExt(cusparseHandle_t 
handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseXcsr2gebsrNnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrC, + int* bsrSortedRowPtrC, + int rowBlockDim, + int colBlockDim, + int* nnzTotalDevHostPtr, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrC, + float* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDim, + int colBlockDim, 
+ void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrC, + double* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDim, + int colBlockDim, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrC, + cuComplex* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDim, + int colBlockDim, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrC, + cuDoubleComplex* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDim, + int colBlockDim, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDgebsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + 
int rowBlockDimC, + int colBlockDimC, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + int* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseDgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + size_t* 
pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + size_t* pBufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseXgebsr2gebsrNnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + const cusparseMatDescr_t descrC, + int* bsrSortedRowPtrC, + int rowBlockDimC, + int colBlockDimC, + int* nnzTotalDevHostPtr, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + const cusparseMatDescr_t descrC, + float* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDimC, + int colBlockDimC, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDgebsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + const cusparseMatDescr_t descrC, + double* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDimC, + int colBlockDimC, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const 
int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + const cusparseMatDescr_t descrC, + cuComplex* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDimC, + int colBlockDimC, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + const cusparseMatDescr_t descrC, + cuDoubleComplex* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDimC, + int colBlockDimC, + void* pBuffer); + +//############################################################################## +//# SPARSE MATRIX SORTING +//############################################################################## + +cusparseStatus_t CUSPARSEAPI +cusparseCreateIdentityPermutation(cusparseHandle_t handle, + int n, + int* p); + +cusparseStatus_t CUSPARSEAPI +cusparseXcoosort_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + const int* cooRowsA, + const int* cooColsA, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseXcoosortByRow(cusparseHandle_t handle, + int m, + int n, + int nnz, + int* cooRowsA, + int* cooColsA, + int* P, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseXcoosortByColumn(cusparseHandle_t handle, + int m, + int n, + int nnz, + int* cooRowsA, + int* cooColsA, + int* P, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseXcsrsort_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + const int* csrRowPtrA, + const int* csrColIndA, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseXcsrsort(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + const int* csrRowPtrA, + int* csrColIndA, + int* P, + 
void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseXcscsort_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + const int* cscColPtrA, + const int* cscRowIndA, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseXcscsort(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + const int* cscColPtrA, + int* cscRowIndA, + int* P, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseScsru2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + float* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsru2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + double* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsru2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + cuComplex* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsru2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + cuDoubleComplex* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseScsru2csr(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + float* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsru2csr(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + double* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsru2csr(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + 
cuComplex* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsru2csr(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2csru(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + float* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2csru(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + double* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2csru(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2csru(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI +cusparseHpruneDense2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const __half* A, + int lda, + const __half* threshold, + const cusparseMatDescr_t descrC, + const __half* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes); +#endif // defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneDense2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const float* A, + int lda, + const float* threshold, + const cusparseMatDescr_t descrC, + const float* csrSortedValC, + const int* csrSortedRowPtrC, + const int* 
csrSortedColIndC, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneDense2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const double* A, + int lda, + const double* threshold, + const cusparseMatDescr_t descrC, + const double* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI +cusparseHpruneDense2csrNnz(cusparseHandle_t handle, + int m, + int n, + const __half* A, + int lda, + const __half* threshold, + const cusparseMatDescr_t descrC, + int* csrRowPtrC, + int* nnzTotalDevHostPtr, + void* pBuffer); +#endif // defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneDense2csrNnz(cusparseHandle_t handle, + int m, + int n, + const float* A, + int lda, + const float* threshold, + const cusparseMatDescr_t descrC, + int* csrRowPtrC, + int* nnzTotalDevHostPtr, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneDense2csrNnz(cusparseHandle_t handle, + int m, + int n, + const double* A, + int lda, + const double* threshold, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + void* pBuffer); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI +cusparseHpruneDense2csr(cusparseHandle_t handle, + int m, + int n, + const __half* A, + int lda, + const __half* threshold, + const cusparseMatDescr_t descrC, + __half* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer); +#endif // defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneDense2csr(cusparseHandle_t handle, + int m, + int n, + const float* A, + int lda, + const float* threshold, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneDense2csr(cusparseHandle_t handle, + int m, + int n, + const double* A, + int lda, + const 
double* threshold, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI +cusparseHpruneCsr2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const __half* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const __half* threshold, + const cusparseMatDescr_t descrC, + const __half* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes); +#endif // defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneCsr2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* threshold, + const cusparseMatDescr_t descrC, + const float* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneCsr2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* threshold, + const cusparseMatDescr_t descrC, + const double* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI +cusparseHpruneCsr2csrNnz(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const __half* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const __half* threshold, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + void* pBuffer); +#endif // defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI 
+cusparseSpruneCsr2csrNnz(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* threshold, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI + cusparseDpruneCsr2csrNnz(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* threshold, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + void* pBuffer); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI +cusparseHpruneCsr2csr(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const __half* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const __half* threshold, + const cusparseMatDescr_t descrC, + __half* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer); +#endif // defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneCsr2csr(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* threshold, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneCsr2csr(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* threshold, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer); + +#if defined(__cplusplus) 
+cusparseStatus_t CUSPARSEAPI +cusparseHpruneDense2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const __half* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + const __half* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + pruneInfo_t info, + size_t* pBufferSizeInBytes); +#endif // defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneDense2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const float* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + const float* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + pruneInfo_t info, + size_t* pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneDense2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const double* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + const double* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + pruneInfo_t info, + size_t* pBufferSizeInBytes); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI +cusparseHpruneDense2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + const __half* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + int* csrRowPtrC, + int* nnzTotalDevHostPtr, + pruneInfo_t info, + void* pBuffer); +#endif // defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneDense2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + const float* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + int* csrRowPtrC, + int* nnzTotalDevHostPtr, + pruneInfo_t info, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneDense2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + const double* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + int* csrRowPtrC, + int* nnzTotalDevHostPtr, + pruneInfo_t 
info, + void* pBuffer); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI +cusparseHpruneDense2csrByPercentage(cusparseHandle_t handle, + int m, + int n, + const __half* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + __half* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + pruneInfo_t info, + void* pBuffer); +#endif // defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneDense2csrByPercentage(cusparseHandle_t handle, + int m, + int n, + const float* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + pruneInfo_t info, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneDense2csrByPercentage(cusparseHandle_t handle, + int m, + int n, + const double* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + pruneInfo_t info, + void* pBuffer); + +#if defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseHpruneCsr2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const __half* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + const __half* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + pruneInfo_t info, + size_t* pBufferSizeInBytes); + +#endif // defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneCsr2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + const float* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + pruneInfo_t info, + size_t* 
pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneCsr2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + const double* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + pruneInfo_t info, + size_t* pBufferSizeInBytes); + +#if defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseHpruneCsr2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const __half* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + pruneInfo_t info, + void* pBuffer); + +#endif // defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneCsr2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + pruneInfo_t info, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneCsr2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + pruneInfo_t info, + void* pBuffer); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI +cusparseHpruneCsr2csrByPercentage(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const __half* csrSortedValA, + const int* csrSortedRowPtrA, + 
const int* csrSortedColIndA, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + __half* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + pruneInfo_t info, + void* pBuffer); + +#endif // defined(__cplusplus) + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneCsr2csrByPercentage(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + pruneInfo_t info, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneCsr2csrByPercentage(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + pruneInfo_t info, + void* pBuffer); + +//############################################################################## +//# CSR2CSC +//############################################################################## + +typedef enum { + CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc + CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc +} cusparseCsr2CscAlg_t; + +cusparseStatus_t CUSPARSEAPI +cusparseCsr2cscEx2(cusparseHandle_t handle, + int m, + int n, + int nnz, + const void* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cudaDataType valType, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer); + +cusparseStatus_t CUSPARSEAPI +cusparseCsr2cscEx2_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const void* csrVal, + const int* csrRowPtr, + 
const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cudaDataType valType, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize); + +// ############################################################################# +// # GENERIC APIs - Enumerators and Opaque Data Structures +// ############################################################################# + +typedef enum { + CUSPARSE_FORMAT_CSR = 1, ///< Compressed Sparse Row (CSR) + CUSPARSE_FORMAT_CSC = 2, ///< Compressed Sparse Column (CSC) + CUSPARSE_FORMAT_COO = 3, ///< Coordinate (COO) - Structure of Arrays + CUSPARSE_FORMAT_COO_AOS = 4, ///< Coordinate (COO) - Array of Structures + CUSPARSE_FORMAT_BLOCKED_ELL = 5, ///< Blocked ELL +} cusparseFormat_t; + +typedef enum { + CUSPARSE_ORDER_COL = 1, ///< Column-Major Order - Matrix memory layout + CUSPARSE_ORDER_ROW = 2 ///< Row-Major Order - Matrix memory layout +} cusparseOrder_t; + +typedef enum { + CUSPARSE_INDEX_16U = 1, ///< 16-bit unsigned integer for matrix/vector + ///< indices + CUSPARSE_INDEX_32I = 2, ///< 32-bit signed integer for matrix/vector indices + CUSPARSE_INDEX_64I = 3 ///< 64-bit signed integer for matrix/vector indices +} cusparseIndexType_t; + +//------------------------------------------------------------------------------ + +struct cusparseSpVecDescr; +struct cusparseDnVecDescr; +struct cusparseSpMatDescr; +struct cusparseDnMatDescr; +typedef struct cusparseSpVecDescr* cusparseSpVecDescr_t; +typedef struct cusparseDnVecDescr* cusparseDnVecDescr_t; +typedef struct cusparseSpMatDescr* cusparseSpMatDescr_t; +typedef struct cusparseDnMatDescr* cusparseDnMatDescr_t; + +// ############################################################################# +// # SPARSE VECTOR DESCRIPTOR +// ############################################################################# + +cusparseStatus_t CUSPARSEAPI +cusparseCreateSpVec(cusparseSpVecDescr_t* spVecDescr, + int64_t size, + 
int64_t nnz, + void* indices, + void* values, + cusparseIndexType_t idxType, + cusparseIndexBase_t idxBase, + cudaDataType valueType); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroySpVec(cusparseSpVecDescr_t spVecDescr); + +cusparseStatus_t CUSPARSEAPI +cusparseSpVecGet(cusparseSpVecDescr_t spVecDescr, + int64_t* size, + int64_t* nnz, + void** indices, + void** values, + cusparseIndexType_t* idxType, + cusparseIndexBase_t* idxBase, + cudaDataType* valueType); + +cusparseStatus_t CUSPARSEAPI +cusparseSpVecGetIndexBase(cusparseSpVecDescr_t spVecDescr, + cusparseIndexBase_t* idxBase); + +cusparseStatus_t CUSPARSEAPI +cusparseSpVecGetValues(cusparseSpVecDescr_t spVecDescr, + void** values); + +cusparseStatus_t CUSPARSEAPI +cusparseSpVecSetValues(cusparseSpVecDescr_t spVecDescr, + void* values); + +// ############################################################################# +// # DENSE VECTOR DESCRIPTOR +// ############################################################################# + +cusparseStatus_t CUSPARSEAPI +cusparseCreateDnVec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, + void* values, + cudaDataType valueType); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyDnVec(cusparseDnVecDescr_t dnVecDescr); + +cusparseStatus_t CUSPARSEAPI +cusparseDnVecGet(cusparseDnVecDescr_t dnVecDescr, + int64_t* size, + void** values, + cudaDataType* valueType); + +cusparseStatus_t CUSPARSEAPI +cusparseDnVecGetValues(cusparseDnVecDescr_t dnVecDescr, + void** values); + +cusparseStatus_t CUSPARSEAPI +cusparseDnVecSetValues(cusparseDnVecDescr_t dnVecDescr, + void* values); + +// ############################################################################# +// # SPARSE MATRIX DESCRIPTOR +// ############################################################################# + +cusparseStatus_t CUSPARSEAPI +cusparseDestroySpMat(cusparseSpMatDescr_t spMatDescr); + + cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetFormat(cusparseSpMatDescr_t spMatDescr, + cusparseFormat_t* format); + 
+cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetIndexBase(cusparseSpMatDescr_t spMatDescr, + cusparseIndexBase_t* idxBase); + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetValues(cusparseSpMatDescr_t spMatDescr, + void** values); + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatSetValues(cusparseSpMatDescr_t spMatDescr, + void* values); + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetSize(cusparseSpMatDescr_t spMatDescr, + int64_t* rows, + int64_t* cols, + int64_t* nnz); + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatSetStridedBatch(cusparseSpMatDescr_t spMatDescr, + int batchCount); + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetStridedBatch(cusparseSpMatDescr_t spMatDescr, + int* batchCount); + +cusparseStatus_t CUSPARSEAPI +cusparseCooSetStridedBatch(cusparseSpMatDescr_t spMatDescr, + int batchCount, + int64_t batchStride); + +cusparseStatus_t CUSPARSEAPI +cusparseCsrSetStridedBatch(cusparseSpMatDescr_t spMatDescr, + int batchCount, + int64_t offsetsBatchStride, + int64_t columnsValuesBatchStride); + +typedef enum { + CUSPARSE_SPMAT_FILL_MODE, + CUSPARSE_SPMAT_DIAG_TYPE +} cusparseSpMatAttribute_t; + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetAttribute(cusparseSpMatDescr_t spMatDescr, + cusparseSpMatAttribute_t attribute, + void* data, + size_t dataSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatSetAttribute(cusparseSpMatDescr_t spMatDescr, + cusparseSpMatAttribute_t attribute, + void* data, + size_t dataSize); + +//------------------------------------------------------------------------------ +// ### CSR ### + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, + int64_t cols, + int64_t nnz, + void* csrRowOffsets, + void* csrColInd, + void* csrValues, + cusparseIndexType_t csrRowOffsetsType, + cusparseIndexType_t csrColIndType, + cusparseIndexBase_t idxBase, + cudaDataType valueType); + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsc(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, + int64_t cols, + 
int64_t nnz, + void* cscColOffsets, + void* cscRowInd, + void* cscValues, + cusparseIndexType_t cscColOffsetsType, + cusparseIndexType_t cscRowIndType, + cusparseIndexBase_t idxBase, + cudaDataType valueType); + +cusparseStatus_t CUSPARSEAPI +cusparseCsrGet(cusparseSpMatDescr_t spMatDescr, + int64_t* rows, + int64_t* cols, + int64_t* nnz, + void** csrRowOffsets, + void** csrColInd, + void** csrValues, + cusparseIndexType_t* csrRowOffsetsType, + cusparseIndexType_t* csrColIndType, + cusparseIndexBase_t* idxBase, + cudaDataType* valueType); + +cusparseStatus_t CUSPARSEAPI +cusparseCscGet(cusparseSpMatDescr_t spMatDescr, + int64_t* rows, + int64_t* cols, + int64_t* nnz, + void** cscColOffsets, + void** cscRowInd, + void** cscValues, + cusparseIndexType_t* cscColOffsetsType, + cusparseIndexType_t* cscRowIndType, + cusparseIndexBase_t* idxBase, + cudaDataType* valueType); + +cusparseStatus_t CUSPARSEAPI +cusparseCsrSetPointers(cusparseSpMatDescr_t spMatDescr, + void* csrRowOffsets, + void* csrColInd, + void* csrValues); + +cusparseStatus_t CUSPARSEAPI +cusparseCscSetPointers(cusparseSpMatDescr_t spMatDescr, + void* cscColOffsets, + void* cscRowInd, + void* cscValues); + +//------------------------------------------------------------------------------ +// ### COO ### + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCoo(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, + int64_t cols, + int64_t nnz, + void* cooRowInd, + void* cooColInd, + void* cooValues, + cusparseIndexType_t cooIdxType, + cusparseIndexBase_t idxBase, + cudaDataType valueType); + +CUSPARSE_DEPRECATED(cusparseCreateCoo) +cusparseStatus_t CUSPARSEAPI +cusparseCreateCooAoS(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, + int64_t cols, + int64_t nnz, + void* cooInd, + void* cooValues, + cusparseIndexType_t cooIdxType, + cusparseIndexBase_t idxBase, + cudaDataType valueType); + +cusparseStatus_t CUSPARSEAPI +cusparseCooGet(cusparseSpMatDescr_t spMatDescr, + int64_t* rows, + int64_t* cols, + int64_t* nnz, + 
void** cooRowInd, // COO row indices + void** cooColInd, // COO column indices + void** cooValues, // COO values + cusparseIndexType_t* idxType, + cusparseIndexBase_t* idxBase, + cudaDataType* valueType); + +CUSPARSE_DEPRECATED(cusparseCooGet) +cusparseStatus_t CUSPARSEAPI +cusparseCooAoSGet(cusparseSpMatDescr_t spMatDescr, + int64_t* rows, + int64_t* cols, + int64_t* nnz, + void** cooInd, // COO indices + void** cooValues, // COO values + cusparseIndexType_t* idxType, + cusparseIndexBase_t* idxBase, + cudaDataType* valueType); + +cusparseStatus_t CUSPARSEAPI +cusparseCooSetPointers(cusparseSpMatDescr_t spMatDescr, + void* cooRows, + void* cooColumns, + void* cooValues); + +//------------------------------------------------------------------------------ +// ### BLOCKED ELL ### + +cusparseStatus_t CUSPARSEAPI +cusparseCreateBlockedEll(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, + int64_t cols, + int64_t ellBlockSize, + int64_t ellCols, + void* ellColInd, + void* ellValue, + cusparseIndexType_t ellIdxType, + cusparseIndexBase_t idxBase, + cudaDataType valueType); + +cusparseStatus_t CUSPARSEAPI +cusparseBlockedEllGet(cusparseSpMatDescr_t spMatDescr, + int64_t* rows, + int64_t* cols, + int64_t* ellBlockSize, + int64_t* ellCols, + void** ellColInd, + void** ellValue, + cusparseIndexType_t* ellIdxType, + cusparseIndexBase_t* idxBase, + cudaDataType* valueType); + +// ############################################################################# +// # DENSE MATRIX DESCRIPTOR +// ############################################################################# + +cusparseStatus_t CUSPARSEAPI +cusparseCreateDnMat(cusparseDnMatDescr_t* dnMatDescr, + int64_t rows, + int64_t cols, + int64_t ld, + void* values, + cudaDataType valueType, + cusparseOrder_t order); + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyDnMat(cusparseDnMatDescr_t dnMatDescr); + +cusparseStatus_t CUSPARSEAPI +cusparseDnMatGet(cusparseDnMatDescr_t dnMatDescr, + int64_t* rows, + int64_t* cols, + int64_t* 
ld, + void** values, + cudaDataType* type, + cusparseOrder_t* order); + +cusparseStatus_t CUSPARSEAPI +cusparseDnMatGetValues(cusparseDnMatDescr_t dnMatDescr, + void** values); + +cusparseStatus_t CUSPARSEAPI +cusparseDnMatSetValues(cusparseDnMatDescr_t dnMatDescr, + void* values); + +cusparseStatus_t CUSPARSEAPI +cusparseDnMatSetStridedBatch(cusparseDnMatDescr_t dnMatDescr, + int batchCount, + int64_t batchStride); + +cusparseStatus_t CUSPARSEAPI +cusparseDnMatGetStridedBatch(cusparseDnMatDescr_t dnMatDescr, + int* batchCount, + int64_t* batchStride); + +// ############################################################################# +// # VECTOR-VECTOR OPERATIONS +// ############################################################################# + +cusparseStatus_t CUSPARSEAPI +cusparseAxpby(cusparseHandle_t handle, + const void* alpha, + cusparseSpVecDescr_t vecX, + const void* beta, + cusparseDnVecDescr_t vecY); + +cusparseStatus_t CUSPARSEAPI +cusparseGather(cusparseHandle_t handle, + cusparseDnVecDescr_t vecY, + cusparseSpVecDescr_t vecX); + +cusparseStatus_t CUSPARSEAPI +cusparseScatter(cusparseHandle_t handle, + cusparseSpVecDescr_t vecX, + cusparseDnVecDescr_t vecY); + +cusparseStatus_t CUSPARSEAPI +cusparseRot(cusparseHandle_t handle, + const void* c_coeff, + const void* s_coeff, + cusparseSpVecDescr_t vecX, + cusparseDnVecDescr_t vecY); + +cusparseStatus_t CUSPARSEAPI +cusparseSpVV_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opX, + cusparseSpVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + const void* result, + cudaDataType computeType, + size_t* bufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSpVV(cusparseHandle_t handle, + cusparseOperation_t opX, + cusparseSpVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + void* result, + cudaDataType computeType, + void* externalBuffer); + +// ############################################################################# +// # SPARSE TO DENSE +// 
############################################################################# + +typedef enum { + CUSPARSE_SPARSETODENSE_ALG_DEFAULT = 0 +} cusparseSparseToDenseAlg_t; + +cusparseStatus_t CUSPARSEAPI +cusparseSparseToDense_bufferSize(cusparseHandle_t handle, + cusparseSpMatDescr_t matA, + cusparseDnMatDescr_t matB, + cusparseSparseToDenseAlg_t alg, + size_t* bufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSparseToDense(cusparseHandle_t handle, + cusparseSpMatDescr_t matA, + cusparseDnMatDescr_t matB, + cusparseSparseToDenseAlg_t alg, + void* externalBuffer); + + +// ############################################################################# +// # DENSE TO SPARSE +// ############################################################################# + +typedef enum { + CUSPARSE_DENSETOSPARSE_ALG_DEFAULT = 0 +} cusparseDenseToSparseAlg_t; + +cusparseStatus_t CUSPARSEAPI +cusparseDenseToSparse_bufferSize(cusparseHandle_t handle, + cusparseDnMatDescr_t matA, + cusparseSpMatDescr_t matB, + cusparseDenseToSparseAlg_t alg, + size_t* bufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseDenseToSparse_analysis(cusparseHandle_t handle, + cusparseDnMatDescr_t matA, + cusparseSpMatDescr_t matB, + cusparseDenseToSparseAlg_t alg, + void* externalBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseDenseToSparse_convert(cusparseHandle_t handle, + cusparseDnMatDescr_t matA, + cusparseSpMatDescr_t matB, + cusparseDenseToSparseAlg_t alg, + void* externalBuffer); + +// ############################################################################# +// # SPARSE MATRIX-VECTOR MULTIPLICATION +// ############################################################################# + +typedef enum { + CUSPARSE_MV_ALG_DEFAULT + /*CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMV_ALG_DEFAULT)*/ = 0, + CUSPARSE_COOMV_ALG CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMV_COO_ALG1) = 1, + CUSPARSE_CSRMV_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMV_CSR_ALG1) = 2, + CUSPARSE_CSRMV_ALG2 
CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMV_CSR_ALG2) = 3, + CUSPARSE_SPMV_ALG_DEFAULT = 0, + CUSPARSE_SPMV_CSR_ALG1 = 2, + CUSPARSE_SPMV_CSR_ALG2 = 3, + CUSPARSE_SPMV_COO_ALG1 = 1, + CUSPARSE_SPMV_COO_ALG2 = 4 +} cusparseSpMVAlg_t; + +cusparseStatus_t CUSPARSEAPI +cusparseSpMV(cusparseHandle_t handle, + cusparseOperation_t opA, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnVecDescr_t vecX, + const void* beta, + cusparseDnVecDescr_t vecY, + cudaDataType computeType, + cusparseSpMVAlg_t alg, + void* externalBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSpMV_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnVecDescr_t vecX, + const void* beta, + cusparseDnVecDescr_t vecY, + cudaDataType computeType, + cusparseSpMVAlg_t alg, + size_t* bufferSize); + +// ############################################################################# +// # SPARSE TRIANGULAR VECTOR SOLVE +// ############################################################################# + +typedef enum { + CUSPARSE_SPSV_ALG_DEFAULT = 0, +} cusparseSpSVAlg_t; + +struct cusparseSpSVDescr; +typedef struct cusparseSpSVDescr* cusparseSpSVDescr_t; + +cusparseStatus_t CUSPARSEAPI +cusparseSpSV_createDescr(cusparseSpSVDescr_t* descr); + +cusparseStatus_t CUSPARSEAPI +cusparseSpSV_destroyDescr(cusparseSpSVDescr_t descr); + +cusparseStatus_t CUSPARSEAPI +cusparseSpSV_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + cudaDataType computeType, + cusparseSpSVAlg_t alg, + cusparseSpSVDescr_t spsvDescr, + size_t* bufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSpSV_analysis(cusparseHandle_t handle, + cusparseOperation_t opA, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + cudaDataType computeType, + cusparseSpSVAlg_t alg, + cusparseSpSVDescr_t 
spsvDescr, + void* externalBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSpSV_solve(cusparseHandle_t handle, + cusparseOperation_t opA, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + cudaDataType computeType, + cusparseSpSVAlg_t alg, + cusparseSpSVDescr_t spsvDescr); + +// ############################################################################# +// # SPARSE TRIANGULAR MATRIX SOLVE +// ############################################################################# + +typedef enum { + CUSPARSE_SPSM_ALG_DEFAULT = 0, +} cusparseSpSMAlg_t; + +struct cusparseSpSMDescr; +typedef struct cusparseSpSMDescr* cusparseSpSMDescr_t; + +cusparseStatus_t CUSPARSEAPI +cusparseSpSM_createDescr(cusparseSpSMDescr_t* descr); + +cusparseStatus_t CUSPARSEAPI +cusparseSpSM_destroyDescr(cusparseSpSMDescr_t descr); + +cusparseStatus_t CUSPARSEAPI +cusparseSpSM_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnMatDescr_t matB, + cusparseDnMatDescr_t matC, + cudaDataType computeType, + cusparseSpSMAlg_t alg, + cusparseSpSMDescr_t spsmDescr, + size_t* bufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSpSM_analysis(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnMatDescr_t matB, + cusparseDnMatDescr_t matC, + cudaDataType computeType, + cusparseSpSMAlg_t alg, + cusparseSpSMDescr_t spsmDescr, + void* externalBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSpSM_solve(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnMatDescr_t matB, + cusparseDnMatDescr_t matC, + cudaDataType computeType, + cusparseSpSMAlg_t alg, + cusparseSpSMDescr_t spsmDescr); + +// ############################################################################# +// # SPARSE 
MATRIX-MATRIX MULTIPLICATION +// ############################################################################# + +typedef enum { + CUSPARSE_MM_ALG_DEFAULT + CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, + CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, + CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, + CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, + CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, + CUSPARSE_SPMM_ALG_DEFAULT = 0, + CUSPARSE_SPMM_COO_ALG1 = 1, + CUSPARSE_SPMM_COO_ALG2 = 2, + CUSPARSE_SPMM_COO_ALG3 = 3, + CUSPARSE_SPMM_COO_ALG4 = 5, + CUSPARSE_SPMM_CSR_ALG1 = 4, + CUSPARSE_SPMM_CSR_ALG2 = 6, + CUSPARSE_SPMM_CSR_ALG3 = 12, + CUSPARSE_SPMM_BLOCKED_ELL_ALG1 = 13 +} cusparseSpMMAlg_t; + +cusparseStatus_t CUSPARSEAPI +cusparseSpMM_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnMatDescr_t matB, + const void* beta, + cusparseDnMatDescr_t matC, + cudaDataType computeType, + cusparseSpMMAlg_t alg, + size_t* bufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSpMM_preprocess(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnMatDescr_t matB, + const void* beta, + cusparseDnMatDescr_t matC, + cudaDataType computeType, + cusparseSpMMAlg_t alg, + void* externalBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSpMM(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnMatDescr_t matB, + const void* beta, + cusparseDnMatDescr_t matC, + cudaDataType computeType, + cusparseSpMMAlg_t alg, + void* externalBuffer); + +// ############################################################################# +// # SPARSE MATRIX - SPARSE MATRIX MULTIPLICATION (SpGEMM) +// 
############################################################################# + +typedef enum { + CUSPARSE_SPGEMM_DEFAULT = 0, + CUSPARSE_SPGEMM_CSR_ALG_DETERMINITIC = 1, + CUSPARSE_SPGEMM_CSR_ALG_NONDETERMINITIC = 2 +} cusparseSpGEMMAlg_t; + +struct cusparseSpGEMMDescr; +typedef struct cusparseSpGEMMDescr* cusparseSpGEMMDescr_t; + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMM_createDescr(cusparseSpGEMMDescr_t* descr); + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMM_destroyDescr(cusparseSpGEMMDescr_t descr); + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMM_workEstimation(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseSpMatDescr_t matB, + const void* beta, + cusparseSpMatDescr_t matC, + cudaDataType computeType, + cusparseSpGEMMAlg_t alg, + cusparseSpGEMMDescr_t spgemmDescr, + size_t* bufferSize1, + void* externalBuffer1); + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMM_compute(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseSpMatDescr_t matB, + const void* beta, + cusparseSpMatDescr_t matC, + cudaDataType computeType, + cusparseSpGEMMAlg_t alg, + cusparseSpGEMMDescr_t spgemmDescr, + size_t* bufferSize2, + void* externalBuffer2); + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMM_copy(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseSpMatDescr_t matB, + const void* beta, + cusparseSpMatDescr_t matC, + cudaDataType computeType, + cusparseSpGEMMAlg_t alg, + cusparseSpGEMMDescr_t spgemmDescr); + +// ############################################################################# +// # SPARSE MATRIX - SPARSE MATRIX MULTIPLICATION (SpGEMM) STRUCTURE REUSE +// ############################################################################# + +cusparseStatus_t CUSPARSEAPI 
+cusparseSpGEMMreuse_workEstimation(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + cusparseSpMatDescr_t matA, + cusparseSpMatDescr_t matB, + cusparseSpMatDescr_t matC, + cusparseSpGEMMAlg_t alg, + cusparseSpGEMMDescr_t spgemmDescr, + size_t* bufferSize1, + void* externalBuffer1); + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMMreuse_nnz(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + cusparseSpMatDescr_t matA, + cusparseSpMatDescr_t matB, + cusparseSpMatDescr_t matC, + cusparseSpGEMMAlg_t alg, + cusparseSpGEMMDescr_t spgemmDescr, + size_t* bufferSize2, + void* externalBuffer2, + size_t* bufferSize3, + void* externalBuffer3, + size_t* bufferSize4, + void* externalBuffer4); + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMMreuse_copy(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + cusparseSpMatDescr_t matA, + cusparseSpMatDescr_t matB, + cusparseSpMatDescr_t matC, + cusparseSpGEMMAlg_t alg, + cusparseSpGEMMDescr_t spgemmDescr, + size_t* bufferSize5, + void* externalBuffer5); + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMMreuse_compute(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseSpMatDescr_t matB, + const void* beta, + cusparseSpMatDescr_t matC, + cudaDataType computeType, + cusparseSpGEMMAlg_t alg, + cusparseSpGEMMDescr_t spgemmDescr); + +// ############################################################################# +// # SAMPLED DENSE-DENSE MATRIX MULTIPLICATION +// ############################################################################# + +CUSPARSE_DEPRECATED(cusparseSDDMM) +cusparseStatus_t CUSPARSEAPI +cusparseConstrainedGeMM(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseDnMatDescr_t matA, + cusparseDnMatDescr_t matB, + const void* beta, + cusparseSpMatDescr_t matC, + cudaDataType computeType, + void* 
externalBuffer); + +CUSPARSE_DEPRECATED(cusparseSDDMM) +cusparseStatus_t CUSPARSEAPI +cusparseConstrainedGeMM_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseDnMatDescr_t matA, + cusparseDnMatDescr_t matB, + const void* beta, + cusparseSpMatDescr_t matC, + cudaDataType computeType, + size_t* bufferSize); + +typedef enum { + CUSPARSE_SDDMM_ALG_DEFAULT = 0 +} cusparseSDDMMAlg_t; + +cusparseStatus_t CUSPARSEAPI +cusparseSDDMM_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseDnMatDescr_t matA, + cusparseDnMatDescr_t matB, + const void* beta, + cusparseSpMatDescr_t matC, + cudaDataType computeType, + cusparseSDDMMAlg_t alg, + size_t* bufferSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSDDMM_preprocess(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseDnMatDescr_t matA, + cusparseDnMatDescr_t matB, + const void* beta, + cusparseSpMatDescr_t matC, + cudaDataType computeType, + cusparseSDDMMAlg_t alg, + void* externalBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSDDMM(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseDnMatDescr_t matA, + cusparseDnMatDescr_t matB, + const void* beta, + cusparseSpMatDescr_t matC, + cudaDataType computeType, + cusparseSDDMMAlg_t alg, + void* externalBuffer); + +// ############################################################################# +// # GENERIC APIs WITH CUSTOM OPERATORS (PREVIEW) +// ############################################################################# + +struct cusparseSpMMOpPlan; +typedef struct cusparseSpMMOpPlan* cusparseSpMMOpPlan_t; + +typedef enum { + CUSPARSE_SPMM_OP_ALG_DEFAULT +} cusparseSpMMOpAlg_t; + +cusparseStatus_t CUSPARSEAPI +cusparseSpMMOp_createPlan(cusparseHandle_t handle, + cusparseSpMMOpPlan_t* plan, + cusparseOperation_t opA, + 
cusparseOperation_t opB, + cusparseSpMatDescr_t matA, + cusparseDnMatDescr_t matB, + cusparseDnMatDescr_t matC, + cudaDataType computeType, + cusparseSpMMOpAlg_t alg, + const void* addOperationNvvmBuffer, + size_t addOperationBufferSize, + const void* mulOperationNvvmBuffer, + size_t mulOperationBufferSize, + const void* epilogueNvvmBuffer, + size_t epilogueBufferSize, + size_t* SpMMWorkspaceSize); + +cusparseStatus_t CUSPARSEAPI +cusparseSpMMOp(cusparseSpMMOpPlan_t plan, + void* externalBuffer); + +cusparseStatus_t CUSPARSEAPI +cusparseSpMMOp_destroyPlan(cusparseSpMMOpPlan_t plan); + +//------------------------------------------------------------------------------ + +#if defined(__cplusplus) +} // extern "C" +#endif // defined(__cplusplus) + +#undef CUSPARSE_DEPRECATED +#undef CUSPARSE_PREVIEW + +#endif // !defined(CUSPARSE_H_) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10c57451644fd7d608d0090a9af6b54e36fbe445 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ece81b45e771af99cf98d4237b4728b4c3dcff6b Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h new file mode 100644 index 0000000000000000000000000000000000000000..aae5625bdb8850e045ca04711ff5da845511fbf2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h @@ -0,0 +1,448 @@ +/************************************************************************* + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_H_ +#define NCCL_H_ + +#include +#include +#if CUDART_VERSION >= 11000 +#include +#endif + +#define NCCL_MAJOR 2 +#define NCCL_MINOR 20 +#define NCCL_PATCH 5 +#define NCCL_SUFFIX "" + +#define NCCL_VERSION_CODE 22005 +#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) + +#ifdef __cplusplus +extern "C" { +#endif + +#include +/* Opaque handle to communicator */ +typedef struct ncclComm* ncclComm_t; +#define NCCL_COMM_NULL NULL + +#define NCCL_UNIQUE_ID_BYTES 128 +typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; + +/* Error type */ +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6, + ncclInProgress = 7, + ncclNumResults = 8 } ncclResult_t; + +#define NCCL_CONFIG_UNDEF_INT INT_MIN +#define NCCL_CONFIG_UNDEF_PTR NULL +#define NCCL_SPLIT_NOCOLOR -1 + +/* Communicator configuration. Users can assign value to attributes to specify the + * behavior of a communicator. */ +typedef struct ncclConfig_v21700 { + /* attributes that users should never touch. */ + size_t size; + unsigned int magic; + unsigned int version; + /* attributes that users are able to customize. 
*/ + int blocking; + int cgaClusterSize; + int minCTAs; + int maxCTAs; + const char *netName; + int splitShare; +} ncclConfig_t; + +/* Config initializer must be assigned to initialize config structure when it is created. + * Not initialized config will result in NCCL error. */ +#define NCCL_CONFIG_INITIALIZER { \ + sizeof(ncclConfig_t), /* size */ \ + 0xcafebeef, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + NCCL_CONFIG_UNDEF_INT, /* blocking */ \ + NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ + NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ + NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ + NCCL_CONFIG_UNDEF_PTR, /* netName */ \ + NCCL_CONFIG_UNDEF_INT /* splitShare */ \ +} + +/* NCCL malloc and free function for all types of NCCL optimizations + * (e.g. user buffer registration). The actual allocated size might + * be larger than requested due to granularity requirement. */ +ncclResult_t ncclMemAlloc(void** ptr, size_t size); +ncclResult_t pncclMemAlloc(void** ptr, size_t size); + +ncclResult_t ncclMemFree(void *ptr); +ncclResult_t pncclMemFree(void *ptr); + +/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. + * This integer is coded with the MAJOR, MINOR and PATCH level of the + * NCCL library + */ +ncclResult_t ncclGetVersion(int *version); +ncclResult_t pncclGetVersion(int *version); + +/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be + * called once and the Id should be distributed to all ranks in the + * communicator before calling ncclCommInitRank. */ +ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); +ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); + +/* Create a new communicator (multi thread/process version) with a configuration + * set by users. 
*/ +ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); +ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); + +/* Creates a new communicator (multi thread/process version). + * rank must be between 0 and nranks-1 and unique within a communicator clique. + * Each rank is associated to a CUDA device, which has to be set before calling + * ncclCommInitRank. + * ncclCommInitRank implicitly syncronizes with other ranks, so it must be + * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */ +ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); + +/* Creates a clique of communicators (single process version). + * This is a convenience function to create a single-process communicator clique. + * Returns an array of ndev newly initialized communicators in comm. + * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). + * If devlist is NULL, the first ndev CUDA devices are used. + * Order of devlist defines user-order of processors within the communicator. */ +ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); +ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); + +/* Finalize a communicator. ncclCommFinalize flushes all issued communications, + * and marks communicator state as ncclInProgress. The state will change to ncclSuccess + * when the communicator is globally quiescent and related resources are freed; then, + * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator + * itself) without blocking. */ +ncclResult_t ncclCommFinalize(ncclComm_t comm); +ncclResult_t pncclCommFinalize(ncclComm_t comm); + +/* Frees local resources associated with communicator object. 
*/ +ncclResult_t ncclCommDestroy(ncclComm_t comm); +ncclResult_t pncclCommDestroy(ncclComm_t comm); + +/* Frees resources associated with communicator object and aborts any operations + * that might still be running on the device. */ +ncclResult_t ncclCommAbort(ncclComm_t comm); +ncclResult_t pncclCommAbort(ncclComm_t comm); + +/* Creates one or more communicators from an existing one. + * Ranks with the same color will end up in the same communicator. + * Within the new communicator, key will be used to order ranks. + * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group + * and will therefore return a NULL communicator. + * If config is NULL, the new communicator will inherit the original communicator's + * configuration*/ +ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); +ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); + +/* Returns a string for each error code. */ +const char* ncclGetErrorString(ncclResult_t result); +const char* pncclGetErrorString(ncclResult_t result); + +/* Returns a human-readable message of the last error that occurred. */ +const char* ncclGetLastError(ncclComm_t comm); +const char* pncclGetLastError(ncclComm_t comm); + +/* Checks whether the comm has encountered any asynchronous errors */ +ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); +ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); + +/* Gets the number of ranks in the communicator clique. */ +ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); +ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); + +/* Returns the cuda device number associated with the communicator. 
*/ +ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); +ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); + +/* Returns the user-ordered "rank" associated with the communicator. */ +ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); +ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); + + +/* Register CUDA buffer for zero-copy operation */ +ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); +ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); + +/* Deregister CUDA buffer */ +ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle); +ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle); + +/* Reduction operation selector */ +typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; +typedef enum { ncclSum = 0, + ncclProd = 1, + ncclMax = 2, + ncclMin = 3, + ncclAvg = 4, + /* ncclNumOps: The number of built-in ncclRedOp_t values. Also + * serves as the least possible value for dynamic ncclRedOp_t's + * as constructed by ncclRedOpCreate*** functions. */ + ncclNumOps = 5, + /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. + * It is defined to be the largest signed value (since compilers + * are permitted to use signed enums) that won't grow + * sizeof(ncclRedOp_t) when compared to previous NCCL versions to + * maintain ABI compatibility. 
*/ + ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) + } ncclRedOp_t; + +/* Data types */ +typedef enum { ncclInt8 = 0, ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, ncclHalf = 6, + ncclFloat32 = 7, ncclFloat = 7, + ncclFloat64 = 8, ncclDouble = 8, +#if defined(__CUDA_BF16_TYPES_EXIST__) + ncclBfloat16 = 9, + ncclNumTypes = 10 +#else + ncclNumTypes = 9 +#endif +} ncclDataType_t; + +/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ +typedef enum { + /* ncclScalarDevice: The scalar is in device-visible memory and will be + * dereferenced while the collective is running. */ + ncclScalarDevice = 0, + + /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be + * dereferenced before the ncclRedOpCreate***() function returns. */ + ncclScalarHostImmediate = 1 +} ncclScalarResidence_t; + +/* + * ncclRedOpCreatePreMulSum + * + * Creates a new reduction operator which pre-multiplies input values by a given + * scalar locally before reducing them with peer values via summation. For use + * only with collectives launched against *comm* and *datatype*. The + * *residence* argument indicates how/when the memory pointed to by *scalar* + * will be dereferenced. Upon return, the newly created operator's handle + * is stored in *op*. + */ +ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); +ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); + +/* + * ncclRedOpDestroy + * + * Destroys the reduction operator *op*. The operator must have been created by + * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be + * destroyed as soon as the last NCCL function which is given that operator returns. 
+ */ +ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); +ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); + +/* + * Collective communication operations + * + * Collective communication operations must be called separately for each + * communicator in a communicator clique. + * + * They return when operations have been enqueued on the CUDA stream. + * + * Since they may perform inter-CPU synchronization, each call has to be done + * from a different thread or process, or need to use Group Semantics (see + * below). + */ + +/* + * Reduce + * + * Reduces data arrays of length count in sendbuff into recvbuff using op + * operation. + * recvbuff may be NULL on all calls except for root device. + * root is the rank (not the CUDA device) where data will reside after the + * operation is complete. + * + * In-place operation will happen if sendbuff == recvbuff. + */ +ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); + +/* + * (deprecated) Broadcast (in-place) + * + * Copies count values from root to all other devices. + * root is the rank (not the CUDA device) where data resides before the + * operation is started. + * + * This operation is implicitely in place. + */ +ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); + +/* + * Broadcast + * + * Copies count values from root to all other devices. + * root is the rank (not the CUDA device) where data resides before the + * operation is started. + * + * In-place operation will happen if sendbuff == recvbuff. 
+ */ +ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); + +/* + * All-Reduce + * + * Reduces data arrays of length count in sendbuff using op operation, and + * leaves identical copies of result on each recvbuff. + * + * In-place operation will happen if sendbuff == recvbuff. + */ +ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); + +/* + * Reduce-Scatter + * + * Reduces data in sendbuff using op operation and leaves reduced result + * scattered over the devices so that recvbuff on rank i will contain the i-th + * block of the result. + * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff + * should have a size of at least nranks*recvcount elements. + * + * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. + */ +ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream); + +/* + * All-Gather + * + * Each device gathers sendcount values from other GPUs into recvbuff, + * receiving data from rank i at offset i*sendcount. + * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff + * should have a size of at least nranks*sendcount elements. 
+ * + * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. + */ +ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); + +/* + * Send + * + * Send data from sendbuff to rank peer. + * + * Rank peer needs to call ncclRecv with the same datatype and the same count from this + * rank. + * + * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations + * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ + * ncclGroupEnd section. + */ +ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); + +/* + * Receive + * + * Receive data from rank peer into recvbuff. + * + * Rank peer needs to call ncclSend with the same datatype and the same count to this + * rank. + * + * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations + * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ + * ncclGroupEnd section. + */ +ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); + +/* + * Group semantics + * + * When managing multiple GPUs from a single thread, and since NCCL collective + * calls may perform inter-CPU synchronization, we need to "group" calls for + * different ranks/devices into a single call. 
+ * + * Grouping NCCL calls as being part of the same collective operation is done + * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all + * collective calls until the ncclGroupEnd call, which will wait for all calls + * to be complete. Note that for collective communication, ncclGroupEnd only + * guarantees that the operations are enqueued on the streams, not that + * the operation is effectively done. + * + * Both collective communication and ncclCommInitRank can be used in conjunction + * of ncclGroupStart/ncclGroupEnd, but not together. + * + * Group semantics also allow to fuse multiple operations on the same device + * to improve performance (for aggregated collective calls), or to permit + * concurrent progress of multiple send/receive operations. + */ + +/* + * Group Start + * + * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into + * a single NCCL operation. Nothing will be started on the CUDA stream until + * ncclGroupEnd. + */ +ncclResult_t ncclGroupStart(); +ncclResult_t pncclGroupStart(); + +/* + * Group End + * + * End a group call. Start a fused NCCL operation consisting of all calls since + * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations + * need to be called after ncclGroupEnd. 
+ */ +ncclResult_t ncclGroupEnd(); +ncclResult_t pncclGroupEnd(); + +/* Register CUDA buffer for zero-copy operation */ +ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); +ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); + +/* Deregister CUDA buffer */ +ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle); +ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle); + +#ifdef __cplusplus +} // end extern "C" +#endif + +#endif // end include guard diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/_cmd.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/_cmd.py new file mode 100644 index 0000000000000000000000000000000000000000..2c84208a5d87511cc4a63dcd9c647ac75c6f4475 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/_cmd.py @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import logging +from argparse import ArgumentParser +from typing import TYPE_CHECKING + +from pip._vendor import requests + +from pip._vendor.cachecontrol.adapter import CacheControlAdapter +from pip._vendor.cachecontrol.cache import DictCache +from pip._vendor.cachecontrol.controller import logger + +if TYPE_CHECKING: + from argparse import Namespace + + from pip._vendor.cachecontrol.controller import CacheController + + +def setup_logging() -> None: + logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler() + 
logger.addHandler(handler) + + +def get_session() -> requests.Session: + adapter = CacheControlAdapter( + DictCache(), cache_etags=True, serializer=None, heuristic=None + ) + sess = requests.Session() + sess.mount("http://", adapter) + sess.mount("https://", adapter) + + sess.cache_controller = adapter.controller # type: ignore[attr-defined] + return sess + + +def get_args() -> Namespace: + parser = ArgumentParser() + parser.add_argument("url", help="The URL to try and cache") + return parser.parse_args() + + +def main() -> None: + args = get_args() + sess = get_session() + + # Make a request to get a response + resp = sess.get(args.url) + + # Turn on logging + setup_logging() + + # try setting the cache + cache_controller: CacheController = ( + sess.cache_controller # type: ignore[attr-defined] + ) + cache_controller.cache_response(resp.request, resp.raw) + + # Now try to get it + if cache_controller.cached_request(resp.request): + print("Cached!") + else: + print("Not cached :(") + + +if __name__ == "__main__": + main() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/adapter.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..fbb4ecc88762f8a3602417e2bab446f97b319196 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/adapter.py @@ -0,0 +1,161 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import functools +import types +import zlib +from typing import TYPE_CHECKING, Any, Collection, Mapping + +from pip._vendor.requests.adapters import HTTPAdapter + +from pip._vendor.cachecontrol.cache import DictCache +from pip._vendor.cachecontrol.controller import PERMANENT_REDIRECT_STATUSES, CacheController +from pip._vendor.cachecontrol.filewrapper import 
CallbackFileWrapper + +if TYPE_CHECKING: + from pip._vendor.requests import PreparedRequest, Response + from pip._vendor.urllib3 import HTTPResponse + + from pip._vendor.cachecontrol.cache import BaseCache + from pip._vendor.cachecontrol.heuristics import BaseHeuristic + from pip._vendor.cachecontrol.serialize import Serializer + + +class CacheControlAdapter(HTTPAdapter): + invalidating_methods = {"PUT", "PATCH", "DELETE"} + + def __init__( + self, + cache: BaseCache | None = None, + cache_etags: bool = True, + controller_class: type[CacheController] | None = None, + serializer: Serializer | None = None, + heuristic: BaseHeuristic | None = None, + cacheable_methods: Collection[str] | None = None, + *args: Any, + **kw: Any, + ) -> None: + super().__init__(*args, **kw) + self.cache = DictCache() if cache is None else cache + self.heuristic = heuristic + self.cacheable_methods = cacheable_methods or ("GET",) + + controller_factory = controller_class or CacheController + self.controller = controller_factory( + self.cache, cache_etags=cache_etags, serializer=serializer + ) + + def send( + self, + request: PreparedRequest, + stream: bool = False, + timeout: None | float | tuple[float, float] | tuple[float, None] = None, + verify: bool | str = True, + cert: (None | bytes | str | tuple[bytes | str, bytes | str]) = None, + proxies: Mapping[str, str] | None = None, + cacheable_methods: Collection[str] | None = None, + ) -> Response: + """ + Send a request. Use the request information to see if it + exists in the cache and cache the response if we need to and can. 
+ """ + cacheable = cacheable_methods or self.cacheable_methods + if request.method in cacheable: + try: + cached_response = self.controller.cached_request(request) + except zlib.error: + cached_response = None + if cached_response: + return self.build_response(request, cached_response, from_cache=True) + + # check for etags and add headers if appropriate + request.headers.update(self.controller.conditional_headers(request)) + + resp = super().send(request, stream, timeout, verify, cert, proxies) + + return resp + + def build_response( + self, + request: PreparedRequest, + response: HTTPResponse, + from_cache: bool = False, + cacheable_methods: Collection[str] | None = None, + ) -> Response: + """ + Build a response by making a request or using the cache. + + This will end up calling send and returning a potentially + cached response + """ + cacheable = cacheable_methods or self.cacheable_methods + if not from_cache and request.method in cacheable: + # Check for any heuristics that might update headers + # before trying to cache. + if self.heuristic: + response = self.heuristic.apply(response) + + # apply any expiration heuristics + if response.status == 304: + # We must have sent an ETag request. This could mean + # that we've been expired already or that we simply + # have an etag. In either case, we want to try and + # update the cache if that is the case. + cached_response = self.controller.update_cached_response( + request, response + ) + + if cached_response is not response: + from_cache = True + + # We are done with the server response, read a + # possible response body (compliant servers will + # not return one, but we cannot be 100% sure) and + # release the connection back to the pool. 
+ response.read(decode_content=False) + response.release_conn() + + response = cached_response + + # We always cache the 301 responses + elif int(response.status) in PERMANENT_REDIRECT_STATUSES: + self.controller.cache_response(request, response) + else: + # Wrap the response file with a wrapper that will cache the + # response when the stream has been consumed. + response._fp = CallbackFileWrapper( # type: ignore[assignment] + response._fp, # type: ignore[arg-type] + functools.partial( + self.controller.cache_response, request, response + ), + ) + if response.chunked: + super_update_chunk_length = response._update_chunk_length + + def _update_chunk_length(self: HTTPResponse) -> None: + super_update_chunk_length() + if self.chunk_left == 0: + self._fp._close() # type: ignore[union-attr] + + response._update_chunk_length = types.MethodType( # type: ignore[method-assign] + _update_chunk_length, response + ) + + resp: Response = super().build_response(request, response) # type: ignore[no-untyped-call] + + # See if we should invalidate the cache. 
+ if request.method in self.invalidating_methods and resp.ok: + assert request.url is not None + cache_url = self.controller.cache_url(request.url) + self.cache.delete(cache_url) + + # Give the request a from_cache attr to let people use it + resp.from_cache = from_cache # type: ignore[attr-defined] + + return resp + + def close(self) -> None: + self.cache.close() + super().close() # type: ignore[no-untyped-call] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/cache.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/cache.py new file mode 100644 index 0000000000000000000000000000000000000000..3293b0057c789e4bfeafb926cb9647f280ce309e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/cache.py @@ -0,0 +1,74 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + +""" +The cache object API for implementing caches. The default is a thread +safe in-memory dictionary. 
+""" +from __future__ import annotations + +from threading import Lock +from typing import IO, TYPE_CHECKING, MutableMapping + +if TYPE_CHECKING: + from datetime import datetime + + +class BaseCache: + def get(self, key: str) -> bytes | None: + raise NotImplementedError() + + def set( + self, key: str, value: bytes, expires: int | datetime | None = None + ) -> None: + raise NotImplementedError() + + def delete(self, key: str) -> None: + raise NotImplementedError() + + def close(self) -> None: + pass + + +class DictCache(BaseCache): + def __init__(self, init_dict: MutableMapping[str, bytes] | None = None) -> None: + self.lock = Lock() + self.data = init_dict or {} + + def get(self, key: str) -> bytes | None: + return self.data.get(key, None) + + def set( + self, key: str, value: bytes, expires: int | datetime | None = None + ) -> None: + with self.lock: + self.data.update({key: value}) + + def delete(self, key: str) -> None: + with self.lock: + if key in self.data: + self.data.pop(key) + + +class SeparateBodyBaseCache(BaseCache): + """ + In this variant, the body is not stored mixed in with the metadata, but is + passed in (as a bytes-like object) in a separate call to ``set_body()``. + + That is, the expected interaction pattern is:: + + cache.set(key, serialized_metadata) + cache.set_body(key) + + Similarly, the body should be loaded separately via ``get_body()``. + """ + + def set_body(self, key: str, body: bytes) -> None: + raise NotImplementedError() + + def get_body(self, key: str) -> IO[bytes] | None: + """ + Return the body as file-like object. 
+ """ + raise NotImplementedError() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/controller.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/controller.py new file mode 100644 index 0000000000000000000000000000000000000000..d7dd86e5f702ea0dd156b9cb8217784dbc5488eb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/controller.py @@ -0,0 +1,499 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + +""" +The httplib2 algorithms ported for use with requests. +""" +from __future__ import annotations + +import calendar +import logging +import re +import time +from email.utils import parsedate_tz +from typing import TYPE_CHECKING, Collection, Mapping + +from pip._vendor.requests.structures import CaseInsensitiveDict + +from pip._vendor.cachecontrol.cache import DictCache, SeparateBodyBaseCache +from pip._vendor.cachecontrol.serialize import Serializer + +if TYPE_CHECKING: + from typing import Literal + + from pip._vendor.requests import PreparedRequest + from pip._vendor.urllib3 import HTTPResponse + + from pip._vendor.cachecontrol.cache import BaseCache + +logger = logging.getLogger(__name__) + +URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?") + +PERMANENT_REDIRECT_STATUSES = (301, 308) + + +def parse_uri(uri: str) -> tuple[str, str, str, str, str]: + """Parses a URI using the regex given in Appendix B of RFC 3986. 
+ + (scheme, authority, path, query, fragment) = parse_uri(uri) + """ + match = URI.match(uri) + assert match is not None + groups = match.groups() + return (groups[1], groups[3], groups[4], groups[6], groups[8]) + + +class CacheController: + """An interface to see if request should cached or not.""" + + def __init__( + self, + cache: BaseCache | None = None, + cache_etags: bool = True, + serializer: Serializer | None = None, + status_codes: Collection[int] | None = None, + ): + self.cache = DictCache() if cache is None else cache + self.cache_etags = cache_etags + self.serializer = serializer or Serializer() + self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308) + + @classmethod + def _urlnorm(cls, uri: str) -> str: + """Normalize the URL to create a safe key for the cache""" + (scheme, authority, path, query, fragment) = parse_uri(uri) + if not scheme or not authority: + raise Exception("Only absolute URIs are allowed. uri = %s" % uri) + + scheme = scheme.lower() + authority = authority.lower() + + if not path: + path = "/" + + # Could do syntax based normalization of the URI before + # computing the digest. See Section 6.2.2 of Std 66. 
+ request_uri = query and "?".join([path, query]) or path + defrag_uri = scheme + "://" + authority + request_uri + + return defrag_uri + + @classmethod + def cache_url(cls, uri: str) -> str: + return cls._urlnorm(uri) + + def parse_cache_control(self, headers: Mapping[str, str]) -> dict[str, int | None]: + known_directives = { + # https://tools.ietf.org/html/rfc7234#section-5.2 + "max-age": (int, True), + "max-stale": (int, False), + "min-fresh": (int, True), + "no-cache": (None, False), + "no-store": (None, False), + "no-transform": (None, False), + "only-if-cached": (None, False), + "must-revalidate": (None, False), + "public": (None, False), + "private": (None, False), + "proxy-revalidate": (None, False), + "s-maxage": (int, True), + } + + cc_headers = headers.get("cache-control", headers.get("Cache-Control", "")) + + retval: dict[str, int | None] = {} + + for cc_directive in cc_headers.split(","): + if not cc_directive.strip(): + continue + + parts = cc_directive.split("=", 1) + directive = parts[0].strip() + + try: + typ, required = known_directives[directive] + except KeyError: + logger.debug("Ignoring unknown cache-control directive: %s", directive) + continue + + if not typ or not required: + retval[directive] = None + if typ: + try: + retval[directive] = typ(parts[1].strip()) + except IndexError: + if required: + logger.debug( + "Missing value for cache-control " "directive: %s", + directive, + ) + except ValueError: + logger.debug( + "Invalid value for cache-control directive " "%s, must be %s", + directive, + typ.__name__, + ) + + return retval + + def _load_from_cache(self, request: PreparedRequest) -> HTTPResponse | None: + """ + Load a cached response, or return None if it's not available. + """ + # We do not support caching of partial content: so if the request contains a + # Range header then we don't want to load anything from the cache. 
+ if "Range" in request.headers: + return None + + cache_url = request.url + assert cache_url is not None + cache_data = self.cache.get(cache_url) + if cache_data is None: + logger.debug("No cache entry available") + return None + + if isinstance(self.cache, SeparateBodyBaseCache): + body_file = self.cache.get_body(cache_url) + else: + body_file = None + + result = self.serializer.loads(request, cache_data, body_file) + if result is None: + logger.warning("Cache entry deserialization failed, entry ignored") + return result + + def cached_request(self, request: PreparedRequest) -> HTTPResponse | Literal[False]: + """ + Return a cached response if it exists in the cache, otherwise + return False. + """ + assert request.url is not None + cache_url = self.cache_url(request.url) + logger.debug('Looking up "%s" in the cache', cache_url) + cc = self.parse_cache_control(request.headers) + + # Bail out if the request insists on fresh data + if "no-cache" in cc: + logger.debug('Request header has "no-cache", cache bypassed') + return False + + if "max-age" in cc and cc["max-age"] == 0: + logger.debug('Request header has "max_age" as 0, cache bypassed') + return False + + # Check whether we can load the response from the cache: + resp = self._load_from_cache(request) + if not resp: + return False + + # If we have a cached permanent redirect, return it immediately. We + # don't need to test our response for other headers b/c it is + # intrinsically "cacheable" as it is Permanent. + # + # See: + # https://tools.ietf.org/html/rfc7231#section-6.4.2 + # + # Client can try to refresh the value by repeating the request + # with cache busting headers as usual (ie no-cache). 
+ if int(resp.status) in PERMANENT_REDIRECT_STATUSES: + msg = ( + "Returning cached permanent redirect response " + "(ignoring date and etag information)" + ) + logger.debug(msg) + return resp + + headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers) + if not headers or "date" not in headers: + if "etag" not in headers: + # Without date or etag, the cached response can never be used + # and should be deleted. + logger.debug("Purging cached response: no date or etag") + self.cache.delete(cache_url) + logger.debug("Ignoring cached response: no date") + return False + + now = time.time() + time_tuple = parsedate_tz(headers["date"]) + assert time_tuple is not None + date = calendar.timegm(time_tuple[:6]) + current_age = max(0, now - date) + logger.debug("Current age based on date: %i", current_age) + + # TODO: There is an assumption that the result will be a + # urllib3 response object. This may not be best since we + # could probably avoid instantiating or constructing the + # response until we know we need it. + resp_cc = self.parse_cache_control(headers) + + # determine freshness + freshness_lifetime = 0 + + # Check the max-age pragma in the cache control header + max_age = resp_cc.get("max-age") + if max_age is not None: + freshness_lifetime = max_age + logger.debug("Freshness lifetime from max-age: %i", freshness_lifetime) + + # If there isn't a max-age, check for an expires header + elif "expires" in headers: + expires = parsedate_tz(headers["expires"]) + if expires is not None: + expire_time = calendar.timegm(expires[:6]) - date + freshness_lifetime = max(0, expire_time) + logger.debug("Freshness lifetime from expires: %i", freshness_lifetime) + + # Determine if we are setting freshness limit in the + # request. Note, this overrides what was in the response. 
+ max_age = cc.get("max-age") + if max_age is not None: + freshness_lifetime = max_age + logger.debug( + "Freshness lifetime from request max-age: %i", freshness_lifetime + ) + + min_fresh = cc.get("min-fresh") + if min_fresh is not None: + # adjust our current age by our min fresh + current_age += min_fresh + logger.debug("Adjusted current age from min-fresh: %i", current_age) + + # Return entry if it is fresh enough + if freshness_lifetime > current_age: + logger.debug('The response is "fresh", returning cached response') + logger.debug("%i > %i", freshness_lifetime, current_age) + return resp + + # we're not fresh. If we don't have an Etag, clear it out + if "etag" not in headers: + logger.debug('The cached response is "stale" with no etag, purging') + self.cache.delete(cache_url) + + # return the original handler + return False + + def conditional_headers(self, request: PreparedRequest) -> dict[str, str]: + resp = self._load_from_cache(request) + new_headers = {} + + if resp: + headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers) + + if "etag" in headers: + new_headers["If-None-Match"] = headers["ETag"] + + if "last-modified" in headers: + new_headers["If-Modified-Since"] = headers["Last-Modified"] + + return new_headers + + def _cache_set( + self, + cache_url: str, + request: PreparedRequest, + response: HTTPResponse, + body: bytes | None = None, + expires_time: int | None = None, + ) -> None: + """ + Store the data in the cache. + """ + if isinstance(self.cache, SeparateBodyBaseCache): + # We pass in the body separately; just put a placeholder empty + # string in the metadata. + self.cache.set( + cache_url, + self.serializer.dumps(request, response, b""), + expires=expires_time, + ) + # body is None can happen when, for example, we're only updating + # headers, as is the case in update_cached_response(). 
+ if body is not None: + self.cache.set_body(cache_url, body) + else: + self.cache.set( + cache_url, + self.serializer.dumps(request, response, body), + expires=expires_time, + ) + + def cache_response( + self, + request: PreparedRequest, + response: HTTPResponse, + body: bytes | None = None, + status_codes: Collection[int] | None = None, + ) -> None: + """ + Algorithm for caching requests. + + This assumes a requests Response object. + """ + # From httplib2: Don't cache 206's since we aren't going to + # handle byte range requests + cacheable_status_codes = status_codes or self.cacheable_status_codes + if response.status not in cacheable_status_codes: + logger.debug( + "Status code %s not in %s", response.status, cacheable_status_codes + ) + return + + response_headers: CaseInsensitiveDict[str] = CaseInsensitiveDict( + response.headers + ) + + if "date" in response_headers: + time_tuple = parsedate_tz(response_headers["date"]) + assert time_tuple is not None + date = calendar.timegm(time_tuple[:6]) + else: + date = 0 + + # If we've been given a body, our response has a Content-Length, that + # Content-Length is valid then we can check to see if the body we've + # been given matches the expected size, and if it doesn't we'll just + # skip trying to cache it. 
+ if ( + body is not None + and "content-length" in response_headers + and response_headers["content-length"].isdigit() + and int(response_headers["content-length"]) != len(body) + ): + return + + cc_req = self.parse_cache_control(request.headers) + cc = self.parse_cache_control(response_headers) + + assert request.url is not None + cache_url = self.cache_url(request.url) + logger.debug('Updating cache with response from "%s"', cache_url) + + # Delete it from the cache if we happen to have it stored there + no_store = False + if "no-store" in cc: + no_store = True + logger.debug('Response header has "no-store"') + if "no-store" in cc_req: + no_store = True + logger.debug('Request header has "no-store"') + if no_store and self.cache.get(cache_url): + logger.debug('Purging existing cache entry to honor "no-store"') + self.cache.delete(cache_url) + if no_store: + return + + # https://tools.ietf.org/html/rfc7234#section-4.1: + # A Vary header field-value of "*" always fails to match. + # Storing such a response leads to a deserialization warning + # during cache lookup and is not allowed to ever be served, + # so storing it can be avoided. + if "*" in response_headers.get("vary", ""): + logger.debug('Response header has "Vary: *"') + return + + # If we've been given an etag, then keep the response + if self.cache_etags and "etag" in response_headers: + expires_time = 0 + if response_headers.get("expires"): + expires = parsedate_tz(response_headers["expires"]) + if expires is not None: + expires_time = calendar.timegm(expires[:6]) - date + + expires_time = max(expires_time, 14 * 86400) + + logger.debug(f"etag object cached for {expires_time} seconds") + logger.debug("Caching due to etag") + self._cache_set(cache_url, request, response, body, expires_time) + + # Add to the cache any permanent redirects. We do this before looking + # that the Date headers. 
+ elif int(response.status) in PERMANENT_REDIRECT_STATUSES: + logger.debug("Caching permanent redirect") + self._cache_set(cache_url, request, response, b"") + + # Add to the cache if the response headers demand it. If there + # is no date header then we can't do anything about expiring + # the cache. + elif "date" in response_headers: + time_tuple = parsedate_tz(response_headers["date"]) + assert time_tuple is not None + date = calendar.timegm(time_tuple[:6]) + # cache when there is a max-age > 0 + max_age = cc.get("max-age") + if max_age is not None and max_age > 0: + logger.debug("Caching b/c date exists and max-age > 0") + expires_time = max_age + self._cache_set( + cache_url, + request, + response, + body, + expires_time, + ) + + # If the request can expire, it means we should cache it + # in the meantime. + elif "expires" in response_headers: + if response_headers["expires"]: + expires = parsedate_tz(response_headers["expires"]) + if expires is not None: + expires_time = calendar.timegm(expires[:6]) - date + else: + expires_time = None + + logger.debug( + "Caching b/c of expires header. expires in {} seconds".format( + expires_time + ) + ) + self._cache_set( + cache_url, + request, + response, + body, + expires_time, + ) + + def update_cached_response( + self, request: PreparedRequest, response: HTTPResponse + ) -> HTTPResponse: + """On a 304 we will get a new set of headers that we want to + update our cached value with, assuming we have one. + + This should only ever be called when we've sent an ETag and + gotten a 304 as the response. 
+ """ + assert request.url is not None + cache_url = self.cache_url(request.url) + cached_response = self._load_from_cache(request) + + if not cached_response: + # we didn't have a cached response + return response + + # Lets update our headers with the headers from the new request: + # http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1 + # + # The server isn't supposed to send headers that would make + # the cached body invalid. But... just in case, we'll be sure + # to strip out ones we know that might be problmatic due to + # typical assumptions. + excluded_headers = ["content-length"] + + cached_response.headers.update( + { + k: v + for k, v in response.headers.items() + if k.lower() not in excluded_headers + } + ) + + # we want a 200 b/c we have content via the cache + cached_response.status = 200 + + # update our cache + self._cache_set(cache_url, request, cached_response) + + return cached_response diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/filewrapper.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/filewrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..25143902a26b6c335c34a8304ba780ac15fb6704 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/filewrapper.py @@ -0,0 +1,119 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import mmap +from tempfile import NamedTemporaryFile +from typing import TYPE_CHECKING, Any, Callable + +if TYPE_CHECKING: + from http.client import HTTPResponse + + +class CallbackFileWrapper: + """ + Small wrapper around a fp object which will tee everything read into a + buffer, and when that file is closed it will execute a callback with the + contents of that buffer. + + All attributes are proxied to the underlying file object. 
+ + This class uses members with a double underscore (__) leading prefix so as + not to accidentally shadow an attribute. + + The data is stored in a temporary file until it is all available. As long + as the temporary files directory is disk-based (sometimes it's a + memory-backed-``tmpfs`` on Linux), data will be unloaded to disk if memory + pressure is high. For small files the disk usually won't be used at all, + it'll all be in the filesystem memory cache, so there should be no + performance impact. + """ + + def __init__( + self, fp: HTTPResponse, callback: Callable[[bytes], None] | None + ) -> None: + self.__buf = NamedTemporaryFile("rb+", delete=True) + self.__fp = fp + self.__callback = callback + + def __getattr__(self, name: str) -> Any: + # The vaguaries of garbage collection means that self.__fp is + # not always set. By using __getattribute__ and the private + # name[0] allows looking up the attribute value and raising an + # AttributeError when it doesn't exist. This stop thigns from + # infinitely recursing calls to getattr in the case where + # self.__fp hasn't been set. + # + # [0] https://docs.python.org/2/reference/expressions.html#atom-identifiers + fp = self.__getattribute__("_CallbackFileWrapper__fp") + return getattr(fp, name) + + def __is_fp_closed(self) -> bool: + try: + return self.__fp.fp is None + + except AttributeError: + pass + + try: + closed: bool = self.__fp.closed + return closed + + except AttributeError: + pass + + # We just don't cache it then. + # TODO: Add some logging here... + return False + + def _close(self) -> None: + if self.__callback: + if self.__buf.tell() == 0: + # Empty file: + result = b"" + else: + # Return the data without actually loading it into memory, + # relying on Python's buffer API and mmap(). mmap() just gives + # a view directly into the filesystem's memory cache, so it + # doesn't result in duplicate memory use. 
+ self.__buf.seek(0, 0) + result = memoryview( + mmap.mmap(self.__buf.fileno(), 0, access=mmap.ACCESS_READ) + ) + self.__callback(result) + + # We assign this to None here, because otherwise we can get into + # really tricky problems where the CPython interpreter dead locks + # because the callback is holding a reference to something which + # has a __del__ method. Setting this to None breaks the cycle + # and allows the garbage collector to do it's thing normally. + self.__callback = None + + # Closing the temporary file releases memory and frees disk space. + # Important when caching big files. + self.__buf.close() + + def read(self, amt: int | None = None) -> bytes: + data: bytes = self.__fp.read(amt) + if data: + # We may be dealing with b'', a sign that things are over: + # it's passed e.g. after we've already closed self.__buf. + self.__buf.write(data) + if self.__is_fp_closed(): + self._close() + + return data + + def _safe_read(self, amt: int) -> bytes: + data: bytes = self.__fp._safe_read(amt) # type: ignore[attr-defined] + if amt == 2 and data == b"\r\n": + # urllib executes this read to toss the CRLF at the end + # of the chunk. 
+ return data + + self.__buf.write(data) + if self.__is_fp_closed(): + self._close() + + return data diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/py.typed b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/wrapper.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..f618bc363f12fb416b048e2a86cbddb6082874d6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/wrapper.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from typing import TYPE_CHECKING, Collection + +from pip._vendor.cachecontrol.adapter import CacheControlAdapter +from pip._vendor.cachecontrol.cache import DictCache + +if TYPE_CHECKING: + from pip._vendor import requests + + from pip._vendor.cachecontrol.cache import BaseCache + from pip._vendor.cachecontrol.controller import CacheController + from pip._vendor.cachecontrol.heuristics import BaseHeuristic + from pip._vendor.cachecontrol.serialize import Serializer + + +def CacheControl( + sess: requests.Session, + cache: BaseCache | None = None, + cache_etags: bool = True, + serializer: Serializer | None = None, + heuristic: BaseHeuristic | None = None, + controller_class: type[CacheController] | None = None, + adapter_class: type[CacheControlAdapter] | None = None, + cacheable_methods: Collection[str] | None = None, +) -> requests.Session: + cache = DictCache() if cache is None else cache + adapter_class = adapter_class or CacheControlAdapter + adapter = adapter_class( + cache, 
+ cache_etags=cache_etags, + serializer=serializer, + heuristic=heuristic, + controller_class=controller_class, + cacheable_methods=cacheable_methods, + ) + sess.mount("http://", adapter) + sess.mount("https://", adapter) + + return sess diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pygments/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pygments/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6aaa6e23166c7a5d57d8fde7485e8c84777f4eec Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pygments/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pygments/__pycache__/lexer.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pygments/__pycache__/lexer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..874dda964ece6caf02586ed3e9a7a14f1fd7564d Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pygments/__pycache__/lexer.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d92acc7bedfc5c7c05130986a256e610640582e5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__init__.py @@ -0,0 +1,26 @@ +__all__ = [ + "__version__", + "AbstractProvider", + "AbstractResolver", + "BaseReporter", + "InconsistentCandidate", + "Resolver", + "RequirementsConflicted", + "ResolutionError", + "ResolutionImpossible", + "ResolutionTooDeep", +] + +__version__ = "1.0.1" + + 
+from .providers import AbstractProvider, AbstractResolver +from .reporters import BaseReporter +from .resolvers import ( + InconsistentCandidate, + RequirementsConflicted, + ResolutionError, + ResolutionImpossible, + ResolutionTooDeep, + Resolver, +) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd71034ff325c52eecd0c5d11e805174bb7a04a7 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/resolvers.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/resolvers.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aacc1e18664aaf0169a8dbc9dd62ee82871869f7 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/resolvers.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/structs.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/structs.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5959becda620237033f79edc05a79c29c5be11a Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/structs.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/reporters.py 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/reporters.py new file mode 100644 index 0000000000000000000000000000000000000000..688b5e10d8608fdb324c5df0ec3d9f4aa720de0e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/reporters.py @@ -0,0 +1,43 @@ +class BaseReporter(object): + """Delegate class to provider progress reporting for the resolver.""" + + def starting(self): + """Called before the resolution actually starts.""" + + def starting_round(self, index): + """Called before each round of resolution starts. + + The index is zero-based. + """ + + def ending_round(self, index, state): + """Called before each round of resolution ends. + + This is NOT called if the resolution ends at this round. Use `ending` + if you want to report finalization. The index is zero-based. + """ + + def ending(self, state): + """Called before the resolution ends successfully.""" + + def adding_requirement(self, requirement, parent): + """Called when adding a new requirement into the resolve criteria. + + :param requirement: The additional requirement to be applied to filter + the available candidaites. + :param parent: The candidate that requires ``requirement`` as a + dependency, or None if ``requirement`` is one of the root + requirements passed in from ``Resolver.resolve()``. + """ + + def resolving_conflicts(self, causes): + """Called when starting to attempt requirement conflict resolution. + + :param causes: The information on the collision that caused the backtracking. 
+ """ + + def rejecting_candidate(self, criterion, candidate): + """Called when rejecting a candidate during backtracking.""" + + def pinning(self, candidate): + """Called when adding a candidate to the potential solution.""" diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_macos.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_macos.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8afaadc45b70f43f28acdf1e6dee6aef3c402ede Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_macos.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_openssl.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_openssl.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89ca4ff6dd3d4720dde22e9532f297659f3a4ddb Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_openssl.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_ssl_constants.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_ssl_constants.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eedee3c7b1956e04dfd95ebeb25f12c7d2271926 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_ssl_constants.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/_macos.py 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/_macos.py new file mode 100644 index 0000000000000000000000000000000000000000..345030772441102eda1be73661469a5f726d668a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/_macos.py @@ -0,0 +1,571 @@ +import contextlib +import ctypes +import platform +import ssl +import typing +from ctypes import ( + CDLL, + POINTER, + c_bool, + c_char_p, + c_int32, + c_long, + c_uint32, + c_ulong, + c_void_p, +) +from ctypes.util import find_library + +from ._ssl_constants import _set_ssl_context_verify_mode + +_mac_version = platform.mac_ver()[0] +_mac_version_info = tuple(map(int, _mac_version.split("."))) +if _mac_version_info < (10, 8): + raise ImportError( + f"Only OS X 10.8 and newer are supported, not {_mac_version_info[0]}.{_mac_version_info[1]}" + ) + +_is_macos_version_10_14_or_later = _mac_version_info >= (10, 14) + + +def _load_cdll(name: str, macos10_16_path: str) -> CDLL: + """Loads a CDLL by name, falling back to known path on 10.16+""" + try: + # Big Sur is technically 11 but we use 10.16 due to the Big Sur + # beta being labeled as 10.16. 
+ path: str | None + if _mac_version_info >= (10, 16): + path = macos10_16_path + else: + path = find_library(name) + if not path: + raise OSError # Caught and reraised as 'ImportError' + return CDLL(path, use_errno=True) + except OSError: + raise ImportError(f"The library {name} failed to load") from None + + +Security = _load_cdll( + "Security", "/System/Library/Frameworks/Security.framework/Security" +) +CoreFoundation = _load_cdll( + "CoreFoundation", + "/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation", +) + +Boolean = c_bool +CFIndex = c_long +CFStringEncoding = c_uint32 +CFData = c_void_p +CFString = c_void_p +CFArray = c_void_p +CFMutableArray = c_void_p +CFError = c_void_p +CFType = c_void_p +CFTypeID = c_ulong +CFTypeRef = POINTER(CFType) +CFAllocatorRef = c_void_p + +OSStatus = c_int32 + +CFErrorRef = POINTER(CFError) +CFDataRef = POINTER(CFData) +CFStringRef = POINTER(CFString) +CFArrayRef = POINTER(CFArray) +CFMutableArrayRef = POINTER(CFMutableArray) +CFArrayCallBacks = c_void_p +CFOptionFlags = c_uint32 + +SecCertificateRef = POINTER(c_void_p) +SecPolicyRef = POINTER(c_void_p) +SecTrustRef = POINTER(c_void_p) +SecTrustResultType = c_uint32 +SecTrustOptionFlags = c_uint32 + +try: + Security.SecCertificateCreateWithData.argtypes = [CFAllocatorRef, CFDataRef] + Security.SecCertificateCreateWithData.restype = SecCertificateRef + + Security.SecCertificateCopyData.argtypes = [SecCertificateRef] + Security.SecCertificateCopyData.restype = CFDataRef + + Security.SecCopyErrorMessageString.argtypes = [OSStatus, c_void_p] + Security.SecCopyErrorMessageString.restype = CFStringRef + + Security.SecTrustSetAnchorCertificates.argtypes = [SecTrustRef, CFArrayRef] + Security.SecTrustSetAnchorCertificates.restype = OSStatus + + Security.SecTrustSetAnchorCertificatesOnly.argtypes = [SecTrustRef, Boolean] + Security.SecTrustSetAnchorCertificatesOnly.restype = OSStatus + + Security.SecPolicyCreateRevocation.argtypes = [CFOptionFlags] + 
Security.SecPolicyCreateRevocation.restype = SecPolicyRef + + Security.SecPolicyCreateSSL.argtypes = [Boolean, CFStringRef] + Security.SecPolicyCreateSSL.restype = SecPolicyRef + + Security.SecTrustCreateWithCertificates.argtypes = [ + CFTypeRef, + CFTypeRef, + POINTER(SecTrustRef), + ] + Security.SecTrustCreateWithCertificates.restype = OSStatus + + Security.SecTrustGetTrustResult.argtypes = [ + SecTrustRef, + POINTER(SecTrustResultType), + ] + Security.SecTrustGetTrustResult.restype = OSStatus + + Security.SecTrustEvaluate.argtypes = [ + SecTrustRef, + POINTER(SecTrustResultType), + ] + Security.SecTrustEvaluate.restype = OSStatus + + Security.SecTrustRef = SecTrustRef # type: ignore[attr-defined] + Security.SecTrustResultType = SecTrustResultType # type: ignore[attr-defined] + Security.OSStatus = OSStatus # type: ignore[attr-defined] + + kSecRevocationUseAnyAvailableMethod = 3 + kSecRevocationRequirePositiveResponse = 8 + + CoreFoundation.CFRelease.argtypes = [CFTypeRef] + CoreFoundation.CFRelease.restype = None + + CoreFoundation.CFGetTypeID.argtypes = [CFTypeRef] + CoreFoundation.CFGetTypeID.restype = CFTypeID + + CoreFoundation.CFStringCreateWithCString.argtypes = [ + CFAllocatorRef, + c_char_p, + CFStringEncoding, + ] + CoreFoundation.CFStringCreateWithCString.restype = CFStringRef + + CoreFoundation.CFStringGetCStringPtr.argtypes = [CFStringRef, CFStringEncoding] + CoreFoundation.CFStringGetCStringPtr.restype = c_char_p + + CoreFoundation.CFStringGetCString.argtypes = [ + CFStringRef, + c_char_p, + CFIndex, + CFStringEncoding, + ] + CoreFoundation.CFStringGetCString.restype = c_bool + + CoreFoundation.CFDataCreate.argtypes = [CFAllocatorRef, c_char_p, CFIndex] + CoreFoundation.CFDataCreate.restype = CFDataRef + + CoreFoundation.CFDataGetLength.argtypes = [CFDataRef] + CoreFoundation.CFDataGetLength.restype = CFIndex + + CoreFoundation.CFDataGetBytePtr.argtypes = [CFDataRef] + CoreFoundation.CFDataGetBytePtr.restype = c_void_p + + 
CoreFoundation.CFArrayCreate.argtypes = [ + CFAllocatorRef, + POINTER(CFTypeRef), + CFIndex, + CFArrayCallBacks, + ] + CoreFoundation.CFArrayCreate.restype = CFArrayRef + + CoreFoundation.CFArrayCreateMutable.argtypes = [ + CFAllocatorRef, + CFIndex, + CFArrayCallBacks, + ] + CoreFoundation.CFArrayCreateMutable.restype = CFMutableArrayRef + + CoreFoundation.CFArrayAppendValue.argtypes = [CFMutableArrayRef, c_void_p] + CoreFoundation.CFArrayAppendValue.restype = None + + CoreFoundation.CFArrayGetCount.argtypes = [CFArrayRef] + CoreFoundation.CFArrayGetCount.restype = CFIndex + + CoreFoundation.CFArrayGetValueAtIndex.argtypes = [CFArrayRef, CFIndex] + CoreFoundation.CFArrayGetValueAtIndex.restype = c_void_p + + CoreFoundation.CFErrorGetCode.argtypes = [CFErrorRef] + CoreFoundation.CFErrorGetCode.restype = CFIndex + + CoreFoundation.CFErrorCopyDescription.argtypes = [CFErrorRef] + CoreFoundation.CFErrorCopyDescription.restype = CFStringRef + + CoreFoundation.kCFAllocatorDefault = CFAllocatorRef.in_dll( # type: ignore[attr-defined] + CoreFoundation, "kCFAllocatorDefault" + ) + CoreFoundation.kCFTypeArrayCallBacks = c_void_p.in_dll( # type: ignore[attr-defined] + CoreFoundation, "kCFTypeArrayCallBacks" + ) + + CoreFoundation.CFTypeRef = CFTypeRef # type: ignore[attr-defined] + CoreFoundation.CFArrayRef = CFArrayRef # type: ignore[attr-defined] + CoreFoundation.CFStringRef = CFStringRef # type: ignore[attr-defined] + CoreFoundation.CFErrorRef = CFErrorRef # type: ignore[attr-defined] + +except AttributeError as e: + raise ImportError(f"Error initializing ctypes: {e}") from None + +# SecTrustEvaluateWithError is macOS 10.14+ +if _is_macos_version_10_14_or_later: + try: + Security.SecTrustEvaluateWithError.argtypes = [ + SecTrustRef, + POINTER(CFErrorRef), + ] + Security.SecTrustEvaluateWithError.restype = c_bool + except AttributeError as e: + raise ImportError(f"Error initializing ctypes: {e}") from None + + +def _handle_osstatus(result: OSStatus, _: typing.Any, args: 
typing.Any) -> typing.Any: + """ + Raises an error if the OSStatus value is non-zero. + """ + if int(result) == 0: + return args + + # Returns a CFString which we need to transform + # into a UTF-8 Python string. + error_message_cfstring = None + try: + error_message_cfstring = Security.SecCopyErrorMessageString(result, None) + + # First step is convert the CFString into a C string pointer. + # We try the fast no-copy way first. + error_message_cfstring_c_void_p = ctypes.cast( + error_message_cfstring, ctypes.POINTER(ctypes.c_void_p) + ) + message = CoreFoundation.CFStringGetCStringPtr( + error_message_cfstring_c_void_p, CFConst.kCFStringEncodingUTF8 + ) + + # Quoting the Apple dev docs: + # + # "A pointer to a C string or NULL if the internal + # storage of theString does not allow this to be + # returned efficiently." + # + # So we need to get our hands dirty. + if message is None: + buffer = ctypes.create_string_buffer(1024) + result = CoreFoundation.CFStringGetCString( + error_message_cfstring_c_void_p, + buffer, + 1024, + CFConst.kCFStringEncodingUTF8, + ) + if not result: + raise OSError("Error copying C string from CFStringRef") + message = buffer.value + + finally: + if error_message_cfstring is not None: + CoreFoundation.CFRelease(error_message_cfstring) + + # If no message can be found for this status we come + # up with a generic one that forwards the status code. 
+ if message is None or message == "": + message = f"SecureTransport operation returned a non-zero OSStatus: {result}" + + raise ssl.SSLError(message) + + +Security.SecTrustCreateWithCertificates.errcheck = _handle_osstatus # type: ignore[assignment] +Security.SecTrustSetAnchorCertificates.errcheck = _handle_osstatus # type: ignore[assignment] +Security.SecTrustSetAnchorCertificatesOnly.errcheck = _handle_osstatus # type: ignore[assignment] +Security.SecTrustGetTrustResult.errcheck = _handle_osstatus # type: ignore[assignment] +Security.SecTrustEvaluate.errcheck = _handle_osstatus # type: ignore[assignment] + + +class CFConst: + """CoreFoundation constants""" + + kCFStringEncodingUTF8 = CFStringEncoding(0x08000100) + + errSecIncompleteCertRevocationCheck = -67635 + errSecHostNameMismatch = -67602 + errSecCertificateExpired = -67818 + errSecNotTrusted = -67843 + + +def _bytes_to_cf_data_ref(value: bytes) -> CFDataRef: # type: ignore[valid-type] + return CoreFoundation.CFDataCreate( # type: ignore[no-any-return] + CoreFoundation.kCFAllocatorDefault, value, len(value) + ) + + +def _bytes_to_cf_string(value: bytes) -> CFString: + """ + Given a Python binary data, create a CFString. + The string must be CFReleased by the caller. + """ + c_str = ctypes.c_char_p(value) + cf_str = CoreFoundation.CFStringCreateWithCString( + CoreFoundation.kCFAllocatorDefault, + c_str, + CFConst.kCFStringEncodingUTF8, + ) + return cf_str # type: ignore[no-any-return] + + +def _cf_string_ref_to_str(cf_string_ref: CFStringRef) -> str | None: # type: ignore[valid-type] + """ + Creates a Unicode string from a CFString object. Used entirely for error + reporting. + Yes, it annoys me quite a lot that this function is this complex. 
+ """ + + string = CoreFoundation.CFStringGetCStringPtr( + cf_string_ref, CFConst.kCFStringEncodingUTF8 + ) + if string is None: + buffer = ctypes.create_string_buffer(1024) + result = CoreFoundation.CFStringGetCString( + cf_string_ref, buffer, 1024, CFConst.kCFStringEncodingUTF8 + ) + if not result: + raise OSError("Error copying C string from CFStringRef") + string = buffer.value + if string is not None: + string = string.decode("utf-8") + return string # type: ignore[no-any-return] + + +def _der_certs_to_cf_cert_array(certs: list[bytes]) -> CFMutableArrayRef: # type: ignore[valid-type] + """Builds a CFArray of SecCertificateRefs from a list of DER-encoded certificates. + Responsibility of the caller to call CoreFoundation.CFRelease on the CFArray. + """ + cf_array = CoreFoundation.CFArrayCreateMutable( + CoreFoundation.kCFAllocatorDefault, + 0, + ctypes.byref(CoreFoundation.kCFTypeArrayCallBacks), + ) + if not cf_array: + raise MemoryError("Unable to allocate memory!") + + for cert_data in certs: + cf_data = None + sec_cert_ref = None + try: + cf_data = _bytes_to_cf_data_ref(cert_data) + sec_cert_ref = Security.SecCertificateCreateWithData( + CoreFoundation.kCFAllocatorDefault, cf_data + ) + CoreFoundation.CFArrayAppendValue(cf_array, sec_cert_ref) + finally: + if cf_data: + CoreFoundation.CFRelease(cf_data) + if sec_cert_ref: + CoreFoundation.CFRelease(sec_cert_ref) + + return cf_array # type: ignore[no-any-return] + + +@contextlib.contextmanager +def _configure_context(ctx: ssl.SSLContext) -> typing.Iterator[None]: + check_hostname = ctx.check_hostname + verify_mode = ctx.verify_mode + ctx.check_hostname = False + _set_ssl_context_verify_mode(ctx, ssl.CERT_NONE) + try: + yield + finally: + ctx.check_hostname = check_hostname + _set_ssl_context_verify_mode(ctx, verify_mode) + + +def _verify_peercerts_impl( + ssl_context: ssl.SSLContext, + cert_chain: list[bytes], + server_hostname: str | None = None, +) -> None: + certs = None + policies = None + trust = None + 
try: + # Only set a hostname on the policy if we're verifying the hostname + # on the leaf certificate. + if server_hostname is not None and ssl_context.check_hostname: + cf_str_hostname = None + try: + cf_str_hostname = _bytes_to_cf_string(server_hostname.encode("ascii")) + ssl_policy = Security.SecPolicyCreateSSL(True, cf_str_hostname) + finally: + if cf_str_hostname: + CoreFoundation.CFRelease(cf_str_hostname) + else: + ssl_policy = Security.SecPolicyCreateSSL(True, None) + + policies = ssl_policy + if ssl_context.verify_flags & ssl.VERIFY_CRL_CHECK_CHAIN: + # Add explicit policy requiring positive revocation checks + policies = CoreFoundation.CFArrayCreateMutable( + CoreFoundation.kCFAllocatorDefault, + 0, + ctypes.byref(CoreFoundation.kCFTypeArrayCallBacks), + ) + CoreFoundation.CFArrayAppendValue(policies, ssl_policy) + CoreFoundation.CFRelease(ssl_policy) + revocation_policy = Security.SecPolicyCreateRevocation( + kSecRevocationUseAnyAvailableMethod + | kSecRevocationRequirePositiveResponse + ) + CoreFoundation.CFArrayAppendValue(policies, revocation_policy) + CoreFoundation.CFRelease(revocation_policy) + elif ssl_context.verify_flags & ssl.VERIFY_CRL_CHECK_LEAF: + raise NotImplementedError("VERIFY_CRL_CHECK_LEAF not implemented for macOS") + + certs = None + try: + certs = _der_certs_to_cf_cert_array(cert_chain) + + # Now that we have certificates loaded and a SecPolicy + # we can finally create a SecTrust object! + trust = Security.SecTrustRef() + Security.SecTrustCreateWithCertificates( + certs, policies, ctypes.byref(trust) + ) + + finally: + # The certs are now being held by SecTrust so we can + # release our handles for the array. + if certs: + CoreFoundation.CFRelease(certs) + + # If there are additional trust anchors to load we need to transform + # the list of DER-encoded certificates into a CFArray. 
+ ctx_ca_certs_der: list[bytes] | None = ssl_context.get_ca_certs( + binary_form=True + ) + if ctx_ca_certs_der: + ctx_ca_certs = None + try: + ctx_ca_certs = _der_certs_to_cf_cert_array(ctx_ca_certs_der) + Security.SecTrustSetAnchorCertificates(trust, ctx_ca_certs) + finally: + if ctx_ca_certs: + CoreFoundation.CFRelease(ctx_ca_certs) + + # We always want system certificates. + Security.SecTrustSetAnchorCertificatesOnly(trust, False) + + # macOS 10.13 and earlier don't support SecTrustEvaluateWithError() + # so we use SecTrustEvaluate() which means we need to construct error + # messages ourselves. + if _is_macos_version_10_14_or_later: + _verify_peercerts_impl_macos_10_14(ssl_context, trust) + else: + _verify_peercerts_impl_macos_10_13(ssl_context, trust) + finally: + if policies: + CoreFoundation.CFRelease(policies) + if trust: + CoreFoundation.CFRelease(trust) + + +def _verify_peercerts_impl_macos_10_13( + ssl_context: ssl.SSLContext, sec_trust_ref: typing.Any +) -> None: + """Verify using 'SecTrustEvaluate' API for macOS 10.13 and earlier. + macOS 10.14 added the 'SecTrustEvaluateWithError' API. + """ + sec_trust_result_type = Security.SecTrustResultType() + Security.SecTrustEvaluate(sec_trust_ref, ctypes.byref(sec_trust_result_type)) + + try: + sec_trust_result_type_as_int = int(sec_trust_result_type.value) + except (ValueError, TypeError): + sec_trust_result_type_as_int = -1 + + # Apple doesn't document these values in their own API docs. + # See: https://github.com/xybp888/iOS-SDKs/blob/master/iPhoneOS13.0.sdk/System/Library/Frameworks/Security.framework/Headers/SecTrust.h#L84 + if ( + ssl_context.verify_mode == ssl.CERT_REQUIRED + and sec_trust_result_type_as_int not in (1, 4) + ): + # Note that we're not able to ignore only hostname errors + # for macOS 10.13 and earlier, so check_hostname=False will + # still return an error. 
+ sec_trust_result_type_to_message = { + 0: "Invalid trust result type", + # 1: "Trust evaluation succeeded", + 2: "User confirmation required", + 3: "User specified that certificate is not trusted", + # 4: "Trust result is unspecified", + 5: "Recoverable trust failure occurred", + 6: "Fatal trust failure occurred", + 7: "Other error occurred, certificate may be revoked", + } + error_message = sec_trust_result_type_to_message.get( + sec_trust_result_type_as_int, + f"Unknown trust result: {sec_trust_result_type_as_int}", + ) + + err = ssl.SSLCertVerificationError(error_message) + err.verify_message = error_message + err.verify_code = sec_trust_result_type_as_int + raise err + + +def _verify_peercerts_impl_macos_10_14( + ssl_context: ssl.SSLContext, sec_trust_ref: typing.Any +) -> None: + """Verify using 'SecTrustEvaluateWithError' API for macOS 10.14+.""" + cf_error = CoreFoundation.CFErrorRef() + sec_trust_eval_result = Security.SecTrustEvaluateWithError( + sec_trust_ref, ctypes.byref(cf_error) + ) + # sec_trust_eval_result is a bool (0 or 1) + # where 1 means that the certs are trusted. + if sec_trust_eval_result == 1: + is_trusted = True + elif sec_trust_eval_result == 0: + is_trusted = False + else: + raise ssl.SSLError( + f"Unknown result from Security.SecTrustEvaluateWithError: {sec_trust_eval_result!r}" + ) + + cf_error_code = 0 + if not is_trusted: + cf_error_code = CoreFoundation.CFErrorGetCode(cf_error) + + # If the error is a known failure that we're + # explicitly okay with from SSLContext configuration + # we can set is_trusted accordingly. + if ssl_context.verify_mode != ssl.CERT_REQUIRED and ( + cf_error_code == CFConst.errSecNotTrusted + or cf_error_code == CFConst.errSecCertificateExpired + ): + is_trusted = True + + # If we're still not trusted then we start to + # construct and raise the SSLCertVerificationError. 
+ if not is_trusted: + cf_error_string_ref = None + try: + cf_error_string_ref = CoreFoundation.CFErrorCopyDescription(cf_error) + + # Can this ever return 'None' if there's a CFError? + cf_error_message = ( + _cf_string_ref_to_str(cf_error_string_ref) + or "Certificate verification failed" + ) + + # TODO: Not sure if we need the SecTrustResultType for anything? + # We only care whether or not it's a success or failure for now. + sec_trust_result_type = Security.SecTrustResultType() + Security.SecTrustGetTrustResult( + sec_trust_ref, ctypes.byref(sec_trust_result_type) + ) + + err = ssl.SSLCertVerificationError(cf_error_message) + err.verify_message = cf_error_message + err.verify_code = cf_error_code + raise err + finally: + if cf_error_string_ref: + CoreFoundation.CFRelease(cf_error_string_ref) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/__pycache__/commands.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/__pycache__/commands.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0919af6a4496fe653b4c0663ad1756f3d8b6ff8c Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/__pycache__/commands.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/class.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/class.h new file mode 100644 index 0000000000000000000000000000000000000000..b990507d629b4260d66d51e23a7f34a0fa465c9e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/class.h @@ -0,0 +1,767 @@ +/* + pybind11/detail/class.h: Python C API implementation details for py::class_ + + Copyright (c) 2017 Wenzel Jakob + + All rights reserved. 
Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include +#include + +#include "exception_translation.h" + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +PYBIND11_NAMESPACE_BEGIN(detail) + +#if !defined(PYPY_VERSION) +# define PYBIND11_BUILTIN_QUALNAME +# define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj) +#else +// In PyPy, we still set __qualname__ so that we can produce reliable function type +// signatures; in CPython this macro expands to nothing: +# define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj) \ + setattr((PyObject *) obj, "__qualname__", nameobj) +#endif + +inline std::string get_fully_qualified_tp_name(PyTypeObject *type) { +#if !defined(PYPY_VERSION) + return type->tp_name; +#else + auto module_name = handle((PyObject *) type).attr("__module__").cast(); + if (module_name == PYBIND11_BUILTINS_MODULE) + return type->tp_name; + else + return std::move(module_name) + "." + type->tp_name; +#endif +} + +inline PyTypeObject *type_incref(PyTypeObject *type) { + Py_INCREF(type); + return type; +} + +#if !defined(PYPY_VERSION) + +/// `pybind11_static_property.__get__()`: Always pass the class instead of the instance. +extern "C" inline PyObject *pybind11_static_get(PyObject *self, PyObject * /*ob*/, PyObject *cls) { + return PyProperty_Type.tp_descr_get(self, cls, cls); +} + +/// `pybind11_static_property.__set__()`: Just like the above `__get__()`. +extern "C" inline int pybind11_static_set(PyObject *self, PyObject *obj, PyObject *value) { + PyObject *cls = PyType_Check(obj) ? obj : (PyObject *) Py_TYPE(obj); + return PyProperty_Type.tp_descr_set(self, cls, value); +} + +// Forward declaration to use in `make_static_property_type()` +inline void enable_dynamic_attributes(PyHeapTypeObject *heap_type); + +/** A `static_property` is the same as a `property` but the `__get__()` and `__set__()` + methods are modified to always use the object type instead of a concrete instance. 
+ Return value: New reference. */ +inline PyTypeObject *make_static_property_type() { + constexpr auto *name = "pybind11_static_property"; + auto name_obj = reinterpret_steal(PYBIND11_FROM_STRING(name)); + + /* Danger zone: from now (and until PyType_Ready), make sure to + issue no Python C API calls which could potentially invoke the + garbage collector (the GC will call type_traverse(), which will in + turn find the newly constructed type in an invalid state) */ + auto *heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0); + if (!heap_type) { + pybind11_fail("make_static_property_type(): error allocating type!"); + } + + heap_type->ht_name = name_obj.inc_ref().ptr(); +# ifdef PYBIND11_BUILTIN_QUALNAME + heap_type->ht_qualname = name_obj.inc_ref().ptr(); +# endif + + auto *type = &heap_type->ht_type; + type->tp_name = name; + type->tp_base = type_incref(&PyProperty_Type); + type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; + type->tp_descr_get = pybind11_static_get; + type->tp_descr_set = pybind11_static_set; + +# if PY_VERSION_HEX >= 0x030C0000 + // Since Python-3.12 property-derived types are required to + // have dynamic attributes (to set `__doc__`) + enable_dynamic_attributes(heap_type); +# endif + + if (PyType_Ready(type) < 0) { + pybind11_fail("make_static_property_type(): failure in PyType_Ready()!"); + } + + setattr((PyObject *) type, "__module__", str("pybind11_builtins")); + PYBIND11_SET_OLDPY_QUALNAME(type, name_obj); + + return type; +} + +#else // PYPY + +/** PyPy has some issues with the above C API, so we evaluate Python code instead. + This function will only be called once so performance isn't really a concern. + Return value: New reference. 
*/ +inline PyTypeObject *make_static_property_type() { + auto d = dict(); + PyObject *result = PyRun_String(R"(\ +class pybind11_static_property(property): + def __get__(self, obj, cls): + return property.__get__(self, cls, cls) + + def __set__(self, obj, value): + cls = obj if isinstance(obj, type) else type(obj) + property.__set__(self, cls, value) +)", + Py_file_input, + d.ptr(), + d.ptr()); + if (result == nullptr) + throw error_already_set(); + Py_DECREF(result); + return (PyTypeObject *) d["pybind11_static_property"].cast().release().ptr(); +} + +#endif // PYPY + +/** Types with static properties need to handle `Type.static_prop = x` in a specific way. + By default, Python replaces the `static_property` itself, but for wrapped C++ types + we need to call `static_property.__set__()` in order to propagate the new value to + the underlying C++ data structure. */ +extern "C" inline int pybind11_meta_setattro(PyObject *obj, PyObject *name, PyObject *value) { + // Use `_PyType_Lookup()` instead of `PyObject_GetAttr()` in order to get the raw + // descriptor (`property`) instead of calling `tp_descr_get` (`property.__get__()`). + PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name); + + // The following assignment combinations are possible: + // 1. `Type.static_prop = value` --> descr_set: `Type.static_prop.__set__(value)` + // 2. `Type.static_prop = other_static_prop` --> setattro: replace existing `static_prop` + // 3. `Type.regular_attribute = value` --> setattro: regular attribute assignment + auto *const static_prop = (PyObject *) get_internals().static_property_type; + const auto call_descr_set = (descr != nullptr) && (value != nullptr) + && (PyObject_IsInstance(descr, static_prop) != 0) + && (PyObject_IsInstance(value, static_prop) == 0); + if (call_descr_set) { + // Call `static_property.__set__()` instead of replacing the `static_property`. 
+#if !defined(PYPY_VERSION) + return Py_TYPE(descr)->tp_descr_set(descr, obj, value); +#else + if (PyObject *result = PyObject_CallMethod(descr, "__set__", "OO", obj, value)) { + Py_DECREF(result); + return 0; + } else { + return -1; + } +#endif + } else { + // Replace existing attribute. + return PyType_Type.tp_setattro(obj, name, value); + } +} + +/** + * Python 3's PyInstanceMethod_Type hides itself via its tp_descr_get, which prevents aliasing + * methods via cls.attr("m2") = cls.attr("m1"): instead the tp_descr_get returns a plain function, + * when called on a class, or a PyMethod, when called on an instance. Override that behaviour here + * to do a special case bypass for PyInstanceMethod_Types. + */ +extern "C" inline PyObject *pybind11_meta_getattro(PyObject *obj, PyObject *name) { + PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name); + if (descr && PyInstanceMethod_Check(descr)) { + Py_INCREF(descr); + return descr; + } + return PyType_Type.tp_getattro(obj, name); +} + +/// metaclass `__call__` function that is used to create all pybind11 objects. +extern "C" inline PyObject *pybind11_meta_call(PyObject *type, PyObject *args, PyObject *kwargs) { + + // use the default metaclass call to create/initialize the object + PyObject *self = PyType_Type.tp_call(type, args, kwargs); + if (self == nullptr) { + return nullptr; + } + + // Ensure that the base __init__ function(s) were called + values_and_holders vhs(self); + for (const auto &vh : vhs) { + if (!vh.holder_constructed() && !vhs.is_redundant_value_and_holder(vh)) { + PyErr_Format(PyExc_TypeError, + "%.200s.__init__() must be called when overriding __init__", + get_fully_qualified_tp_name(vh.type->type).c_str()); + Py_DECREF(self); + return nullptr; + } + } + + return self; +} + +/// Cleanup the type-info for a pybind11-registered type. 
+extern "C" inline void pybind11_meta_dealloc(PyObject *obj) { + with_internals([obj](internals &internals) { + auto *type = (PyTypeObject *) obj; + + // A pybind11-registered type will: + // 1) be found in internals.registered_types_py + // 2) have exactly one associated `detail::type_info` + auto found_type = internals.registered_types_py.find(type); + if (found_type != internals.registered_types_py.end() && found_type->second.size() == 1 + && found_type->second[0]->type == type) { + + auto *tinfo = found_type->second[0]; + auto tindex = std::type_index(*tinfo->cpptype); + internals.direct_conversions.erase(tindex); + + if (tinfo->module_local) { + get_local_internals().registered_types_cpp.erase(tindex); + } else { + internals.registered_types_cpp.erase(tindex); + } + internals.registered_types_py.erase(tinfo->type); + + // Actually just `std::erase_if`, but that's only available in C++20 + auto &cache = internals.inactive_override_cache; + for (auto it = cache.begin(), last = cache.end(); it != last;) { + if (it->first == (PyObject *) tinfo->type) { + it = cache.erase(it); + } else { + ++it; + } + } + + delete tinfo; + } + }); + + PyType_Type.tp_dealloc(obj); +} + +/** This metaclass is assigned by default to all pybind11 types and is required in order + for static properties to function correctly. Users may override this using `py::metaclass`. + Return value: New reference. 
*/ +inline PyTypeObject *make_default_metaclass() { + constexpr auto *name = "pybind11_type"; + auto name_obj = reinterpret_steal(PYBIND11_FROM_STRING(name)); + + /* Danger zone: from now (and until PyType_Ready), make sure to + issue no Python C API calls which could potentially invoke the + garbage collector (the GC will call type_traverse(), which will in + turn find the newly constructed type in an invalid state) */ + auto *heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0); + if (!heap_type) { + pybind11_fail("make_default_metaclass(): error allocating metaclass!"); + } + + heap_type->ht_name = name_obj.inc_ref().ptr(); +#ifdef PYBIND11_BUILTIN_QUALNAME + heap_type->ht_qualname = name_obj.inc_ref().ptr(); +#endif + + auto *type = &heap_type->ht_type; + type->tp_name = name; + type->tp_base = type_incref(&PyType_Type); + type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; + + type->tp_call = pybind11_meta_call; + + type->tp_setattro = pybind11_meta_setattro; + type->tp_getattro = pybind11_meta_getattro; + + type->tp_dealloc = pybind11_meta_dealloc; + + if (PyType_Ready(type) < 0) { + pybind11_fail("make_default_metaclass(): failure in PyType_Ready()!"); + } + + setattr((PyObject *) type, "__module__", str("pybind11_builtins")); + PYBIND11_SET_OLDPY_QUALNAME(type, name_obj); + + return type; +} + +/// For multiple inheritance types we need to recursively register/deregister base pointers for any +/// base classes with pointers that are difference from the instance value pointer so that we can +/// correctly recognize an offset base class pointer. This calls a function with any offset base +/// ptrs. 
+inline void traverse_offset_bases(void *valueptr, + const detail::type_info *tinfo, + instance *self, + bool (*f)(void * /*parentptr*/, instance * /*self*/)) { + for (handle h : reinterpret_borrow(tinfo->type->tp_bases)) { + if (auto *parent_tinfo = get_type_info((PyTypeObject *) h.ptr())) { + for (auto &c : parent_tinfo->implicit_casts) { + if (c.first == tinfo->cpptype) { + auto *parentptr = c.second(valueptr); + if (parentptr != valueptr) { + f(parentptr, self); + } + traverse_offset_bases(parentptr, parent_tinfo, self, f); + break; + } + } + } + } +} + +inline bool register_instance_impl(void *ptr, instance *self) { + with_instance_map(ptr, [&](instance_map &instances) { instances.emplace(ptr, self); }); + return true; // unused, but gives the same signature as the deregister func +} +inline bool deregister_instance_impl(void *ptr, instance *self) { + return with_instance_map(ptr, [&](instance_map &instances) { + auto range = instances.equal_range(ptr); + for (auto it = range.first; it != range.second; ++it) { + if (self == it->second) { + instances.erase(it); + return true; + } + } + return false; + }); +} + +inline void register_instance(instance *self, void *valptr, const type_info *tinfo) { + register_instance_impl(valptr, self); + if (!tinfo->simple_ancestors) { + traverse_offset_bases(valptr, tinfo, self, register_instance_impl); + } +} + +inline bool deregister_instance(instance *self, void *valptr, const type_info *tinfo) { + bool ret = deregister_instance_impl(valptr, self); + if (!tinfo->simple_ancestors) { + traverse_offset_bases(valptr, tinfo, self, deregister_instance_impl); + } + return ret; +} + +/// Instance creation function for all pybind11 types. It allocates the internal instance layout +/// for holding C++ objects and holders. Allocation is done lazily (the first time the instance is +/// cast to a reference or pointer), and initialization is done by an `__init__` function. 
+inline PyObject *make_new_instance(PyTypeObject *type) { +#if defined(PYPY_VERSION) + // PyPy gets tp_basicsize wrong (issue 2482) under multiple inheritance when the first + // inherited object is a plain Python type (i.e. not derived from an extension type). Fix it. + ssize_t instance_size = static_cast(sizeof(instance)); + if (type->tp_basicsize < instance_size) { + type->tp_basicsize = instance_size; + } +#endif + PyObject *self = type->tp_alloc(type, 0); + auto *inst = reinterpret_cast(self); + // Allocate the value/holder internals: + inst->allocate_layout(); + + return self; +} + +/// Instance creation function for all pybind11 types. It only allocates space for the +/// C++ object, but doesn't call the constructor -- an `__init__` function must do that. +extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *, PyObject *) { + return make_new_instance(type); +} + +/// An `__init__` function constructs the C++ object. Users should provide at least one +/// of these using `py::init` or directly with `.def(__init__, ...)`. Otherwise, the +/// following default function will be used which simply throws an exception. 
+extern "C" inline int pybind11_object_init(PyObject *self, PyObject *, PyObject *) { + PyTypeObject *type = Py_TYPE(self); + std::string msg = get_fully_qualified_tp_name(type) + ": No constructor defined!"; + set_error(PyExc_TypeError, msg.c_str()); + return -1; +} + +inline void add_patient(PyObject *nurse, PyObject *patient) { + auto *instance = reinterpret_cast(nurse); + instance->has_patients = true; + Py_INCREF(patient); + + with_internals([&](internals &internals) { internals.patients[nurse].push_back(patient); }); +} + +inline void clear_patients(PyObject *self) { + auto *instance = reinterpret_cast(self); + std::vector patients; + + with_internals([&](internals &internals) { + auto pos = internals.patients.find(self); + + if (pos == internals.patients.end()) { + pybind11_fail( + "FATAL: Internal consistency check failed: Invalid clear_patients() call."); + } + + // Clearing the patients can cause more Python code to run, which + // can invalidate the iterator. Extract the vector of patients + // from the unordered_map first. + patients = std::move(pos->second); + internals.patients.erase(pos); + }); + + instance->has_patients = false; + for (PyObject *&patient : patients) { + Py_CLEAR(patient); + } +} + +/// Clears all internal data from the instance and removes it from registered instances in +/// preparation for deallocation. +inline void clear_instance(PyObject *self) { + auto *instance = reinterpret_cast(self); + + // Deallocate any values/holders, if present: + for (auto &v_h : values_and_holders(instance)) { + if (v_h) { + + // We have to deregister before we call dealloc because, for virtual MI types, we still + // need to be able to get the parent pointers. 
+ if (v_h.instance_registered() + && !deregister_instance(instance, v_h.value_ptr(), v_h.type)) { + pybind11_fail( + "pybind11_object_dealloc(): Tried to deallocate unregistered instance!"); + } + + if (instance->owned || v_h.holder_constructed()) { + v_h.type->dealloc(v_h); + } + } + } + // Deallocate the value/holder layout internals: + instance->deallocate_layout(); + + if (instance->weakrefs) { + PyObject_ClearWeakRefs(self); + } + + PyObject **dict_ptr = _PyObject_GetDictPtr(self); + if (dict_ptr) { + Py_CLEAR(*dict_ptr); + } + + if (instance->has_patients) { + clear_patients(self); + } +} + +/// Instance destructor function for all pybind11 types. It calls `type_info.dealloc` +/// to destroy the C++ object itself, while the rest is Python bookkeeping. +extern "C" inline void pybind11_object_dealloc(PyObject *self) { + auto *type = Py_TYPE(self); + + // If this is a GC tracked object, untrack it first + // Note that the track call is implicitly done by the + // default tp_alloc, which we never override. + if (PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC) != 0) { + PyObject_GC_UnTrack(self); + } + + clear_instance(self); + + type->tp_free(self); + +#if PY_VERSION_HEX < 0x03080000 + // `type->tp_dealloc != pybind11_object_dealloc` means that we're being called + // as part of a derived type's dealloc, in which case we're not allowed to decref + // the type here. For cross-module compatibility, we shouldn't compare directly + // with `pybind11_object_dealloc`, but with the common one stashed in internals. + auto pybind11_object_type = (PyTypeObject *) get_internals().instance_base; + if (type->tp_dealloc == pybind11_object_type->tp_dealloc) + Py_DECREF(type); +#else + // This was not needed before Python 3.8 (Python issue 35810) + // https://github.com/pybind/pybind11/issues/1946 + Py_DECREF(type); +#endif +} + +std::string error_string(); + +/** Create the type which can be used as a common base for all classes. 
This is + needed in order to satisfy Python's requirements for multiple inheritance. + Return value: New reference. */ +inline PyObject *make_object_base_type(PyTypeObject *metaclass) { + constexpr auto *name = "pybind11_object"; + auto name_obj = reinterpret_steal(PYBIND11_FROM_STRING(name)); + + /* Danger zone: from now (and until PyType_Ready), make sure to + issue no Python C API calls which could potentially invoke the + garbage collector (the GC will call type_traverse(), which will in + turn find the newly constructed type in an invalid state) */ + auto *heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0); + if (!heap_type) { + pybind11_fail("make_object_base_type(): error allocating type!"); + } + + heap_type->ht_name = name_obj.inc_ref().ptr(); +#ifdef PYBIND11_BUILTIN_QUALNAME + heap_type->ht_qualname = name_obj.inc_ref().ptr(); +#endif + + auto *type = &heap_type->ht_type; + type->tp_name = name; + type->tp_base = type_incref(&PyBaseObject_Type); + type->tp_basicsize = static_cast(sizeof(instance)); + type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; + + type->tp_new = pybind11_object_new; + type->tp_init = pybind11_object_init; + type->tp_dealloc = pybind11_object_dealloc; + + /* Support weak references (needed for the keep_alive feature) */ + type->tp_weaklistoffset = offsetof(instance, weakrefs); + + if (PyType_Ready(type) < 0) { + pybind11_fail("PyType_Ready failed in make_object_base_type(): " + error_string()); + } + + setattr((PyObject *) type, "__module__", str("pybind11_builtins")); + PYBIND11_SET_OLDPY_QUALNAME(type, name_obj); + + assert(!PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC)); + return (PyObject *) heap_type; +} + +/// dynamic_attr: Allow the garbage collector to traverse the internal instance `__dict__`. 
+extern "C" inline int pybind11_traverse(PyObject *self, visitproc visit, void *arg) { +#if PY_VERSION_HEX >= 0x030D0000 + PyObject_VisitManagedDict(self, visit, arg); +#else + PyObject *&dict = *_PyObject_GetDictPtr(self); + Py_VISIT(dict); +#endif +// https://docs.python.org/3/c-api/typeobj.html#c.PyTypeObject.tp_traverse +#if PY_VERSION_HEX >= 0x03090000 + Py_VISIT(Py_TYPE(self)); +#endif + return 0; +} + +/// dynamic_attr: Allow the GC to clear the dictionary. +extern "C" inline int pybind11_clear(PyObject *self) { +#if PY_VERSION_HEX >= 0x030D0000 + PyObject_ClearManagedDict(self); +#else + PyObject *&dict = *_PyObject_GetDictPtr(self); + Py_CLEAR(dict); +#endif + return 0; +} + +/// Give instances of this type a `__dict__` and opt into garbage collection. +inline void enable_dynamic_attributes(PyHeapTypeObject *heap_type) { + auto *type = &heap_type->ht_type; + type->tp_flags |= Py_TPFLAGS_HAVE_GC; +#if PY_VERSION_HEX < 0x030B0000 + type->tp_dictoffset = type->tp_basicsize; // place dict at the end + type->tp_basicsize += (ssize_t) sizeof(PyObject *); // and allocate enough space for it +#else + type->tp_flags |= Py_TPFLAGS_MANAGED_DICT; +#endif + type->tp_traverse = pybind11_traverse; + type->tp_clear = pybind11_clear; + + static PyGetSetDef getset[] + = {{"__dict__", PyObject_GenericGetDict, PyObject_GenericSetDict, nullptr, nullptr}, + {nullptr, nullptr, nullptr, nullptr, nullptr}}; + type->tp_getset = getset; +} + +/// buffer_protocol: Fill in the view as specified by flags. +extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view, int flags) { + // Look for a `get_buffer` implementation in this type's info or any bases (following MRO). 
+ type_info *tinfo = nullptr; + for (auto type : reinterpret_borrow(Py_TYPE(obj)->tp_mro)) { + tinfo = get_type_info((PyTypeObject *) type.ptr()); + if (tinfo && tinfo->get_buffer) { + break; + } + } + if (view == nullptr || !tinfo || !tinfo->get_buffer) { + if (view) { + view->obj = nullptr; + } + set_error(PyExc_BufferError, "pybind11_getbuffer(): Internal error"); + return -1; + } + std::memset(view, 0, sizeof(Py_buffer)); + buffer_info *info = nullptr; + try { + info = tinfo->get_buffer(obj, tinfo->get_buffer_data); + } catch (...) { + try_translate_exceptions(); + raise_from(PyExc_BufferError, "Error getting buffer"); + return -1; + } + if (info == nullptr) { + pybind11_fail("FATAL UNEXPECTED SITUATION: tinfo->get_buffer() returned nullptr."); + } + + if ((flags & PyBUF_WRITABLE) == PyBUF_WRITABLE && info->readonly) { + delete info; + // view->obj = nullptr; // Was just memset to 0, so not necessary + set_error(PyExc_BufferError, "Writable buffer requested for readonly storage"); + return -1; + } + view->obj = obj; + view->ndim = 1; + view->internal = info; + view->buf = info->ptr; + view->itemsize = info->itemsize; + view->len = view->itemsize; + for (auto s : info->shape) { + view->len *= s; + } + view->readonly = static_cast(info->readonly); + if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) { + view->format = const_cast(info->format.c_str()); + } + if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) { + view->ndim = (int) info->ndim; + view->strides = info->strides.data(); + view->shape = info->shape.data(); + } + Py_INCREF(view->obj); + return 0; +} + +/// buffer_protocol: Release the resources of the buffer. +extern "C" inline void pybind11_releasebuffer(PyObject *, Py_buffer *view) { + delete (buffer_info *) view->internal; +} + +/// Give this type a buffer interface. 
+inline void enable_buffer_protocol(PyHeapTypeObject *heap_type) { + heap_type->ht_type.tp_as_buffer = &heap_type->as_buffer; + + heap_type->as_buffer.bf_getbuffer = pybind11_getbuffer; + heap_type->as_buffer.bf_releasebuffer = pybind11_releasebuffer; +} + +/** Create a brand new Python type according to the `type_record` specification. + Return value: New reference. */ +inline PyObject *make_new_python_type(const type_record &rec) { + auto name = reinterpret_steal(PYBIND11_FROM_STRING(rec.name)); + + auto qualname = name; + if (rec.scope && !PyModule_Check(rec.scope.ptr()) && hasattr(rec.scope, "__qualname__")) { + qualname = reinterpret_steal( + PyUnicode_FromFormat("%U.%U", rec.scope.attr("__qualname__").ptr(), name.ptr())); + } + + object module_; + if (rec.scope) { + if (hasattr(rec.scope, "__module__")) { + module_ = rec.scope.attr("__module__"); + } else if (hasattr(rec.scope, "__name__")) { + module_ = rec.scope.attr("__name__"); + } + } + + const auto *full_name = c_str( +#if !defined(PYPY_VERSION) + module_ ? str(module_).cast() + "." + rec.name : +#endif + rec.name); + + char *tp_doc = nullptr; + if (rec.doc && options::show_user_defined_docstrings()) { + /* Allocate memory for docstring (Python will free this later on) */ + size_t size = std::strlen(rec.doc) + 1; +#if PY_VERSION_HEX >= 0x030D0000 + tp_doc = (char *) PyMem_MALLOC(size); +#else + tp_doc = (char *) PyObject_MALLOC(size); +#endif + std::memcpy((void *) tp_doc, rec.doc, size); + } + + auto &internals = get_internals(); + auto bases = tuple(rec.bases); + auto *base = (bases.empty()) ? internals.instance_base : bases[0].ptr(); + + /* Danger zone: from now (and until PyType_Ready), make sure to + issue no Python C API calls which could potentially invoke the + garbage collector (the GC will call type_traverse(), which will in + turn find the newly constructed type in an invalid state) */ + auto *metaclass + = rec.metaclass.ptr() ? 
(PyTypeObject *) rec.metaclass.ptr() : internals.default_metaclass; + + auto *heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0); + if (!heap_type) { + pybind11_fail(std::string(rec.name) + ": Unable to create type object!"); + } + + heap_type->ht_name = name.release().ptr(); +#ifdef PYBIND11_BUILTIN_QUALNAME + heap_type->ht_qualname = qualname.inc_ref().ptr(); +#endif + + auto *type = &heap_type->ht_type; + type->tp_name = full_name; + type->tp_doc = tp_doc; + type->tp_base = type_incref((PyTypeObject *) base); + type->tp_basicsize = static_cast(sizeof(instance)); + if (!bases.empty()) { + type->tp_bases = bases.release().ptr(); + } + + /* Don't inherit base __init__ */ + type->tp_init = pybind11_object_init; + + /* Supported protocols */ + type->tp_as_number = &heap_type->as_number; + type->tp_as_sequence = &heap_type->as_sequence; + type->tp_as_mapping = &heap_type->as_mapping; + type->tp_as_async = &heap_type->as_async; + + /* Flags */ + type->tp_flags |= Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HEAPTYPE; + if (!rec.is_final) { + type->tp_flags |= Py_TPFLAGS_BASETYPE; + } + + if (rec.dynamic_attr) { + enable_dynamic_attributes(heap_type); + } + + if (rec.buffer_protocol) { + enable_buffer_protocol(heap_type); + } + + if (rec.custom_type_setup_callback) { + rec.custom_type_setup_callback(heap_type); + } + + if (PyType_Ready(type) < 0) { + pybind11_fail(std::string(rec.name) + ": PyType_Ready failed: " + error_string()); + } + + assert(!rec.dynamic_attr || PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC)); + + /* Register type with the parent scope */ + if (rec.scope) { + setattr(rec.scope, rec.name, (PyObject *) type); + } else { + Py_INCREF(type); // Keep it alive forever (reference leak) + } + + if (module_) { // Needed by pydoc + setattr((PyObject *) type, "__module__", module_); + } + + PYBIND11_SET_OLDPY_QUALNAME(type, qualname); + + return (PyObject *) type; +} + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/descr.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/descr.h new file mode 100644 index 0000000000000000000000000000000000000000..7d546311e78f76f92af13d745b84c3664228fdc9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/descr.h @@ -0,0 +1,172 @@ +/* + pybind11/detail/descr.h: Helper type for concatenating type signatures at compile time + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "common.h" + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +PYBIND11_NAMESPACE_BEGIN(detail) + +#if !defined(_MSC_VER) +# define PYBIND11_DESCR_CONSTEXPR static constexpr +#else +# define PYBIND11_DESCR_CONSTEXPR const +#endif + +/* Concatenate type signatures at compile time */ +template +struct descr { + char text[N + 1]{'\0'}; + + constexpr descr() = default; + // NOLINTNEXTLINE(google-explicit-constructor) + constexpr descr(char const (&s)[N + 1]) : descr(s, make_index_sequence()) {} + + template + constexpr descr(char const (&s)[N + 1], index_sequence) : text{s[Is]..., '\0'} {} + + template + // NOLINTNEXTLINE(google-explicit-constructor) + constexpr descr(char c, Chars... 
cs) : text{c, static_cast(cs)..., '\0'} {} + + static constexpr std::array types() { + return {{&typeid(Ts)..., nullptr}}; + } +}; + +template +constexpr descr plus_impl(const descr &a, + const descr &b, + index_sequence, + index_sequence) { + PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(b); + return {a.text[Is1]..., b.text[Is2]...}; +} + +template +constexpr descr operator+(const descr &a, + const descr &b) { + return plus_impl(a, b, make_index_sequence(), make_index_sequence()); +} + +template +constexpr descr const_name(char const (&text)[N]) { + return descr(text); +} +constexpr descr<0> const_name(char const (&)[1]) { return {}; } + +template +struct int_to_str : int_to_str {}; +template +struct int_to_str<0, Digits...> { + // WARNING: This only works with C++17 or higher. + static constexpr auto digits = descr(('0' + Digits)...); +}; + +// Ternary description (like std::conditional) +template +constexpr enable_if_t> const_name(char const (&text1)[N1], char const (&)[N2]) { + return const_name(text1); +} +template +constexpr enable_if_t> const_name(char const (&)[N1], char const (&text2)[N2]) { + return const_name(text2); +} + +template +constexpr enable_if_t const_name(const T1 &d, const T2 &) { + return d; +} +template +constexpr enable_if_t const_name(const T1 &, const T2 &d) { + return d; +} + +template +auto constexpr const_name() -> remove_cv_t::digits)> { + return int_to_str::digits; +} + +template +constexpr descr<1, Type> const_name() { + return {'%'}; +} + +// If "_" is defined as a macro, py::detail::_ cannot be provided. +// It is therefore best to use py::detail::const_name universally. +// This block is for backward compatibility only. +// (The const_name code is repeated to avoid introducing a "_" #define ourselves.) 
+#ifndef _ +# define PYBIND11_DETAIL_UNDERSCORE_BACKWARD_COMPATIBILITY +template +constexpr descr _(char const (&text)[N]) { + return const_name(text); +} +template +constexpr enable_if_t> _(char const (&text1)[N1], char const (&text2)[N2]) { + return const_name(text1, text2); +} +template +constexpr enable_if_t> _(char const (&text1)[N1], char const (&text2)[N2]) { + return const_name(text1, text2); +} +template +constexpr enable_if_t _(const T1 &d1, const T2 &d2) { + return const_name(d1, d2); +} +template +constexpr enable_if_t _(const T1 &d1, const T2 &d2) { + return const_name(d1, d2); +} + +template +auto constexpr _() -> remove_cv_t::digits)> { + return const_name(); +} +template +constexpr descr<1, Type> _() { + return const_name(); +} +#endif // #ifndef _ + +constexpr descr<0> concat() { return {}; } + +template +constexpr descr concat(const descr &descr) { + return descr; +} + +#ifdef __cpp_fold_expressions +template +constexpr descr operator,(const descr &a, + const descr &b) { + return a + const_name(", ") + b; +} + +template +constexpr auto concat(const descr &d, const Args &...args) { + return (d, ..., args); +} +#else +template +constexpr auto concat(const descr &d, + const Args &...args) -> decltype(std::declval>() + + concat(args...)) { + return d + const_name(", ") + concat(args...); +} +#endif + +template +constexpr descr type_descr(const descr &descr) { + return const_name("{") + descr + const_name("}"); +} + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/exception_translation.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/exception_translation.h new file mode 100644 index 0000000000000000000000000000000000000000..2764180bb078c4ed2b9648de9d6752040256d295 --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/exception_translation.h @@ -0,0 +1,71 @@ +/* + pybind11/detail/exception_translation.h: means to translate C++ exceptions to Python exceptions + + Copyright (c) 2024 The Pybind Development Team. + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "common.h" +#include "internals.h" + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +PYBIND11_NAMESPACE_BEGIN(detail) + +// Apply all the extensions translators from a list +// Return true if one of the translators completed without raising an exception +// itself. Return of false indicates that if there are other translators +// available, they should be tried. +inline bool apply_exception_translators(std::forward_list &translators) { + auto last_exception = std::current_exception(); + + for (auto &translator : translators) { + try { + translator(last_exception); + return true; + } catch (...) { + last_exception = std::current_exception(); + } + } + return false; +} + +inline void try_translate_exceptions() { + /* When an exception is caught, give each registered exception + translator a chance to translate it to a Python exception. First + all module-local translators will be tried in reverse order of + registration. If none of the module-locale translators handle + the exception (or there are no module-locale translators) then + the global translators will be tried, also in reverse order of + registration. + + A translator may choose to do one of the following: + + - catch the exception and call py::set_error() + to set a standard (or custom) Python exception, or + - do nothing and let the exception fall through to the next translator, or + - delegate translation to the next translator by throwing a new type of exception. 
+ */ + + bool handled = with_internals([&](internals &internals) { + auto &local_exception_translators = get_local_internals().registered_exception_translators; + if (detail::apply_exception_translators(local_exception_translators)) { + return true; + } + auto &exception_translators = internals.registered_exception_translators; + if (detail::apply_exception_translators(exception_translators)) { + return true; + } + return false; + }); + + if (!handled) { + set_error(PyExc_SystemError, "Exception escaped from default exception translator!"); + } +} + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/type_caster_base.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/type_caster_base.h new file mode 100644 index 0000000000000000000000000000000000000000..e40e44ba6cc9a92d0e5a697a2478477f78320386 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/type_caster_base.h @@ -0,0 +1,1195 @@ +/* + pybind11/detail/type_caster_base.h (originally first part of pybind11/cast.h) + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include + +#include "common.h" +#include "cpp_conduit.h" +#include "descr.h" +#include "internals.h" +#include "typeid.h" +#include "value_and_holder.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +PYBIND11_NAMESPACE_BEGIN(detail) + +/// A life support system for temporary objects created by `type_caster::load()`. +/// Adding a patient will keep it alive up until the enclosing function returns. 
+class loader_life_support { +private: + loader_life_support *parent = nullptr; + std::unordered_set keep_alive; + + // Store stack pointer in thread-local storage. + static PYBIND11_TLS_KEY_REF get_stack_tls_key() { +#if PYBIND11_INTERNALS_VERSION == 4 + return get_local_internals().loader_life_support_tls_key; +#else + return get_internals().loader_life_support_tls_key; +#endif + } + static loader_life_support *get_stack_top() { + return static_cast(PYBIND11_TLS_GET_VALUE(get_stack_tls_key())); + } + static void set_stack_top(loader_life_support *value) { + PYBIND11_TLS_REPLACE_VALUE(get_stack_tls_key(), value); + } + +public: + /// A new patient frame is created when a function is entered + loader_life_support() : parent{get_stack_top()} { set_stack_top(this); } + + /// ... and destroyed after it returns + ~loader_life_support() { + if (get_stack_top() != this) { + pybind11_fail("loader_life_support: internal error"); + } + set_stack_top(parent); + for (auto *item : keep_alive) { + Py_DECREF(item); + } + } + + /// This can only be used inside a pybind11-bound function, either by `argument_loader` + /// at argument preparation time or by `py::cast()` at execution time. + PYBIND11_NOINLINE static void add_patient(handle h) { + loader_life_support *frame = get_stack_top(); + if (!frame) { + // NOTE: It would be nice to include the stack frames here, as this indicates + // use of pybind11::cast<> outside the normal call framework, finding such + // a location is challenging. Developers could consider printing out + // stack frame addresses here using something like __builtin_frame_address(0) + throw cast_error("When called outside a bound function, py::cast() cannot " + "do Python -> C++ conversions which require the creation " + "of temporary values"); + } + + if (frame->keep_alive.insert(h.ptr()).second) { + Py_INCREF(h.ptr()); + } + } +}; + +// Gets the cache entry for the given type, creating it if necessary. 
The return value is the pair +// returned by emplace, i.e. an iterator for the entry and a bool set to `true` if the entry was +// just created. +inline std::pair +all_type_info_get_cache(PyTypeObject *type); + +// Band-aid workaround to fix a subtle but serious bug in a minimalistic fashion. See PR #4762. +inline void all_type_info_add_base_most_derived_first(std::vector &bases, + type_info *addl_base) { + for (auto it = bases.begin(); it != bases.end(); it++) { + type_info *existing_base = *it; + if (PyType_IsSubtype(addl_base->type, existing_base->type) != 0) { + bases.insert(it, addl_base); + return; + } + } + bases.push_back(addl_base); +} + +// Populates a just-created cache entry. +PYBIND11_NOINLINE void all_type_info_populate(PyTypeObject *t, std::vector &bases) { + assert(bases.empty()); + std::vector check; + for (handle parent : reinterpret_borrow(t->tp_bases)) { + check.push_back((PyTypeObject *) parent.ptr()); + } + + auto const &type_dict = get_internals().registered_types_py; + for (size_t i = 0; i < check.size(); i++) { + auto *type = check[i]; + // Ignore Python2 old-style class super type: + if (!PyType_Check((PyObject *) type)) { + continue; + } + + // Check `type` in the current set of registered python types: + auto it = type_dict.find(type); + if (it != type_dict.end()) { + // We found a cache entry for it, so it's either pybind-registered or has pre-computed + // pybind bases, but we have to make sure we haven't already seen the type(s) before: + // we want to follow Python/virtual C++ rules that there should only be one instance of + // a common base. + for (auto *tinfo : it->second) { + // NB: Could use a second set here, rather than doing a linear search, but since + // having a large number of immediate pybind11-registered types seems fairly + // unlikely, that probably isn't worthwhile. 
+ bool found = false; + for (auto *known : bases) { + if (known == tinfo) { + found = true; + break; + } + } + if (!found) { + all_type_info_add_base_most_derived_first(bases, tinfo); + } + } + } else if (type->tp_bases) { + // It's some python type, so keep follow its bases classes to look for one or more + // registered types + if (i + 1 == check.size()) { + // When we're at the end, we can pop off the current element to avoid growing + // `check` when adding just one base (which is typical--i.e. when there is no + // multiple inheritance) + check.pop_back(); + i--; + } + for (handle parent : reinterpret_borrow(type->tp_bases)) { + check.push_back((PyTypeObject *) parent.ptr()); + } + } + } +} + +/** + * Extracts vector of type_info pointers of pybind-registered roots of the given Python type. Will + * be just 1 pybind type for the Python type of a pybind-registered class, or for any Python-side + * derived class that uses single inheritance. Will contain as many types as required for a Python + * class that uses multiple inheritance to inherit (directly or indirectly) from multiple + * pybind-registered classes. Will be empty if neither the type nor any base classes are + * pybind-registered. + * + * The value is cached for the lifetime of the Python type. + */ +inline const std::vector &all_type_info(PyTypeObject *type) { + auto ins = all_type_info_get_cache(type); + if (ins.second) { + // New cache entry: populate it + all_type_info_populate(type, ins.first->second); + } + + return ins.first->second; +} + +/** + * Gets a single pybind11 type info for a python type. Returns nullptr if neither the type nor any + * ancestors are pybind11-registered. Throws an exception if there are multiple bases--use + * `all_type_info` instead if you want to support multiple bases. 
+ */ +PYBIND11_NOINLINE detail::type_info *get_type_info(PyTypeObject *type) { + const auto &bases = all_type_info(type); + if (bases.empty()) { + return nullptr; + } + if (bases.size() > 1) { + pybind11_fail( + "pybind11::detail::get_type_info: type has multiple pybind11-registered bases"); + } + return bases.front(); +} + +inline detail::type_info *get_local_type_info(const std::type_index &tp) { + auto &locals = get_local_internals().registered_types_cpp; + auto it = locals.find(tp); + if (it != locals.end()) { + return it->second; + } + return nullptr; +} + +inline detail::type_info *get_global_type_info(const std::type_index &tp) { + return with_internals([&](internals &internals) { + detail::type_info *type_info = nullptr; + auto &types = internals.registered_types_cpp; + auto it = types.find(tp); + if (it != types.end()) { + type_info = it->second; + } + return type_info; + }); +} + +/// Return the type info for a given C++ type; on lookup failure can either throw or return +/// nullptr. +PYBIND11_NOINLINE detail::type_info *get_type_info(const std::type_index &tp, + bool throw_if_missing = false) { + if (auto *ltype = get_local_type_info(tp)) { + return ltype; + } + if (auto *gtype = get_global_type_info(tp)) { + return gtype; + } + + if (throw_if_missing) { + std::string tname = tp.name(); + detail::clean_type_id(tname); + pybind11_fail("pybind11::detail::get_type_info: unable to find type info for \"" + + std::move(tname) + '"'); + } + return nullptr; +} + +PYBIND11_NOINLINE handle get_type_handle(const std::type_info &tp, bool throw_if_missing) { + detail::type_info *type_info = get_type_info(tp, throw_if_missing); + return handle(type_info ? ((PyObject *) type_info->type) : nullptr); +} + +// Searches the inheritance graph for a registered Python instance, using all_type_info(). 
+PYBIND11_NOINLINE handle find_registered_python_instance(void *src, + const detail::type_info *tinfo) { + return with_instance_map(src, [&](instance_map &instances) { + auto it_instances = instances.equal_range(src); + for (auto it_i = it_instances.first; it_i != it_instances.second; ++it_i) { + for (auto *instance_type : detail::all_type_info(Py_TYPE(it_i->second))) { + if (instance_type && same_type(*instance_type->cpptype, *tinfo->cpptype)) { + return handle((PyObject *) it_i->second).inc_ref(); + } + } + } + return handle(); + }); +} + +// Container for accessing and iterating over an instance's values/holders +struct values_and_holders { +private: + instance *inst; + using type_vec = std::vector; + const type_vec &tinfo; + +public: + explicit values_and_holders(instance *inst) + : inst{inst}, tinfo(all_type_info(Py_TYPE(inst))) {} + + explicit values_and_holders(PyObject *obj) + : inst{nullptr}, tinfo(all_type_info(Py_TYPE(obj))) { + if (!tinfo.empty()) { + inst = reinterpret_cast(obj); + } + } + + struct iterator { + private: + instance *inst = nullptr; + const type_vec *types = nullptr; + value_and_holder curr; + friend struct values_and_holders; + iterator(instance *inst, const type_vec *tinfo) : inst{inst}, types{tinfo} { + if (inst != nullptr) { + assert(!types->empty()); + curr = value_and_holder( + inst /* instance */, + (*types)[0] /* type info */, + 0, /* vpos: (non-simple types only): the first vptr comes first */ + 0 /* index */); + } + } + // Past-the-end iterator: + explicit iterator(size_t end) : curr(end) {} + + public: + bool operator==(const iterator &other) const { return curr.index == other.curr.index; } + bool operator!=(const iterator &other) const { return curr.index != other.curr.index; } + iterator &operator++() { + if (!inst->simple_layout) { + curr.vh += 1 + (*types)[curr.index]->holder_size_in_ptrs; + } + ++curr.index; + curr.type = curr.index < types->size() ? 
(*types)[curr.index] : nullptr; + return *this; + } + value_and_holder &operator*() { return curr; } + value_and_holder *operator->() { return &curr; } + }; + + iterator begin() { return iterator(inst, &tinfo); } + iterator end() { return iterator(tinfo.size()); } + + iterator find(const type_info *find_type) { + auto it = begin(), endit = end(); + while (it != endit && it->type != find_type) { + ++it; + } + return it; + } + + size_t size() { return tinfo.size(); } + + // Band-aid workaround to fix a subtle but serious bug in a minimalistic fashion. See PR #4762. + bool is_redundant_value_and_holder(const value_and_holder &vh) { + for (size_t i = 0; i < vh.index; i++) { + if (PyType_IsSubtype(tinfo[i]->type, tinfo[vh.index]->type) != 0) { + return true; + } + } + return false; + } +}; + +/** + * Extracts C++ value and holder pointer references from an instance (which may contain multiple + * values/holders for python-side multiple inheritance) that match the given type. Throws an error + * if the given type (or ValueType, if omitted) is not a pybind11 base of the given instance. If + * `find_type` is omitted (or explicitly specified as nullptr) the first value/holder are returned, + * regardless of type (and the resulting .type will be nullptr). + * + * The returned object should be short-lived: in particular, it must not outlive the called-upon + * instance. 
+ */ +PYBIND11_NOINLINE value_and_holder +instance::get_value_and_holder(const type_info *find_type /*= nullptr default in common.h*/, + bool throw_if_missing /*= true in common.h*/) { + // Optimize common case: + if (!find_type || Py_TYPE(this) == find_type->type) { + return value_and_holder(this, find_type, 0, 0); + } + + detail::values_and_holders vhs(this); + auto it = vhs.find(find_type); + if (it != vhs.end()) { + return *it; + } + + if (!throw_if_missing) { + return value_and_holder(); + } + +#if defined(PYBIND11_DETAILED_ERROR_MESSAGES) + pybind11_fail("pybind11::detail::instance::get_value_and_holder: `" + + get_fully_qualified_tp_name(find_type->type) + + "' is not a pybind11 base of the given `" + + get_fully_qualified_tp_name(Py_TYPE(this)) + "' instance"); +#else + pybind11_fail( + "pybind11::detail::instance::get_value_and_holder: " + "type is not a pybind11 base of the given instance " + "(#define PYBIND11_DETAILED_ERROR_MESSAGES or compile in debug mode for type details)"); +#endif +} + +PYBIND11_NOINLINE void instance::allocate_layout() { + const auto &tinfo = all_type_info(Py_TYPE(this)); + + const size_t n_types = tinfo.size(); + + if (n_types == 0) { + pybind11_fail( + "instance allocation failed: new instance has no pybind11-registered base types"); + } + + simple_layout + = n_types == 1 && tinfo.front()->holder_size_in_ptrs <= instance_simple_holder_in_ptrs(); + + // Simple path: no python-side multiple inheritance, and a small-enough holder + if (simple_layout) { + simple_value_holder[0] = nullptr; + simple_holder_constructed = false; + simple_instance_registered = false; + } else { // multiple base types or a too-large holder + // Allocate space to hold: [v1*][h1][v2*][h2]...[bb...] where [vN*] is a value pointer, + // [hN] is the (uninitialized) holder instance for value N, and [bb...] is a set of bool + // values that tracks whether each associated holder has been initialized. 
Each [block] is + // padded, if necessary, to an integer multiple of sizeof(void *). + size_t space = 0; + for (auto *t : tinfo) { + space += 1; // value pointer + space += t->holder_size_in_ptrs; // holder instance + } + size_t flags_at = space; + space += size_in_ptrs(n_types); // status bytes (holder_constructed and + // instance_registered) + + // Allocate space for flags, values, and holders, and initialize it to 0 (flags and values, + // in particular, need to be 0). Use Python's memory allocation + // functions: Python is using pymalloc, which is designed to be + // efficient for small allocations like the one we're doing here; + // for larger allocations they are just wrappers around malloc. + // TODO: is this still true for pure Python 3.6? + nonsimple.values_and_holders = (void **) PyMem_Calloc(space, sizeof(void *)); + if (!nonsimple.values_and_holders) { + throw std::bad_alloc(); + } + nonsimple.status + = reinterpret_cast(&nonsimple.values_and_holders[flags_at]); + } + owned = true; +} + +// NOLINTNEXTLINE(readability-make-member-function-const) +PYBIND11_NOINLINE void instance::deallocate_layout() { + if (!simple_layout) { + PyMem_Free(reinterpret_cast(nonsimple.values_and_holders)); + } +} + +PYBIND11_NOINLINE bool isinstance_generic(handle obj, const std::type_info &tp) { + handle type = detail::get_type_handle(tp, false); + if (!type) { + return false; + } + return isinstance(obj, type); +} + +PYBIND11_NOINLINE handle get_object_handle(const void *ptr, const detail::type_info *type) { + return with_instance_map(ptr, [&](instance_map &instances) { + auto range = instances.equal_range(ptr); + for (auto it = range.first; it != range.second; ++it) { + for (const auto &vh : values_and_holders(it->second)) { + if (vh.type == type) { + return handle((PyObject *) it->second); + } + } + } + return handle(); + }); +} + +inline PyThreadState *get_thread_state_unchecked() { +#if defined(PYPY_VERSION) + return PyThreadState_GET(); +#elif PY_VERSION_HEX < 
0x030D0000 + return _PyThreadState_UncheckedGet(); +#else + return PyThreadState_GetUnchecked(); +#endif +} + +// Forward declarations +void keep_alive_impl(handle nurse, handle patient); +inline PyObject *make_new_instance(PyTypeObject *type); + +class type_caster_generic { +public: + PYBIND11_NOINLINE explicit type_caster_generic(const std::type_info &type_info) + : typeinfo(get_type_info(type_info)), cpptype(&type_info) {} + + explicit type_caster_generic(const type_info *typeinfo) + : typeinfo(typeinfo), cpptype(typeinfo ? typeinfo->cpptype : nullptr) {} + + bool load(handle src, bool convert) { return load_impl(src, convert); } + + PYBIND11_NOINLINE static handle cast(const void *_src, + return_value_policy policy, + handle parent, + const detail::type_info *tinfo, + void *(*copy_constructor)(const void *), + void *(*move_constructor)(const void *), + const void *existing_holder = nullptr) { + if (!tinfo) { // no type info: error will be set already + return handle(); + } + + void *src = const_cast(_src); + if (src == nullptr) { + return none().release(); + } + + if (handle registered_inst = find_registered_python_instance(src, tinfo)) { + return registered_inst; + } + + auto inst = reinterpret_steal(make_new_instance(tinfo->type)); + auto *wrapper = reinterpret_cast(inst.ptr()); + wrapper->owned = false; + void *&valueptr = values_and_holders(wrapper).begin()->value_ptr(); + + switch (policy) { + case return_value_policy::automatic: + case return_value_policy::take_ownership: + valueptr = src; + wrapper->owned = true; + break; + + case return_value_policy::automatic_reference: + case return_value_policy::reference: + valueptr = src; + wrapper->owned = false; + break; + + case return_value_policy::copy: + if (copy_constructor) { + valueptr = copy_constructor(src); + } else { +#if defined(PYBIND11_DETAILED_ERROR_MESSAGES) + std::string type_name(tinfo->cpptype->name()); + detail::clean_type_id(type_name); + throw cast_error("return_value_policy = copy, but type 
" + type_name + + " is non-copyable!"); +#else + throw cast_error("return_value_policy = copy, but type is " + "non-copyable! (#define PYBIND11_DETAILED_ERROR_MESSAGES or " + "compile in debug mode for details)"); +#endif + } + wrapper->owned = true; + break; + + case return_value_policy::move: + if (move_constructor) { + valueptr = move_constructor(src); + } else if (copy_constructor) { + valueptr = copy_constructor(src); + } else { +#if defined(PYBIND11_DETAILED_ERROR_MESSAGES) + std::string type_name(tinfo->cpptype->name()); + detail::clean_type_id(type_name); + throw cast_error("return_value_policy = move, but type " + type_name + + " is neither movable nor copyable!"); +#else + throw cast_error("return_value_policy = move, but type is neither " + "movable nor copyable! " + "(#define PYBIND11_DETAILED_ERROR_MESSAGES or compile in " + "debug mode for details)"); +#endif + } + wrapper->owned = true; + break; + + case return_value_policy::reference_internal: + valueptr = src; + wrapper->owned = false; + keep_alive_impl(inst, parent); + break; + + default: + throw cast_error("unhandled return_value_policy: should not happen!"); + } + + tinfo->init_instance(wrapper, existing_holder); + + return inst.release(); + } + + // Base methods for generic caster; there are overridden in copyable_holder_caster + void load_value(value_and_holder &&v_h) { + auto *&vptr = v_h.value_ptr(); + // Lazy allocation for unallocated values: + if (vptr == nullptr) { + const auto *type = v_h.type ? 
v_h.type : typeinfo; + if (type->operator_new) { + vptr = type->operator_new(type->type_size); + } else { +#if defined(__cpp_aligned_new) && (!defined(_MSC_VER) || _MSC_VER >= 1912) + if (type->type_align > __STDCPP_DEFAULT_NEW_ALIGNMENT__) { + vptr = ::operator new(type->type_size, std::align_val_t(type->type_align)); + } else { + vptr = ::operator new(type->type_size); + } +#else + vptr = ::operator new(type->type_size); +#endif + } + } + value = vptr; + } + bool try_implicit_casts(handle src, bool convert) { + for (const auto &cast : typeinfo->implicit_casts) { + type_caster_generic sub_caster(*cast.first); + if (sub_caster.load(src, convert)) { + value = cast.second(sub_caster.value); + return true; + } + } + return false; + } + bool try_direct_conversions(handle src) { + for (auto &converter : *typeinfo->direct_conversions) { + if (converter(src.ptr(), value)) { + return true; + } + } + return false; + } + bool try_cpp_conduit(handle src) { + value = try_raw_pointer_ephemeral_from_cpp_conduit(src, cpptype); + if (value != nullptr) { + return true; + } + return false; + } + void check_holder_compat() {} + + PYBIND11_NOINLINE static void *local_load(PyObject *src, const type_info *ti) { + auto caster = type_caster_generic(ti); + if (caster.load(src, false)) { + return caster.value; + } + return nullptr; + } + + /// Try to load with foreign typeinfo, if available. Used when there is no + /// native typeinfo, or when the native one wasn't able to produce a value. 
+ PYBIND11_NOINLINE bool try_load_foreign_module_local(handle src) { + constexpr auto *local_key = PYBIND11_MODULE_LOCAL_ID; + const auto pytype = type::handle_of(src); + if (!hasattr(pytype, local_key)) { + return false; + } + + type_info *foreign_typeinfo = reinterpret_borrow(getattr(pytype, local_key)); + // Only consider this foreign loader if actually foreign and is a loader of the correct cpp + // type + if (foreign_typeinfo->module_local_load == &local_load + || (cpptype && !same_type(*cpptype, *foreign_typeinfo->cpptype))) { + return false; + } + + if (auto *result = foreign_typeinfo->module_local_load(src.ptr(), foreign_typeinfo)) { + value = result; + return true; + } + return false; + } + + // Implementation of `load`; this takes the type of `this` so that it can dispatch the relevant + // bits of code between here and copyable_holder_caster where the two classes need different + // logic (without having to resort to virtual inheritance). + template + PYBIND11_NOINLINE bool load_impl(handle src, bool convert) { + if (!src) { + return false; + } + if (!typeinfo) { + return try_load_foreign_module_local(src); + } + + auto &this_ = static_cast(*this); + this_.check_holder_compat(); + + PyTypeObject *srctype = Py_TYPE(src.ptr()); + + // Case 1: If src is an exact type match for the target type then we can reinterpret_cast + // the instance's value pointer to the target type: + if (srctype == typeinfo->type) { + this_.load_value(reinterpret_cast(src.ptr())->get_value_and_holder()); + return true; + } + // Case 2: We have a derived class + if (PyType_IsSubtype(srctype, typeinfo->type)) { + const auto &bases = all_type_info(srctype); + bool no_cpp_mi = typeinfo->simple_type; + + // Case 2a: the python type is a Python-inherited derived class that inherits from just + // one simple (no MI) pybind11 class, or is an exact match, so the C++ instance is of + // the right type and we can use reinterpret_cast. 
+ // (This is essentially the same as case 2b, but because not using multiple inheritance + // is extremely common, we handle it specially to avoid the loop iterator and type + // pointer lookup overhead) + if (bases.size() == 1 && (no_cpp_mi || bases.front()->type == typeinfo->type)) { + this_.load_value(reinterpret_cast(src.ptr())->get_value_and_holder()); + return true; + } + // Case 2b: the python type inherits from multiple C++ bases. Check the bases to see + // if we can find an exact match (or, for a simple C++ type, an inherited match); if + // so, we can safely reinterpret_cast to the relevant pointer. + if (bases.size() > 1) { + for (auto *base : bases) { + if (no_cpp_mi ? PyType_IsSubtype(base->type, typeinfo->type) + : base->type == typeinfo->type) { + this_.load_value( + reinterpret_cast(src.ptr())->get_value_and_holder(base)); + return true; + } + } + } + + // Case 2c: C++ multiple inheritance is involved and we couldn't find an exact type + // match in the registered bases, above, so try implicit casting (needed for proper C++ + // casting when MI is involved). + if (this_.try_implicit_casts(src, convert)) { + return true; + } + } + + // Perform an implicit conversion + if (convert) { + for (const auto &converter : typeinfo->implicit_conversions) { + auto temp = reinterpret_steal(converter(src.ptr(), typeinfo->type)); + if (load_impl(temp, false)) { + loader_life_support::add_patient(temp); + return true; + } + } + if (this_.try_direct_conversions(src)) { + return true; + } + } + + // Failed to match local typeinfo. Try again with global. + if (typeinfo->module_local) { + if (auto *gtype = get_global_type_info(*typeinfo->cpptype)) { + typeinfo = gtype; + return load(src, false); + } + } + + // Global typeinfo has precedence over foreign module_local + if (try_load_foreign_module_local(src)) { + return true; + } + + // Custom converters didn't take None, now we convert None to nullptr. 
+ if (src.is_none()) { + // Defer accepting None to other overloads (if we aren't in convert mode): + if (!convert) { + return false; + } + value = nullptr; + return true; + } + + if (convert && cpptype && this_.try_cpp_conduit(src)) { + return true; + } + + return false; + } + + // Called to do type lookup and wrap the pointer and type in a pair when a dynamic_cast + // isn't needed or can't be used. If the type is unknown, sets the error and returns a pair + // with .second = nullptr. (p.first = nullptr is not an error: it becomes None). + PYBIND11_NOINLINE static std::pair + src_and_type(const void *src, + const std::type_info &cast_type, + const std::type_info *rtti_type = nullptr) { + if (auto *tpi = get_type_info(cast_type)) { + return {src, const_cast(tpi)}; + } + + // Not found, set error: + std::string tname = rtti_type ? rtti_type->name() : cast_type.name(); + detail::clean_type_id(tname); + std::string msg = "Unregistered type : " + tname; + set_error(PyExc_TypeError, msg.c_str()); + return {nullptr, nullptr}; + } + + const type_info *typeinfo = nullptr; + const std::type_info *cpptype = nullptr; + void *value = nullptr; +}; + +inline object cpp_conduit_method(handle self, + const bytes &pybind11_platform_abi_id, + const capsule &cpp_type_info_capsule, + const bytes &pointer_kind) { +#ifdef PYBIND11_HAS_STRING_VIEW + using cpp_str = std::string_view; +#else + using cpp_str = std::string; +#endif + if (cpp_str(pybind11_platform_abi_id) != PYBIND11_PLATFORM_ABI_ID) { + return none(); + } + if (std::strcmp(cpp_type_info_capsule.name(), typeid(std::type_info).name()) != 0) { + return none(); + } + if (cpp_str(pointer_kind) != "raw_pointer_ephemeral") { + throw std::runtime_error("Invalid pointer_kind: \"" + std::string(pointer_kind) + "\""); + } + const auto *cpp_type_info = cpp_type_info_capsule.get_pointer(); + type_caster_generic caster(*cpp_type_info); + if (!caster.load(self, false)) { + return none(); + } + return capsule(caster.value, 
cpp_type_info->name()); +} + +/** + * Determine suitable casting operator for pointer-or-lvalue-casting type casters. The type caster + * needs to provide `operator T*()` and `operator T&()` operators. + * + * If the type supports moving the value away via an `operator T&&() &&` method, it should use + * `movable_cast_op_type` instead. + */ +template +using cast_op_type = conditional_t>::value, + typename std::add_pointer>::type, + typename std::add_lvalue_reference>::type>; + +/** + * Determine suitable casting operator for a type caster with a movable value. Such a type caster + * needs to provide `operator T*()`, `operator T&()`, and `operator T&&() &&`. The latter will be + * called in appropriate contexts where the value can be moved rather than copied. + * + * These operator are automatically provided when using the PYBIND11_TYPE_CASTER macro. + */ +template +using movable_cast_op_type + = conditional_t::type>::value, + typename std::add_pointer>::type, + conditional_t::value, + typename std::add_rvalue_reference>::type, + typename std::add_lvalue_reference>::type>>; + +// Does the container have a mapped type and is it recursive? +// Implemented by specializations below. +template +struct container_mapped_type_traits { + static constexpr bool has_mapped_type = false; + static constexpr bool has_recursive_mapped_type = false; +}; + +template +struct container_mapped_type_traits< + Container, + typename std::enable_if< + std::is_same::value>::type> { + static constexpr bool has_mapped_type = true; + static constexpr bool has_recursive_mapped_type = true; +}; + +template +struct container_mapped_type_traits< + Container, + typename std::enable_if< + negation>::value>::type> { + static constexpr bool has_mapped_type = true; + static constexpr bool has_recursive_mapped_type = false; +}; + +// Does the container have a value type and is it recursive? +// Implemented by specializations below. 
+template +struct container_value_type_traits : std::false_type { + static constexpr bool has_value_type = false; + static constexpr bool has_recursive_value_type = false; +}; + +template +struct container_value_type_traits< + Container, + typename std::enable_if< + std::is_same::value>::type> { + static constexpr bool has_value_type = true; + static constexpr bool has_recursive_value_type = true; +}; + +template +struct container_value_type_traits< + Container, + typename std::enable_if< + negation>::value>::type> { + static constexpr bool has_value_type = true; + static constexpr bool has_recursive_value_type = false; +}; + +/* + * Tag to be used for representing the bottom of recursively defined types. + * Define this tag so we don't have to use void. + */ +struct recursive_bottom {}; + +/* + * Implementation detail of `recursive_container_traits` below. + * `T` is the `value_type` of the container, which might need to be modified to + * avoid recursive types and const types. + */ +template +struct impl_type_to_check_recursively { + /* + * If the container is recursive, then no further recursion should be done. + */ + using if_recursive = recursive_bottom; + /* + * Otherwise yield `T` unchanged. + */ + using if_not_recursive = T; +}; + +/* + * For pairs - only as value type of a map -, the first type should remove the `const`. + * Also, if the map is recursive, then the recursive checking should consider + * the first type only. + */ +template +struct impl_type_to_check_recursively, /* is_this_a_map = */ true> { + using if_recursive = typename std::remove_const::type; + using if_not_recursive = std::pair::type, B>; +}; + +/* + * Implementation of `recursive_container_traits` below. 
+ */ +template +struct impl_recursive_container_traits { + using type_to_check_recursively = recursive_bottom; +}; + +template +struct impl_recursive_container_traits< + Container, + typename std::enable_if::has_value_type>::type> { + static constexpr bool is_recursive + = container_mapped_type_traits::has_recursive_mapped_type + || container_value_type_traits::has_recursive_value_type; + /* + * This member dictates which type Pybind11 should check recursively in traits + * such as `is_move_constructible`, `is_copy_constructible`, `is_move_assignable`, ... + * Direct access to `value_type` should be avoided: + * 1. `value_type` might recursively contain the type again + * 2. `value_type` of STL map types is `std::pair`, the `const` + * should be removed. + * + */ + using type_to_check_recursively = typename std::conditional< + is_recursive, + typename impl_type_to_check_recursively< + typename Container::value_type, + container_mapped_type_traits::has_mapped_type>::if_recursive, + typename impl_type_to_check_recursively< + typename Container::value_type, + container_mapped_type_traits::has_mapped_type>::if_not_recursive>::type; +}; + +/* + * This trait defines the `type_to_check_recursively` which is needed to properly + * handle recursively defined traits such as `is_move_constructible` without going + * into an infinite recursion. + * Should be used instead of directly accessing the `value_type`. + * It cancels the recursion by returning the `recursive_bottom` tag. + * + * The default definition of `type_to_check_recursively` is as follows: + * + * 1. By default, it is `recursive_bottom`, so that the recursion is canceled. + * 2. If the type is non-recursive and defines a `value_type`, then the `value_type` is used. + * If the `value_type` is a pair and a `mapped_type` is defined, + * then the `const` is removed from the first type. + * 3. If the type is recursive and `value_type` is not a pair, then `recursive_bottom` is returned. + * 4. 
If the type is recursive and `value_type` is a pair and a `mapped_type` is defined, + * then `const` is removed from the first type and the first type is returned. + * + * This behavior can be extended by the user as seen in test_stl_binders.cpp. + * + * This struct is exactly the same as impl_recursive_container_traits. + * The duplication achieves that user-defined specializations don't compete + * with internal specializations, but take precedence. + */ +template +struct recursive_container_traits : impl_recursive_container_traits {}; + +template +struct is_move_constructible + : all_of, + is_move_constructible< + typename recursive_container_traits::type_to_check_recursively>> {}; + +template <> +struct is_move_constructible : std::true_type {}; + +// Likewise for std::pair +// (after C++17 it is mandatory that the move constructor not exist when the two types aren't +// themselves move constructible, but this can not be relied upon when T1 or T2 are themselves +// containers). +template +struct is_move_constructible> + : all_of, is_move_constructible> {}; + +// std::is_copy_constructible isn't quite enough: it lets std::vector (and similar) through when +// T is non-copyable, but code containing such a copy constructor fails to actually compile. +template +struct is_copy_constructible + : all_of, + is_copy_constructible< + typename recursive_container_traits::type_to_check_recursively>> {}; + +template <> +struct is_copy_constructible : std::true_type {}; + +// Likewise for std::pair +// (after C++17 it is mandatory that the copy constructor not exist when the two types aren't +// themselves copy constructible, but this can not be relied upon when T1 or T2 are themselves +// containers). +template +struct is_copy_constructible> + : all_of, is_copy_constructible> {}; + +// The same problems arise with std::is_copy_assignable, so we use the same workaround. 
+template +struct is_copy_assignable + : all_of< + std::is_copy_assignable, + is_copy_assignable::type_to_check_recursively>> { +}; + +template <> +struct is_copy_assignable : std::true_type {}; + +template +struct is_copy_assignable> + : all_of, is_copy_assignable> {}; + +PYBIND11_NAMESPACE_END(detail) + +// polymorphic_type_hook::get(src, tinfo) determines whether the object pointed +// to by `src` actually is an instance of some class derived from `itype`. +// If so, it sets `tinfo` to point to the std::type_info representing that derived +// type, and returns a pointer to the start of the most-derived object of that type +// (in which `src` is a subobject; this will be the same address as `src` in most +// single inheritance cases). If not, or if `src` is nullptr, it simply returns `src` +// and leaves `tinfo` at its default value of nullptr. +// +// The default polymorphic_type_hook just returns src. A specialization for polymorphic +// types determines the runtime type of the passed object and adjusts the this-pointer +// appropriately via dynamic_cast. This is what enables a C++ Animal* to appear +// to Python as a Dog (if Dog inherits from Animal, Animal is polymorphic, Dog is +// registered with pybind11, and this Animal is in fact a Dog). +// +// You may specialize polymorphic_type_hook yourself for types that want to appear +// polymorphic to Python but do not use C++ RTTI. (This is a not uncommon pattern +// in performance-sensitive applications, used most notably in LLVM.) +// +// polymorphic_type_hook_base allows users to specialize polymorphic_type_hook with +// std::enable_if. User provided specializations will always have higher priority than +// the default implementation and specialization provided in polymorphic_type_hook_base. 
+template +struct polymorphic_type_hook_base { + static const void *get(const itype *src, const std::type_info *&) { return src; } +}; +template +struct polymorphic_type_hook_base::value>> { + static const void *get(const itype *src, const std::type_info *&type) { + type = src ? &typeid(*src) : nullptr; + return dynamic_cast(src); + } +}; +template +struct polymorphic_type_hook : public polymorphic_type_hook_base {}; + +PYBIND11_NAMESPACE_BEGIN(detail) + +/// Generic type caster for objects stored on the heap +template +class type_caster_base : public type_caster_generic { + using itype = intrinsic_t; + +public: + static constexpr auto name = const_name(); + + type_caster_base() : type_caster_base(typeid(type)) {} + explicit type_caster_base(const std::type_info &info) : type_caster_generic(info) {} + + static handle cast(const itype &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic + || policy == return_value_policy::automatic_reference) { + policy = return_value_policy::copy; + } + return cast(std::addressof(src), policy, parent); + } + + static handle cast(itype &&src, return_value_policy, handle parent) { + return cast(std::addressof(src), return_value_policy::move, parent); + } + + // Returns a (pointer, type_info) pair taking care of necessary type lookup for a + // polymorphic type (using RTTI by default, but can be overridden by specializing + // polymorphic_type_hook). If the instance isn't derived, returns the base version. + static std::pair src_and_type(const itype *src) { + const auto &cast_type = typeid(itype); + const std::type_info *instance_type = nullptr; + const void *vsrc = polymorphic_type_hook::get(src, instance_type); + if (instance_type && !same_type(cast_type, *instance_type)) { + // This is a base pointer to a derived type. If the derived type is registered + // with pybind11, we want to make the full derived object available. 
+ // In the typical case where itype is polymorphic, we get the correct + // derived pointer (which may be != base pointer) by a dynamic_cast to + // most derived type. If itype is not polymorphic, we won't get here + // except via a user-provided specialization of polymorphic_type_hook, + // and the user has promised that no this-pointer adjustment is + // required in that case, so it's OK to use static_cast. + if (const auto *tpi = get_type_info(*instance_type)) { + return {vsrc, tpi}; + } + } + // Otherwise we have either a nullptr, an `itype` pointer, or an unknown derived pointer, + // so don't do a cast + return type_caster_generic::src_and_type(src, cast_type, instance_type); + } + + static handle cast(const itype *src, return_value_policy policy, handle parent) { + auto st = src_and_type(src); + return type_caster_generic::cast(st.first, + policy, + parent, + st.second, + make_copy_constructor(src), + make_move_constructor(src)); + } + + static handle cast_holder(const itype *src, const void *holder) { + auto st = src_and_type(src); + return type_caster_generic::cast(st.first, + return_value_policy::take_ownership, + {}, + st.second, + nullptr, + nullptr, + holder); + } + + template + using cast_op_type = detail::cast_op_type; + + // NOLINTNEXTLINE(google-explicit-constructor) + operator itype *() { return (type *) value; } + // NOLINTNEXTLINE(google-explicit-constructor) + operator itype &() { + if (!value) { + throw reference_cast_error(); + } + return *((itype *) value); + } + +protected: + using Constructor = void *(*) (const void *); + + /* Only enabled when the types are {copy,move}-constructible *and* when the type + does not have a private operator new implementation. 
A comma operator is used in the + decltype argument to apply SFINAE to the public copy/move constructors.*/ + template ::value>> + static auto make_copy_constructor(const T *) -> decltype(new T(std::declval()), + Constructor{}) { + return [](const void *arg) -> void * { return new T(*reinterpret_cast(arg)); }; + } + + template ::value>> + static auto make_move_constructor(const T *) -> decltype(new T(std::declval()), + Constructor{}) { + return [](const void *arg) -> void * { + return new T(std::move(*const_cast(reinterpret_cast(arg)))); + }; + } + + static Constructor make_copy_constructor(...) { return nullptr; } + static Constructor make_move_constructor(...) { return nullptr; } +}; + +inline std::string quote_cpp_type_name(const std::string &cpp_type_name) { + return cpp_type_name; // No-op for now. See PR #4888 +} + +PYBIND11_NOINLINE std::string type_info_description(const std::type_info &ti) { + if (auto *type_data = get_type_info(ti)) { + handle th((PyObject *) type_data->type); + return th.attr("__module__").cast() + '.' + + th.attr("__qualname__").cast(); + } + return quote_cpp_type_name(clean_type_id(ti.name())); +} + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/common.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/common.h new file mode 100644 index 0000000000000000000000000000000000000000..24f56d158442c2e2f4667806e91fd94b476cc96e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/common.h @@ -0,0 +1,9 @@ +// Copyright (c) 2023 The pybind Community. + +#pragma once + +// Common message for `static_assert()`s, which are useful to easily +// preempt much less obvious errors. 
+#define PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED \ + "Pointer types (in particular `PyObject *`) are not supported as scalar types for Eigen " \ + "types." diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/matrix.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/matrix.h new file mode 100644 index 0000000000000000000000000000000000000000..5cf1f0a2a0566a8155ef7bcc10806883286615ff --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/matrix.h @@ -0,0 +1,715 @@ +/* + pybind11/eigen/matrix.h: Transparent conversion for dense and sparse Eigen matrices + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include + +#include "common.h" + +/* HINT: To suppress warnings originating from the Eigen headers, use -isystem. + See also: + https://stackoverflow.com/questions/2579576/i-dir-vs-isystem-dir + https://stackoverflow.com/questions/1741816/isystem-for-ms-visual-studio-c-compiler +*/ +PYBIND11_WARNING_PUSH +PYBIND11_WARNING_DISABLE_MSVC(5054) // https://github.com/pybind/pybind11/pull/3741 +// C5054: operator '&': deprecated between enumerations of different types +#if defined(__MINGW32__) +PYBIND11_WARNING_DISABLE_GCC("-Wmaybe-uninitialized") +#endif + +#include +#include + +PYBIND11_WARNING_POP + +// Eigen prior to 3.2.7 doesn't have proper move constructors--but worse, some classes get implicit +// move constructors that break things. We could detect this an explicitly copy, but an extra copy +// of matrices seems highly undesirable. 
+static_assert(EIGEN_VERSION_AT_LEAST(3, 2, 7), + "Eigen matrix support in pybind11 requires Eigen >= 3.2.7"); + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +PYBIND11_WARNING_DISABLE_MSVC(4127) + +// Provide a convenience alias for easier pass-by-ref usage with fully dynamic strides: +using EigenDStride = Eigen::Stride; +template +using EigenDRef = Eigen::Ref; +template +using EigenDMap = Eigen::Map; + +PYBIND11_NAMESPACE_BEGIN(detail) + +#if EIGEN_VERSION_AT_LEAST(3, 3, 0) +using EigenIndex = Eigen::Index; +template +using EigenMapSparseMatrix = Eigen::Map>; +#else +using EigenIndex = EIGEN_DEFAULT_DENSE_INDEX_TYPE; +template +using EigenMapSparseMatrix = Eigen::MappedSparseMatrix; +#endif + +// Matches Eigen::Map, Eigen::Ref, blocks, etc: +template +using is_eigen_dense_map = all_of, + std::is_base_of, T>>; +template +using is_eigen_mutable_map = std::is_base_of, T>; +template +using is_eigen_dense_plain + = all_of>, is_template_base_of>; +template +using is_eigen_sparse = is_template_base_of; +// Test for objects inheriting from EigenBase that aren't captured by the above. This +// basically covers anything that can be assigned to a dense matrix but that don't have a typical +// matrix data layout that can be copied from their .data(). For example, DiagonalMatrix and +// SelfAdjointView fall into this category. +template +using is_eigen_other + = all_of, + negation, is_eigen_dense_plain, is_eigen_sparse>>>; + +// Captures numpy/eigen conformability status (returned by EigenProps::conformable()): +template +struct EigenConformable { + bool conformable = false; + EigenIndex rows = 0, cols = 0; + EigenDStride stride{0, 0}; // Only valid if negativestrides is false! + bool negativestrides = false; // If true, do not use stride! 
+ + // NOLINTNEXTLINE(google-explicit-constructor) + EigenConformable(bool fits = false) : conformable{fits} {} + // Matrix type: + EigenConformable(EigenIndex r, EigenIndex c, EigenIndex rstride, EigenIndex cstride) + : conformable{true}, rows{r}, cols{c}, + // TODO: when Eigen bug #747 is fixed, remove the tests for non-negativity. + // http://eigen.tuxfamily.org/bz/show_bug.cgi?id=747 + stride{EigenRowMajor ? (rstride > 0 ? rstride : 0) + : (cstride > 0 ? cstride : 0) /* outer stride */, + EigenRowMajor ? (cstride > 0 ? cstride : 0) + : (rstride > 0 ? rstride : 0) /* inner stride */}, + negativestrides{rstride < 0 || cstride < 0} {} + // Vector type: + EigenConformable(EigenIndex r, EigenIndex c, EigenIndex stride) + : EigenConformable(r, c, r == 1 ? c * stride : stride, c == 1 ? r : r * stride) {} + + template + bool stride_compatible() const { + // To have compatible strides, we need (on both dimensions) one of fully dynamic strides, + // matching strides, or a dimension size of 1 (in which case the stride value is + // irrelevant). Alternatively, if any dimension size is 0, the strides are not relevant + // (and numpy ≥ 1.23 sets the strides to 0 in that case, so we need to check explicitly). + if (negativestrides) { + return false; + } + if (rows == 0 || cols == 0) { + return true; + } + return (props::inner_stride == Eigen::Dynamic || props::inner_stride == stride.inner() + || (EigenRowMajor ? cols : rows) == 1) + && (props::outer_stride == Eigen::Dynamic || props::outer_stride == stride.outer() + || (EigenRowMajor ? 
rows : cols) == 1); + } + // NOLINTNEXTLINE(google-explicit-constructor) + operator bool() const { return conformable; } +}; + +template +struct eigen_extract_stride { + using type = Type; +}; +template +struct eigen_extract_stride> { + using type = StrideType; +}; +template +struct eigen_extract_stride> { + using type = StrideType; +}; + +// Helper struct for extracting information from an Eigen type +template +struct EigenProps { + using Type = Type_; + using Scalar = typename Type::Scalar; + using StrideType = typename eigen_extract_stride::type; + static constexpr EigenIndex rows = Type::RowsAtCompileTime, cols = Type::ColsAtCompileTime, + size = Type::SizeAtCompileTime; + static constexpr bool row_major = Type::IsRowMajor, + vector + = Type::IsVectorAtCompileTime, // At least one dimension has fixed size 1 + fixed_rows = rows != Eigen::Dynamic, fixed_cols = cols != Eigen::Dynamic, + fixed = size != Eigen::Dynamic, // Fully-fixed size + dynamic = !fixed_rows && !fixed_cols; // Fully-dynamic size + + template + using if_zero = std::integral_constant; + static constexpr EigenIndex inner_stride + = if_zero::value, + outer_stride = if_zero < StrideType::OuterStrideAtCompileTime, + vector ? size + : row_major ? cols + : rows > ::value; + static constexpr bool dynamic_stride + = inner_stride == Eigen::Dynamic && outer_stride == Eigen::Dynamic; + static constexpr bool requires_row_major + = !dynamic_stride && !vector && (row_major ? inner_stride : outer_stride) == 1; + static constexpr bool requires_col_major + = !dynamic_stride && !vector && (row_major ? outer_stride : inner_stride) == 1; + + // Takes an input array and determines whether we can make it fit into the Eigen type. If + // the array is a vector, we attempt to fit it into either an Eigen 1xN or Nx1 vector + // (preferring the latter if it will fit in either, i.e. for a fully dynamic matrix type). 
+ static EigenConformable conformable(const array &a) { + const auto dims = a.ndim(); + if (dims < 1 || dims > 2) { + return false; + } + + if (dims == 2) { // Matrix type: require exact match (or dynamic) + + EigenIndex np_rows = a.shape(0), np_cols = a.shape(1), + np_rstride = a.strides(0) / static_cast(sizeof(Scalar)), + np_cstride = a.strides(1) / static_cast(sizeof(Scalar)); + if ((fixed_rows && np_rows != rows) || (fixed_cols && np_cols != cols)) { + return false; + } + + return {np_rows, np_cols, np_rstride, np_cstride}; + } + + // Otherwise we're storing an n-vector. Only one of the strides will be used, but + // whichever is used, we want the (single) numpy stride value. + const EigenIndex n = a.shape(0), + stride = a.strides(0) / static_cast(sizeof(Scalar)); + + if (vector) { // Eigen type is a compile-time vector + if (fixed && size != n) { + return false; // Vector size mismatch + } + return {rows == 1 ? 1 : n, cols == 1 ? 1 : n, stride}; + } + if (fixed) { + // The type has a fixed size, but is not a vector: abort + return false; + } + if (fixed_cols) { + // Since this isn't a vector, cols must be != 1. We allow this only if it exactly + // equals the number of elements (rows is Dynamic, and so 1 row is allowed). 
+ if (cols != n) { + return false; + } + return {1, n, stride}; + } // Otherwise it's either fully dynamic, or column dynamic; both become a column vector + if (fixed_rows && rows != n) { + return false; + } + return {n, 1, stride}; + } + + static constexpr bool show_writeable + = is_eigen_dense_map::value && is_eigen_mutable_map::value; + static constexpr bool show_order = is_eigen_dense_map::value; + static constexpr bool show_c_contiguous = show_order && requires_row_major; + static constexpr bool show_f_contiguous + = !show_c_contiguous && show_order && requires_col_major; + + static constexpr auto descriptor + = const_name("numpy.ndarray[") + npy_format_descriptor::name + const_name("[") + + const_name(const_name<(size_t) rows>(), const_name("m")) + const_name(", ") + + const_name(const_name<(size_t) cols>(), const_name("n")) + const_name("]") + + + // For a reference type (e.g. Ref) we have other constraints that might need to + // be satisfied: writeable=True (for a mutable reference), and, depending on the map's + // stride options, possibly f_contiguous or c_contiguous. We include them in the + // descriptor output to provide some hint as to why a TypeError is occurring (otherwise + // it can be confusing to see that a function accepts a 'numpy.ndarray[float64[3,2]]' and + // an error message that you *gave* a numpy.ndarray of the right type and dimensions. + const_name(", flags.writeable", "") + + const_name(", flags.c_contiguous", "") + + const_name(", flags.f_contiguous", "") + const_name("]"); +}; + +// Casts an Eigen type to numpy array. If given a base, the numpy array references the src data, +// otherwise it'll make a copy. writeable lets you turn off the writeable flag for the array. 
+template +handle +eigen_array_cast(typename props::Type const &src, handle base = handle(), bool writeable = true) { + constexpr ssize_t elem_size = sizeof(typename props::Scalar); + array a; + if (props::vector) { + a = array({src.size()}, {elem_size * src.innerStride()}, src.data(), base); + } else { + a = array({src.rows(), src.cols()}, + {elem_size * src.rowStride(), elem_size * src.colStride()}, + src.data(), + base); + } + + if (!writeable) { + array_proxy(a.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_; + } + + return a.release(); +} + +// Takes an lvalue ref to some Eigen type and a (python) base object, creating a numpy array that +// reference the Eigen object's data with `base` as the python-registered base class (if omitted, +// the base will be set to None, and lifetime management is up to the caller). The numpy array is +// non-writeable if the given type is const. +template +handle eigen_ref_array(Type &src, handle parent = none()) { + // none here is to get past array's should-we-copy detection, which currently always + // copies when there is no base. Setting the base to None should be harmless. + return eigen_array_cast(src, parent, !std::is_const::value); +} + +// Takes a pointer to some dense, plain Eigen type, builds a capsule around it, then returns a +// numpy array that references the encapsulated data with a python-side reference to the capsule to +// tie its destruction to that of any dependent python objects. Const-ness is determined by +// whether or not the Type of the pointer given is const. +template ::value>> +handle eigen_encapsulate(Type *src) { + capsule base(src, [](void *o) { delete static_cast(o); }); + return eigen_ref_array(*src, base); +} + +// Type caster for regular, dense matrix types (e.g. MatrixXd), but not maps/refs/etc. of dense +// types. 
+template +struct type_caster::value>> { + using Scalar = typename Type::Scalar; + static_assert(!std::is_pointer::value, + PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED); + using props = EigenProps; + + bool load(handle src, bool convert) { + // If we're in no-convert mode, only load if given an array of the correct type + if (!convert && !isinstance>(src)) { + return false; + } + + // Coerce into an array, but don't do type conversion yet; the copy below handles it. + auto buf = array::ensure(src); + + if (!buf) { + return false; + } + + auto dims = buf.ndim(); + if (dims < 1 || dims > 2) { + return false; + } + + auto fits = props::conformable(buf); + if (!fits) { + return false; + } + + // Allocate the new type, then build a numpy reference into it + value = Type(fits.rows, fits.cols); + auto ref = reinterpret_steal(eigen_ref_array(value)); + if (dims == 1) { + ref = ref.squeeze(); + } else if (ref.ndim() == 1) { + buf = buf.squeeze(); + } + + int result = detail::npy_api::get().PyArray_CopyInto_(ref.ptr(), buf.ptr()); + + if (result < 0) { // Copy failed! 
+ PyErr_Clear(); + return false; + } + + return true; + } + +private: + // Cast implementation + template + static handle cast_impl(CType *src, return_value_policy policy, handle parent) { + switch (policy) { + case return_value_policy::take_ownership: + case return_value_policy::automatic: + return eigen_encapsulate(src); + case return_value_policy::move: + return eigen_encapsulate(new CType(std::move(*src))); + case return_value_policy::copy: + return eigen_array_cast(*src); + case return_value_policy::reference: + case return_value_policy::automatic_reference: + return eigen_ref_array(*src); + case return_value_policy::reference_internal: + return eigen_ref_array(*src, parent); + default: + throw cast_error("unhandled return_value_policy: should not happen!"); + }; + } + +public: + // Normal returned non-reference, non-const value: + static handle cast(Type &&src, return_value_policy /* policy */, handle parent) { + return cast_impl(&src, return_value_policy::move, parent); + } + // If you return a non-reference const, we mark the numpy array readonly: + static handle cast(const Type &&src, return_value_policy /* policy */, handle parent) { + return cast_impl(&src, return_value_policy::move, parent); + } + // lvalue reference return; default (automatic) becomes copy + static handle cast(Type &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic + || policy == return_value_policy::automatic_reference) { + policy = return_value_policy::copy; + } + return cast_impl(&src, policy, parent); + } + // const lvalue reference return; default (automatic) becomes copy + static handle cast(const Type &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic + || policy == return_value_policy::automatic_reference) { + policy = return_value_policy::copy; + } + return cast(&src, policy, parent); + } + // non-const pointer return + static handle cast(Type *src, return_value_policy policy, handle 
parent) { + return cast_impl(src, policy, parent); + } + // const pointer return + static handle cast(const Type *src, return_value_policy policy, handle parent) { + return cast_impl(src, policy, parent); + } + + static constexpr auto name = props::descriptor; + + // NOLINTNEXTLINE(google-explicit-constructor) + operator Type *() { return &value; } + // NOLINTNEXTLINE(google-explicit-constructor) + operator Type &() { return value; } + // NOLINTNEXTLINE(google-explicit-constructor) + operator Type &&() && { return std::move(value); } + template + using cast_op_type = movable_cast_op_type; + +private: + Type value; +}; + +// Base class for casting reference/map/block/etc. objects back to python. +template +struct eigen_map_caster { + static_assert(!std::is_pointer::value, + PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED); + +private: + using props = EigenProps; + +public: + // Directly referencing a ref/map's data is a bit dangerous (whatever the map/ref points to has + // to stay around), but we'll allow it under the assumption that you know what you're doing + // (and have an appropriate keep_alive in place). We return a numpy array pointing directly at + // the ref's data (The numpy array ends up read-only if the ref was to a const matrix type.) + // Note that this means you need to ensure you don't destroy the object in some other way (e.g. + // with an appropriate keep_alive, or with a reference to a statically allocated matrix). 
+ static handle cast(const MapType &src, return_value_policy policy, handle parent) { + switch (policy) { + case return_value_policy::copy: + return eigen_array_cast(src); + case return_value_policy::reference_internal: + return eigen_array_cast(src, parent, is_eigen_mutable_map::value); + case return_value_policy::reference: + case return_value_policy::automatic: + case return_value_policy::automatic_reference: + return eigen_array_cast(src, none(), is_eigen_mutable_map::value); + default: + // move, take_ownership don't make any sense for a ref/map: + pybind11_fail("Invalid return_value_policy for Eigen Map/Ref/Block type"); + } + } + + static constexpr auto name = props::descriptor; + + // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return + // types but not bound arguments). We still provide them (with an explicitly delete) so that + // you end up here if you try anyway. + bool load(handle, bool) = delete; + operator MapType() = delete; + template + using cast_op_type = MapType; +}; + +// We can return any map-like object (but can only load Refs, specialized next): +template +struct type_caster::value>> : eigen_map_caster {}; + +// Loader for Ref<...> arguments. See the documentation for info on how to make this work without +// copying (it requires some extra effort in many cases). +template +struct type_caster< + Eigen::Ref, + enable_if_t>::value>> + : public eigen_map_caster> { +private: + using Type = Eigen::Ref; + using props = EigenProps; + using Scalar = typename props::Scalar; + static_assert(!std::is_pointer::value, + PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED); + using MapType = Eigen::Map; + using Array + = array_t; + static constexpr bool need_writeable = is_eigen_mutable_map::value; + // Delay construction (these have no default constructor) + std::unique_ptr map; + std::unique_ptr ref; + // Our array. 
When possible, this is just a numpy array pointing to the source data, but + // sometimes we can't avoid copying (e.g. input is not a numpy array at all, has an + // incompatible layout, or is an array of a type that needs to be converted). Using a numpy + // temporary (rather than an Eigen temporary) saves an extra copy when we need both type + // conversion and storage order conversion. (Note that we refuse to use this temporary copy + // when loading an argument for a Ref with M non-const, i.e. a read-write reference). + Array copy_or_ref; + +public: + bool load(handle src, bool convert) { + // First check whether what we have is already an array of the right type. If not, we + // can't avoid a copy (because the copy is also going to do type conversion). + bool need_copy = !isinstance(src); + + EigenConformable fits; + if (!need_copy) { + // We don't need a converting copy, but we also need to check whether the strides are + // compatible with the Ref's stride requirements + auto aref = reinterpret_borrow(src); + + if (aref && (!need_writeable || aref.writeable())) { + fits = props::conformable(aref); + if (!fits) { + return false; // Incompatible dimensions + } + if (!fits.template stride_compatible()) { + need_copy = true; + } else { + copy_or_ref = std::move(aref); + } + } else { + need_copy = true; + } + } + + if (need_copy) { + // We need to copy: If we need a mutable reference, or we're not supposed to convert + // (either because we're in the no-convert overload pass, or because we're explicitly + // instructed not to copy (via `py::arg().noconvert()`) we have to fail loading. 
+ if (!convert || need_writeable) { + return false; + } + + Array copy = Array::ensure(src); + if (!copy) { + return false; + } + fits = props::conformable(copy); + if (!fits || !fits.template stride_compatible()) { + return false; + } + copy_or_ref = std::move(copy); + loader_life_support::add_patient(copy_or_ref); + } + + ref.reset(); + map.reset(new MapType(data(copy_or_ref), + fits.rows, + fits.cols, + make_stride(fits.stride.outer(), fits.stride.inner()))); + ref.reset(new Type(*map)); + + return true; + } + + // NOLINTNEXTLINE(google-explicit-constructor) + operator Type *() { return ref.get(); } + // NOLINTNEXTLINE(google-explicit-constructor) + operator Type &() { return *ref; } + template + using cast_op_type = pybind11::detail::cast_op_type<_T>; + +private: + template ::value, int> = 0> + Scalar *data(Array &a) { + return a.mutable_data(); + } + + template ::value, int> = 0> + const Scalar *data(Array &a) { + return a.data(); + } + + // Attempt to figure out a constructor of `Stride` that will work. + // If both strides are fixed, use a default constructor: + template + using stride_ctor_default = bool_constant::value>; + // Otherwise, if there is a two-index constructor, assume it is (outer,inner) like + // Eigen::Stride, and use it: + template + using stride_ctor_dual + = bool_constant::value + && std::is_constructible::value>; + // Otherwise, if there is a one-index constructor, and just one of the strides is dynamic, use + // it (passing whichever stride is dynamic). 
+ template + using stride_ctor_outer + = bool_constant, stride_ctor_dual>::value + && S::OuterStrideAtCompileTime == Eigen::Dynamic + && S::InnerStrideAtCompileTime != Eigen::Dynamic + && std::is_constructible::value>; + template + using stride_ctor_inner + = bool_constant, stride_ctor_dual>::value + && S::InnerStrideAtCompileTime == Eigen::Dynamic + && S::OuterStrideAtCompileTime != Eigen::Dynamic + && std::is_constructible::value>; + + template ::value, int> = 0> + static S make_stride(EigenIndex, EigenIndex) { + return S(); + } + template ::value, int> = 0> + static S make_stride(EigenIndex outer, EigenIndex inner) { + return S(outer, inner); + } + template ::value, int> = 0> + static S make_stride(EigenIndex outer, EigenIndex) { + return S(outer); + } + template ::value, int> = 0> + static S make_stride(EigenIndex, EigenIndex inner) { + return S(inner); + } +}; + +// type_caster for special matrix types (e.g. DiagonalMatrix), which are EigenBase, but not +// EigenDense (i.e. they don't have a data(), at least not with the usual matrix layout). +// load() is not supported, but we can cast them into the python domain by first copying to a +// regular Eigen::Matrix, then casting that. +template +struct type_caster::value>> { + static_assert(!std::is_pointer::value, + PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED); + +protected: + using Matrix + = Eigen::Matrix; + using props = EigenProps; + +public: + static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) { + handle h = eigen_encapsulate(new Matrix(src)); + return h; + } + static handle cast(const Type *src, return_value_policy policy, handle parent) { + return cast(*src, policy, parent); + } + + static constexpr auto name = props::descriptor; + + // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return + // types but not bound arguments). 
We still provide them (with an explicitly delete) so that + // you end up here if you try anyway. + bool load(handle, bool) = delete; + operator Type() = delete; + template + using cast_op_type = Type; +}; + +template +struct type_caster::value>> { + using Scalar = typename Type::Scalar; + static_assert(!std::is_pointer::value, + PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED); + using StorageIndex = remove_reference_t().outerIndexPtr())>; + using Index = typename Type::Index; + static constexpr bool rowMajor = Type::IsRowMajor; + + bool load(handle src, bool) { + if (!src) { + return false; + } + + auto obj = reinterpret_borrow(src); + object sparse_module = module_::import("scipy.sparse"); + object matrix_type = sparse_module.attr(rowMajor ? "csr_matrix" : "csc_matrix"); + + if (!type::handle_of(obj).is(matrix_type)) { + try { + obj = matrix_type(obj); + } catch (const error_already_set &) { + return false; + } + } + + auto values = array_t((object) obj.attr("data")); + auto innerIndices = array_t((object) obj.attr("indices")); + auto outerIndices = array_t((object) obj.attr("indptr")); + auto shape = pybind11::tuple((pybind11::object) obj.attr("shape")); + auto nnz = obj.attr("nnz").cast(); + + if (!values || !innerIndices || !outerIndices) { + return false; + } + + value = EigenMapSparseMatrix(shape[0].cast(), + shape[1].cast(), + std::move(nnz), + outerIndices.mutable_data(), + innerIndices.mutable_data(), + values.mutable_data()); + + return true; + } + + static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) { + const_cast(src).makeCompressed(); + + object matrix_type + = module_::import("scipy.sparse").attr(rowMajor ? "csr_matrix" : "csc_matrix"); + + array data(src.nonZeros(), src.valuePtr()); + array outerIndices((rowMajor ? 
src.rows() : src.cols()) + 1, src.outerIndexPtr()); + array innerIndices(src.nonZeros(), src.innerIndexPtr()); + + return matrix_type(pybind11::make_tuple( + std::move(data), std::move(innerIndices), std::move(outerIndices)), + pybind11::make_tuple(src.rows(), src.cols())) + .release(); + } + + PYBIND11_TYPE_CASTER(Type, + const_name<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[", + "scipy.sparse.csc_matrix[") + + npy_format_descriptor::name + const_name("]")); +}; + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/tensor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..0a9d7c2522a2cf90c8f9bfdcf9b993e08ee20cf0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/tensor.h @@ -0,0 +1,515 @@ +/* + pybind11/eigen/tensor.h: Transparent conversion for Eigen tensors + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/ + +#pragma once + +#include + +#include "common.h" + +#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) +static_assert(__GNUC__ > 5, "Eigen Tensor support in pybind11 requires GCC > 5.0"); +#endif + +// Disable warnings for Eigen +PYBIND11_WARNING_PUSH +PYBIND11_WARNING_DISABLE_MSVC(4554) +PYBIND11_WARNING_DISABLE_MSVC(4127) +#if defined(__MINGW32__) +PYBIND11_WARNING_DISABLE_GCC("-Wmaybe-uninitialized") +#endif + +#include + +PYBIND11_WARNING_POP + +static_assert(EIGEN_VERSION_AT_LEAST(3, 3, 0), + "Eigen Tensor support in pybind11 requires Eigen >= 3.3.0"); + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +PYBIND11_WARNING_DISABLE_MSVC(4127) + +PYBIND11_NAMESPACE_BEGIN(detail) + +inline bool is_tensor_aligned(const void *data) { + return (reinterpret_cast(data) % EIGEN_DEFAULT_ALIGN_BYTES) == 0; +} + +template +constexpr int compute_array_flag_from_tensor() { + static_assert((static_cast(T::Layout) == static_cast(Eigen::RowMajor)) + || (static_cast(T::Layout) == static_cast(Eigen::ColMajor)), + "Layout must be row or column major"); + return (static_cast(T::Layout) == static_cast(Eigen::RowMajor)) ? 
array::c_style + : array::f_style; +} + +template +struct eigen_tensor_helper {}; + +template +struct eigen_tensor_helper> { + using Type = Eigen::Tensor; + using ValidType = void; + + static Eigen::DSizes get_shape(const Type &f) { + return f.dimensions(); + } + + static constexpr bool + is_correct_shape(const Eigen::DSizes & /*shape*/) { + return true; + } + + template + struct helper {}; + + template + struct helper> { + static constexpr auto value = ::pybind11::detail::concat(const_name(((void) Is, "?"))...); + }; + + static constexpr auto dimensions_descriptor + = helper())>::value; + + template + static Type *alloc(Args &&...args) { + return new Type(std::forward(args)...); + } + + static void free(Type *tensor) { delete tensor; } +}; + +template +struct eigen_tensor_helper< + Eigen::TensorFixedSize, Options_, IndexType>> { + using Type = Eigen::TensorFixedSize, Options_, IndexType>; + using ValidType = void; + + static constexpr Eigen::DSizes + get_shape(const Type & /*f*/) { + return get_shape(); + } + + static constexpr Eigen::DSizes get_shape() { + return Eigen::DSizes(Indices...); + } + + static bool + is_correct_shape(const Eigen::DSizes &shape) { + return get_shape() == shape; + } + + static constexpr auto dimensions_descriptor + = ::pybind11::detail::concat(const_name()...); + + template + static Type *alloc(Args &&...args) { + Eigen::aligned_allocator allocator; + return ::new (allocator.allocate(1)) Type(std::forward(args)...); + } + + static void free(Type *tensor) { + Eigen::aligned_allocator allocator; + tensor->~Type(); + allocator.deallocate(tensor, 1); + } +}; + +template +struct get_tensor_descriptor { + static constexpr auto details + = const_name(", flags.writeable", "") + + const_name(Type::Layout) == static_cast(Eigen::RowMajor)>( + ", flags.c_contiguous", ", flags.f_contiguous"); + static constexpr auto value + = const_name("numpy.ndarray[") + npy_format_descriptor::name + + const_name("[") + eigen_tensor_helper>::dimensions_descriptor + 
+ const_name("]") + const_name(details, const_name("")) + const_name("]"); +}; + +// When EIGEN_AVOID_STL_ARRAY is defined, Eigen::DSizes does not have the begin() member +// function. Falling back to a simple loop works around this issue. +// +// We need to disable the type-limits warning for the inner loop when size = 0. + +PYBIND11_WARNING_PUSH +PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits") + +template +std::vector convert_dsizes_to_vector(const Eigen::DSizes &arr) { + std::vector result(size); + + for (size_t i = 0; i < size; i++) { + result[i] = arr[i]; + } + + return result; +} + +template +Eigen::DSizes get_shape_for_array(const array &arr) { + Eigen::DSizes result; + const T *shape = arr.shape(); + for (size_t i = 0; i < size; i++) { + result[i] = shape[i]; + } + + return result; +} + +PYBIND11_WARNING_POP + +template +struct type_caster::ValidType> { + static_assert(!std::is_pointer::value, + PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED); + using Helper = eigen_tensor_helper; + static constexpr auto temp_name = get_tensor_descriptor::value; + PYBIND11_TYPE_CASTER(Type, temp_name); + + bool load(handle src, bool convert) { + if (!convert) { + if (!isinstance(src)) { + return false; + } + array temp = array::ensure(src); + if (!temp) { + return false; + } + + if (!temp.dtype().is(dtype::of())) { + return false; + } + } + + array_t()> arr( + reinterpret_borrow(src)); + + if (arr.ndim() != Type::NumIndices) { + return false; + } + auto shape = get_shape_for_array(arr); + + if (!Helper::is_correct_shape(shape)) { + return false; + } + +#if EIGEN_VERSION_AT_LEAST(3, 4, 0) + auto data_pointer = arr.data(); +#else + // Handle Eigen bug + auto data_pointer = const_cast(arr.data()); +#endif + + if (is_tensor_aligned(arr.data())) { + value = Eigen::TensorMap(data_pointer, shape); + } else { + value = Eigen::TensorMap(data_pointer, shape); + } + + return true; + } + + static handle cast(Type &&src, return_value_policy policy, handle parent) { + if (policy == 
return_value_policy::reference + || policy == return_value_policy::reference_internal) { + pybind11_fail("Cannot use a reference return value policy for an rvalue"); + } + return cast_impl(&src, return_value_policy::move, parent); + } + + static handle cast(const Type &&src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::reference + || policy == return_value_policy::reference_internal) { + pybind11_fail("Cannot use a reference return value policy for an rvalue"); + } + return cast_impl(&src, return_value_policy::move, parent); + } + + static handle cast(Type &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic + || policy == return_value_policy::automatic_reference) { + policy = return_value_policy::copy; + } + return cast_impl(&src, policy, parent); + } + + static handle cast(const Type &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic + || policy == return_value_policy::automatic_reference) { + policy = return_value_policy::copy; + } + return cast(&src, policy, parent); + } + + static handle cast(Type *src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic) { + policy = return_value_policy::take_ownership; + } else if (policy == return_value_policy::automatic_reference) { + policy = return_value_policy::reference; + } + return cast_impl(src, policy, parent); + } + + static handle cast(const Type *src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic) { + policy = return_value_policy::take_ownership; + } else if (policy == return_value_policy::automatic_reference) { + policy = return_value_policy::reference; + } + return cast_impl(src, policy, parent); + } + + template + static handle cast_impl(C *src, return_value_policy policy, handle parent) { + object parent_object; + bool writeable = false; + switch (policy) { + case return_value_policy::move: + if 
(std::is_const::value) { + pybind11_fail("Cannot move from a constant reference"); + } + + src = Helper::alloc(std::move(*src)); + + parent_object + = capsule(src, [](void *ptr) { Helper::free(reinterpret_cast(ptr)); }); + writeable = true; + break; + + case return_value_policy::take_ownership: + if (std::is_const::value) { + // This cast is ugly, and might be UB in some cases, but we don't have an + // alternative here as we must free that memory + Helper::free(const_cast(src)); + pybind11_fail("Cannot take ownership of a const reference"); + } + + parent_object + = capsule(src, [](void *ptr) { Helper::free(reinterpret_cast(ptr)); }); + writeable = true; + break; + + case return_value_policy::copy: + writeable = true; + break; + + case return_value_policy::reference: + parent_object = none(); + writeable = !std::is_const::value; + break; + + case return_value_policy::reference_internal: + // Default should do the right thing + if (!parent) { + pybind11_fail("Cannot use reference internal when there is no parent"); + } + parent_object = reinterpret_borrow(parent); + writeable = !std::is_const::value; + break; + + default: + pybind11_fail("pybind11 bug in eigen.h, please file a bug report"); + } + + auto result = array_t()>( + convert_dsizes_to_vector(Helper::get_shape(*src)), src->data(), parent_object); + + if (!writeable) { + array_proxy(result.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_; + } + + return result.release(); + } +}; + +template = true> +StoragePointerType get_array_data_for_type(array &arr) { +#if EIGEN_VERSION_AT_LEAST(3, 4, 0) + return reinterpret_cast(arr.data()); +#else + // Handle Eigen bug + return reinterpret_cast(const_cast(arr.data())); +#endif +} + +template = true> +StoragePointerType get_array_data_for_type(array &arr) { + return reinterpret_cast(arr.mutable_data()); +} + +template +struct get_storage_pointer_type; + +template +struct get_storage_pointer_type> { + using SPT = typename MapType::StoragePointerType; +}; + 
+template +struct get_storage_pointer_type> { + using SPT = typename MapType::PointerArgType; +}; + +template +struct type_caster, + typename eigen_tensor_helper>::ValidType> { + static_assert(!std::is_pointer::value, + PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED); + using MapType = Eigen::TensorMap; + using Helper = eigen_tensor_helper>; + + bool load(handle src, bool /*convert*/) { + // Note that we have a lot more checks here as we want to make sure to avoid copies + if (!isinstance(src)) { + return false; + } + auto arr = reinterpret_borrow(src); + if ((arr.flags() & compute_array_flag_from_tensor()) == 0) { + return false; + } + + if (!arr.dtype().is(dtype::of())) { + return false; + } + + if (arr.ndim() != Type::NumIndices) { + return false; + } + + constexpr bool is_aligned = (Options & Eigen::Aligned) != 0; + + if (is_aligned && !is_tensor_aligned(arr.data())) { + return false; + } + + auto shape = get_shape_for_array(arr); + + if (!Helper::is_correct_shape(shape)) { + return false; + } + + if (needs_writeable && !arr.writeable()) { + return false; + } + + auto result = get_array_data_for_type::SPT, + needs_writeable>(arr); + + value.reset(new MapType(std::move(result), std::move(shape))); + + return true; + } + + static handle cast(MapType &&src, return_value_policy policy, handle parent) { + return cast_impl(&src, policy, parent); + } + + static handle cast(const MapType &&src, return_value_policy policy, handle parent) { + return cast_impl(&src, policy, parent); + } + + static handle cast(MapType &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic + || policy == return_value_policy::automatic_reference) { + policy = return_value_policy::copy; + } + return cast_impl(&src, policy, parent); + } + + static handle cast(const MapType &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic + || policy == return_value_policy::automatic_reference) { + policy = 
return_value_policy::copy; + } + return cast(&src, policy, parent); + } + + static handle cast(MapType *src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic) { + policy = return_value_policy::take_ownership; + } else if (policy == return_value_policy::automatic_reference) { + policy = return_value_policy::reference; + } + return cast_impl(src, policy, parent); + } + + static handle cast(const MapType *src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic) { + policy = return_value_policy::take_ownership; + } else if (policy == return_value_policy::automatic_reference) { + policy = return_value_policy::reference; + } + return cast_impl(src, policy, parent); + } + + template + static handle cast_impl(C *src, return_value_policy policy, handle parent) { + object parent_object; + constexpr bool writeable = !std::is_const::value; + switch (policy) { + case return_value_policy::reference: + parent_object = none(); + break; + + case return_value_policy::reference_internal: + // Default should do the right thing + if (!parent) { + pybind11_fail("Cannot use reference internal when there is no parent"); + } + parent_object = reinterpret_borrow(parent); + break; + + default: + // move, take_ownership don't make any sense for a ref/map: + pybind11_fail("Invalid return_value_policy for Eigen Map type, must be either " + "reference or reference_internal"); + } + + auto result = array_t()>( + convert_dsizes_to_vector(Helper::get_shape(*src)), + src->data(), + std::move(parent_object)); + + if (!writeable) { + array_proxy(result.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_; + } + + return result.release(); + } + +#if EIGEN_VERSION_AT_LEAST(3, 4, 0) + + static constexpr bool needs_writeable = !std::is_const::SPT>::type>::value; +#else + // Handle Eigen bug + static constexpr bool needs_writeable = !std::is_const::value; +#endif + +protected: + // TODO: Move to std::optional once 
std::optional has more support + std::unique_ptr value; + +public: + static constexpr auto name = get_tensor_descriptor::value; + explicit operator MapType *() { return value.get(); } + explicit operator MapType &() { return *value; } + explicit operator MapType &&() && { return std::move(*value); } + + template + using cast_op_type = ::pybind11::detail::movable_cast_op_type; +}; + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/numpy.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/numpy.h new file mode 100644 index 0000000000000000000000000000000000000000..09894cf74f4aef2b91f657f84ebfb6af3e7e8d0f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/numpy.h @@ -0,0 +1,2139 @@ +/* + pybind11/numpy.h: Basic NumPy support, vectorize() wrapper + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "pybind11.h" +#include "detail/common.h" +#include "complex.h" +#include "gil_safe_call_once.h" +#include "pytypes.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(PYBIND11_NUMPY_1_ONLY) && !defined(PYBIND11_INTERNAL_NUMPY_1_ONLY_DETECTED) +# error PYBIND11_NUMPY_1_ONLY must be defined before any pybind11 header is included. +#endif + +/* This will be true on all flat address space platforms and allows us to reduce the + whole npy_intp / ssize_t / Py_intptr_t business down to just ssize_t for all size + and dimension types (e.g. shape, strides, indexing), instead of inflicting this + upon the library user. + Note that NumPy 2 now uses ssize_t for `npy_intp` to simplify this. 
*/ +static_assert(sizeof(::pybind11::ssize_t) == sizeof(Py_intptr_t), "ssize_t != Py_intptr_t"); +static_assert(std::is_signed::value, "Py_intptr_t must be signed"); +// We now can reinterpret_cast between py::ssize_t and Py_intptr_t (MSVC + PyPy cares) + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +PYBIND11_WARNING_DISABLE_MSVC(4127) + +class dtype; // Forward declaration +class array; // Forward declaration + +PYBIND11_NAMESPACE_BEGIN(detail) + +template <> +struct handle_type_name { + static constexpr auto name = const_name("numpy.dtype"); +}; + +template <> +struct handle_type_name { + static constexpr auto name = const_name("numpy.ndarray"); +}; + +template +struct npy_format_descriptor; + +/* NumPy 1 proxy (always includes legacy fields) */ +struct PyArrayDescr1_Proxy { + PyObject_HEAD + PyObject *typeobj; + char kind; + char type; + char byteorder; + char flags; + int type_num; + int elsize; + int alignment; + char *subarray; + PyObject *fields; + PyObject *names; +}; + +#ifndef PYBIND11_NUMPY_1_ONLY +struct PyArrayDescr_Proxy { + PyObject_HEAD + PyObject *typeobj; + char kind; + char type; + char byteorder; + char _former_flags; + int type_num; + /* Additional fields are NumPy version specific. 
*/ +}; +#else +/* NumPy 1.x only, we can expose all fields */ +using PyArrayDescr_Proxy = PyArrayDescr1_Proxy; +#endif + +/* NumPy 2 proxy, including legacy fields */ +struct PyArrayDescr2_Proxy { + PyObject_HEAD + PyObject *typeobj; + char kind; + char type; + char byteorder; + char _former_flags; + int type_num; + std::uint64_t flags; + ssize_t elsize; + ssize_t alignment; + PyObject *metadata; + Py_hash_t hash; + void *reserved_null[2]; + /* The following fields only exist if 0 <= type_num < 2056 */ + char *subarray; + PyObject *fields; + PyObject *names; +}; + +struct PyArray_Proxy { + PyObject_HEAD + char *data; + int nd; + ssize_t *dimensions; + ssize_t *strides; + PyObject *base; + PyObject *descr; + int flags; +}; + +struct PyVoidScalarObject_Proxy { + PyObject_VAR_HEAD char *obval; + PyArrayDescr_Proxy *descr; + int flags; + PyObject *base; +}; + +struct numpy_type_info { + PyObject *dtype_ptr; + std::string format_str; +}; + +struct numpy_internals { + std::unordered_map registered_dtypes; + + numpy_type_info *get_type_info(const std::type_info &tinfo, bool throw_if_missing = true) { + auto it = registered_dtypes.find(std::type_index(tinfo)); + if (it != registered_dtypes.end()) { + return &(it->second); + } + if (throw_if_missing) { + pybind11_fail(std::string("NumPy type info missing for ") + tinfo.name()); + } + return nullptr; + } + + template + numpy_type_info *get_type_info(bool throw_if_missing = true) { + return get_type_info(typeid(typename std::remove_cv::type), throw_if_missing); + } +}; + +PYBIND11_NOINLINE void load_numpy_internals(numpy_internals *&ptr) { + ptr = &get_or_create_shared_data("_numpy_internals"); +} + +inline numpy_internals &get_numpy_internals() { + static numpy_internals *ptr = nullptr; + if (!ptr) { + load_numpy_internals(ptr); + } + return *ptr; +} + +PYBIND11_NOINLINE module_ import_numpy_core_submodule(const char *submodule_name) { + module_ numpy = module_::import("numpy"); + str version_string = 
numpy.attr("__version__"); + + module_ numpy_lib = module_::import("numpy.lib"); + object numpy_version = numpy_lib.attr("NumpyVersion")(version_string); + int major_version = numpy_version.attr("major").cast(); + +#ifdef PYBIND11_NUMPY_1_ONLY + if (major_version >= 2) { + throw std::runtime_error( + "This extension was built with PYBIND11_NUMPY_1_ONLY defined, " + "but NumPy 2 is used in this process. For NumPy2 compatibility, " + "this extension needs to be rebuilt without the PYBIND11_NUMPY_1_ONLY define."); + } +#endif + /* `numpy.core` was renamed to `numpy._core` in NumPy 2.0 as it officially + became a private module. */ + std::string numpy_core_path = major_version >= 2 ? "numpy._core" : "numpy.core"; + return module_::import((numpy_core_path + "." + submodule_name).c_str()); +} + +template +struct same_size { + template + using as = bool_constant; +}; + +template +constexpr int platform_lookup() { + return -1; +} + +// Lookup a type according to its size, and return a value corresponding to the NumPy typenum. +template +constexpr int platform_lookup(int I, Ints... Is) { + return sizeof(Concrete) == sizeof(T) ? I : platform_lookup(Is...); +} + +struct npy_api { + enum constants { + NPY_ARRAY_C_CONTIGUOUS_ = 0x0001, + NPY_ARRAY_F_CONTIGUOUS_ = 0x0002, + NPY_ARRAY_OWNDATA_ = 0x0004, + NPY_ARRAY_FORCECAST_ = 0x0010, + NPY_ARRAY_ENSUREARRAY_ = 0x0040, + NPY_ARRAY_ALIGNED_ = 0x0100, + NPY_ARRAY_WRITEABLE_ = 0x0400, + NPY_BOOL_ = 0, + NPY_BYTE_, + NPY_UBYTE_, + NPY_SHORT_, + NPY_USHORT_, + NPY_INT_, + NPY_UINT_, + NPY_LONG_, + NPY_ULONG_, + NPY_LONGLONG_, + NPY_ULONGLONG_, + NPY_FLOAT_, + NPY_DOUBLE_, + NPY_LONGDOUBLE_, + NPY_CFLOAT_, + NPY_CDOUBLE_, + NPY_CLONGDOUBLE_, + NPY_OBJECT_ = 17, + NPY_STRING_, + NPY_UNICODE_, + NPY_VOID_, + // Platform-dependent normalization + NPY_INT8_ = NPY_BYTE_, + NPY_UINT8_ = NPY_UBYTE_, + NPY_INT16_ = NPY_SHORT_, + NPY_UINT16_ = NPY_USHORT_, + // `npy_common.h` defines the integer aliases. 
In order, it checks: + // NPY_BITSOF_LONG, NPY_BITSOF_LONGLONG, NPY_BITSOF_INT, NPY_BITSOF_SHORT, NPY_BITSOF_CHAR + // and assigns the alias to the first matching size, so we should check in this order. + NPY_INT32_ + = platform_lookup(NPY_LONG_, NPY_INT_, NPY_SHORT_), + NPY_UINT32_ = platform_lookup( + NPY_ULONG_, NPY_UINT_, NPY_USHORT_), + NPY_INT64_ + = platform_lookup(NPY_LONG_, NPY_LONGLONG_, NPY_INT_), + NPY_UINT64_ + = platform_lookup( + NPY_ULONG_, NPY_ULONGLONG_, NPY_UINT_), + }; + + unsigned int PyArray_RUNTIME_VERSION_; + + struct PyArray_Dims { + Py_intptr_t *ptr; + int len; + }; + + static npy_api &get() { + PYBIND11_CONSTINIT static gil_safe_call_once_and_store storage; + return storage.call_once_and_store_result(lookup).get_stored(); + } + + bool PyArray_Check_(PyObject *obj) const { + return PyObject_TypeCheck(obj, PyArray_Type_) != 0; + } + bool PyArrayDescr_Check_(PyObject *obj) const { + return PyObject_TypeCheck(obj, PyArrayDescr_Type_) != 0; + } + + unsigned int (*PyArray_GetNDArrayCFeatureVersion_)(); + PyObject *(*PyArray_DescrFromType_)(int); + PyObject *(*PyArray_NewFromDescr_)(PyTypeObject *, + PyObject *, + int, + Py_intptr_t const *, + Py_intptr_t const *, + void *, + int, + PyObject *); + // Unused. Not removed because that affects ABI of the class. 
+ PyObject *(*PyArray_DescrNewFromType_)(int); + int (*PyArray_CopyInto_)(PyObject *, PyObject *); + PyObject *(*PyArray_NewCopy_)(PyObject *, int); + PyTypeObject *PyArray_Type_; + PyTypeObject *PyVoidArrType_Type_; + PyTypeObject *PyArrayDescr_Type_; + PyObject *(*PyArray_DescrFromScalar_)(PyObject *); + PyObject *(*PyArray_FromAny_)(PyObject *, PyObject *, int, int, int, PyObject *); + int (*PyArray_DescrConverter_)(PyObject *, PyObject **); + bool (*PyArray_EquivTypes_)(PyObject *, PyObject *); +#ifdef PYBIND11_NUMPY_1_ONLY + int (*PyArray_GetArrayParamsFromObject_)(PyObject *, + PyObject *, + unsigned char, + PyObject **, + int *, + Py_intptr_t *, + PyObject **, + PyObject *); +#endif + PyObject *(*PyArray_Squeeze_)(PyObject *); + // Unused. Not removed because that affects ABI of the class. + int (*PyArray_SetBaseObject_)(PyObject *, PyObject *); + PyObject *(*PyArray_Resize_)(PyObject *, PyArray_Dims *, int, int); + PyObject *(*PyArray_Newshape_)(PyObject *, PyArray_Dims *, int); + PyObject *(*PyArray_View_)(PyObject *, PyObject *, PyObject *); + +private: + enum functions { + API_PyArray_GetNDArrayCFeatureVersion = 211, + API_PyArray_Type = 2, + API_PyArrayDescr_Type = 3, + API_PyVoidArrType_Type = 39, + API_PyArray_DescrFromType = 45, + API_PyArray_DescrFromScalar = 57, + API_PyArray_FromAny = 69, + API_PyArray_Resize = 80, + // CopyInto was slot 82 and 50 was effectively an alias. NumPy 2 removed 82. 
+ API_PyArray_CopyInto = 50, + API_PyArray_NewCopy = 85, + API_PyArray_NewFromDescr = 94, + API_PyArray_DescrNewFromType = 96, + API_PyArray_Newshape = 135, + API_PyArray_Squeeze = 136, + API_PyArray_View = 137, + API_PyArray_DescrConverter = 174, + API_PyArray_EquivTypes = 182, +#ifdef PYBIND11_NUMPY_1_ONLY + API_PyArray_GetArrayParamsFromObject = 278, +#endif + API_PyArray_SetBaseObject = 282 + }; + + static npy_api lookup() { + module_ m = detail::import_numpy_core_submodule("multiarray"); + auto c = m.attr("_ARRAY_API"); + void **api_ptr = (void **) PyCapsule_GetPointer(c.ptr(), nullptr); + if (api_ptr == nullptr) { + raise_from(PyExc_SystemError, "FAILURE obtaining numpy _ARRAY_API pointer."); + throw error_already_set(); + } + npy_api api; +#define DECL_NPY_API(Func) api.Func##_ = (decltype(api.Func##_)) api_ptr[API_##Func]; + DECL_NPY_API(PyArray_GetNDArrayCFeatureVersion); + api.PyArray_RUNTIME_VERSION_ = api.PyArray_GetNDArrayCFeatureVersion_(); + if (api.PyArray_RUNTIME_VERSION_ < 0x7) { + pybind11_fail("pybind11 numpy support requires numpy >= 1.7.0"); + } + DECL_NPY_API(PyArray_Type); + DECL_NPY_API(PyVoidArrType_Type); + DECL_NPY_API(PyArrayDescr_Type); + DECL_NPY_API(PyArray_DescrFromType); + DECL_NPY_API(PyArray_DescrFromScalar); + DECL_NPY_API(PyArray_FromAny); + DECL_NPY_API(PyArray_Resize); + DECL_NPY_API(PyArray_CopyInto); + DECL_NPY_API(PyArray_NewCopy); + DECL_NPY_API(PyArray_NewFromDescr); + DECL_NPY_API(PyArray_DescrNewFromType); + DECL_NPY_API(PyArray_Newshape); + DECL_NPY_API(PyArray_Squeeze); + DECL_NPY_API(PyArray_View); + DECL_NPY_API(PyArray_DescrConverter); + DECL_NPY_API(PyArray_EquivTypes); +#ifdef PYBIND11_NUMPY_1_ONLY + DECL_NPY_API(PyArray_GetArrayParamsFromObject); +#endif + DECL_NPY_API(PyArray_SetBaseObject); + +#undef DECL_NPY_API + return api; + } +}; + +inline PyArray_Proxy *array_proxy(void *ptr) { return reinterpret_cast(ptr); } + +inline const PyArray_Proxy *array_proxy(const void *ptr) { + return reinterpret_cast(ptr); 
+} + +inline PyArrayDescr_Proxy *array_descriptor_proxy(PyObject *ptr) { + return reinterpret_cast(ptr); +} + +inline const PyArrayDescr_Proxy *array_descriptor_proxy(const PyObject *ptr) { + return reinterpret_cast(ptr); +} + +inline const PyArrayDescr1_Proxy *array_descriptor1_proxy(const PyObject *ptr) { + return reinterpret_cast(ptr); +} + +inline const PyArrayDescr2_Proxy *array_descriptor2_proxy(const PyObject *ptr) { + return reinterpret_cast(ptr); +} + +inline bool check_flags(const void *ptr, int flag) { + return (flag == (array_proxy(ptr)->flags & flag)); +} + +template +struct is_std_array : std::false_type {}; +template +struct is_std_array> : std::true_type {}; +template +struct is_complex : std::false_type {}; +template +struct is_complex> : std::true_type {}; + +template +struct array_info_scalar { + using type = T; + static constexpr bool is_array = false; + static constexpr bool is_empty = false; + static constexpr auto extents = const_name(""); + static void append_extents(list & /* shape */) {} +}; +// Computes underlying type and a comma-separated list of extents for array +// types (any mix of std::array and built-in arrays). An array of char is +// treated as scalar because it gets special handling. +template +struct array_info : array_info_scalar {}; +template +struct array_info> { + using type = typename array_info::type; + static constexpr bool is_array = true; + static constexpr bool is_empty = (N == 0) || array_info::is_empty; + static constexpr size_t extent = N; + + // appends the extents to shape + static void append_extents(list &shape) { + shape.append(N); + array_info::append_extents(shape); + } + + static constexpr auto extents = const_name::is_array>( + ::pybind11::detail::concat(const_name(), array_info::extents), const_name()); +}; +// For numpy we have special handling for arrays of characters, so we don't include +// the size in the array extents. 
+template +struct array_info : array_info_scalar {}; +template +struct array_info> : array_info_scalar> {}; +template +struct array_info : array_info> {}; +template +using remove_all_extents_t = typename array_info::type; + +template +using is_pod_struct + = all_of, // since we're accessing directly in memory + // we need a standard layout type +#if defined(__GLIBCXX__) \ + && (__GLIBCXX__ < 20150422 || __GLIBCXX__ == 20150426 || __GLIBCXX__ == 20150623 \ + || __GLIBCXX__ == 20150626 || __GLIBCXX__ == 20160803) + // libstdc++ < 5 (including versions 4.8.5, 4.9.3 and 4.9.4 which were released after + // 5) don't implement is_trivially_copyable, so approximate it + std::is_trivially_destructible, + satisfies_any_of, +#else + std::is_trivially_copyable, +#endif + satisfies_none_of>; + +// Replacement for std::is_pod (deprecated in C++20) +template +using is_pod = all_of, std::is_trivial>; + +template +ssize_t byte_offset_unsafe(const Strides &) { + return 0; +} +template +ssize_t byte_offset_unsafe(const Strides &strides, ssize_t i, Ix... index) { + return i * strides[Dim] + byte_offset_unsafe(strides, index...); +} + +/** + * Proxy class providing unsafe, unchecked const access to array data. This is constructed through + * the `unchecked()` method of `array` or the `unchecked()` method of `array_t`. `Dims` + * will be -1 for dimensions determined at runtime. + */ +template +class unchecked_reference { +protected: + static constexpr bool Dynamic = Dims < 0; + const unsigned char *data_; + // Storing the shape & strides in local variables (i.e. 
these arrays) allows the compiler to + // make large performance gains on big, nested loops, but requires compile-time dimensions + conditional_t> shape_, strides_; + const ssize_t dims_; + + friend class pybind11::array; + // Constructor for compile-time dimensions: + template + unchecked_reference(const void *data, + const ssize_t *shape, + const ssize_t *strides, + enable_if_t) + : data_{reinterpret_cast(data)}, dims_{Dims} { + for (size_t i = 0; i < (size_t) dims_; i++) { + shape_[i] = shape[i]; + strides_[i] = strides[i]; + } + } + // Constructor for runtime dimensions: + template + unchecked_reference(const void *data, + const ssize_t *shape, + const ssize_t *strides, + enable_if_t dims) + : data_{reinterpret_cast(data)}, shape_{shape}, strides_{strides}, + dims_{dims} {} + +public: + /** + * Unchecked const reference access to data at the given indices. For a compile-time known + * number of dimensions, this requires the correct number of arguments; for run-time + * dimensionality, this is not checked (and so is up to the caller to use safely). + */ + template + const T &operator()(Ix... index) const { + static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic, + "Invalid number of indices for unchecked array reference"); + return *reinterpret_cast(data_ + + byte_offset_unsafe(strides_, ssize_t(index)...)); + } + /** + * Unchecked const reference access to data; this operator only participates if the reference + * is to a 1-dimensional array. When present, this is exactly equivalent to `obj(index)`. + */ + template > + const T &operator[](ssize_t index) const { + return operator()(index); + } + + /// Pointer access to the data at the given indices. + template + const T *data(Ix... ix) const { + return &operator()(ssize_t(ix)...); + } + + /// Returns the item size, i.e. sizeof(T) + constexpr static ssize_t itemsize() { return sizeof(T); } + + /// Returns the shape (i.e. 
size) of dimension `dim` + ssize_t shape(ssize_t dim) const { return shape_[(size_t) dim]; } + + /// Returns the number of dimensions of the array + ssize_t ndim() const { return dims_; } + + /// Returns the total number of elements in the referenced array, i.e. the product of the + /// shapes + template + enable_if_t size() const { + return std::accumulate( + shape_.begin(), shape_.end(), (ssize_t) 1, std::multiplies()); + } + template + enable_if_t size() const { + return std::accumulate(shape_, shape_ + ndim(), (ssize_t) 1, std::multiplies()); + } + + /// Returns the total number of bytes used by the referenced data. Note that the actual span + /// in memory may be larger if the referenced array has non-contiguous strides (e.g. for a + /// slice). + ssize_t nbytes() const { return size() * itemsize(); } +}; + +template +class unchecked_mutable_reference : public unchecked_reference { + friend class pybind11::array; + using ConstBase = unchecked_reference; + using ConstBase::ConstBase; + using ConstBase::Dynamic; + +public: + // Bring in const-qualified versions from base class + using ConstBase::operator(); + using ConstBase::operator[]; + + /// Mutable, unchecked access to data at the given indices. + template + T &operator()(Ix... index) { + static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic, + "Invalid number of indices for unchecked array reference"); + return const_cast(ConstBase::operator()(index...)); + } + /** + * Mutable, unchecked access data at the given index; this operator only participates if the + * reference is to a 1-dimensional array (or has runtime dimensions). When present, this is + * exactly equivalent to `obj(index)`. + */ + template > + T &operator[](ssize_t index) { + return operator()(index); + } + + /// Mutable pointer access to the data at the given indices. + template + T *mutable_data(Ix... 
ix) { + return &operator()(ssize_t(ix)...); + } +}; + +template +struct type_caster> { + static_assert(Dim == 0 && Dim > 0 /* always fail */, + "unchecked array proxy object is not castable"); +}; +template +struct type_caster> + : type_caster> {}; + +PYBIND11_NAMESPACE_END(detail) + +class dtype : public object { +public: + PYBIND11_OBJECT_DEFAULT(dtype, object, detail::npy_api::get().PyArrayDescr_Check_) + + explicit dtype(const buffer_info &info) { + dtype descr(_dtype_from_pep3118()(pybind11::str(info.format))); + // If info.itemsize == 0, use the value calculated from the format string + m_ptr = descr.strip_padding(info.itemsize != 0 ? info.itemsize : descr.itemsize()) + .release() + .ptr(); + } + + explicit dtype(const pybind11::str &format) : dtype(from_args(format)) {} + + explicit dtype(const std::string &format) : dtype(pybind11::str(format)) {} + + explicit dtype(const char *format) : dtype(pybind11::str(format)) {} + + dtype(list names, list formats, list offsets, ssize_t itemsize) { + dict args; + args["names"] = std::move(names); + args["formats"] = std::move(formats); + args["offsets"] = std::move(offsets); + args["itemsize"] = pybind11::int_(itemsize); + m_ptr = from_args(args).release().ptr(); + } + + /// Return dtype for the given typenum (one of the NPY_TYPES). + /// https://numpy.org/devdocs/reference/c-api/array.html#c.PyArray_DescrFromType + explicit dtype(int typenum) + : object(detail::npy_api::get().PyArray_DescrFromType_(typenum), stolen_t{}) { + if (m_ptr == nullptr) { + throw error_already_set(); + } + } + + /// This is essentially the same as calling numpy.dtype(args) in Python. + static dtype from_args(const object &args) { + PyObject *ptr = nullptr; + if ((detail::npy_api::get().PyArray_DescrConverter_(args.ptr(), &ptr) == 0) || !ptr) { + throw error_already_set(); + } + return reinterpret_steal(ptr); + } + + /// Return dtype associated with a C++ type. 
+ template + static dtype of() { + return detail::npy_format_descriptor::type>::dtype(); + } + + /// Size of the data type in bytes. +#ifdef PYBIND11_NUMPY_1_ONLY + ssize_t itemsize() const { return detail::array_descriptor_proxy(m_ptr)->elsize; } +#else + ssize_t itemsize() const { + if (detail::npy_api::get().PyArray_RUNTIME_VERSION_ < 0x12) { + return detail::array_descriptor1_proxy(m_ptr)->elsize; + } + return detail::array_descriptor2_proxy(m_ptr)->elsize; + } +#endif + + /// Returns true for structured data types. +#ifdef PYBIND11_NUMPY_1_ONLY + bool has_fields() const { return detail::array_descriptor_proxy(m_ptr)->names != nullptr; } +#else + bool has_fields() const { + if (detail::npy_api::get().PyArray_RUNTIME_VERSION_ < 0x12) { + return detail::array_descriptor1_proxy(m_ptr)->names != nullptr; + } + const auto *proxy = detail::array_descriptor2_proxy(m_ptr); + if (proxy->type_num < 0 || proxy->type_num >= 2056) { + return false; + } + return proxy->names != nullptr; + } +#endif + + /// Single-character code for dtype's kind. + /// For example, floating point types are 'f' and integral types are 'i'. + char kind() const { return detail::array_descriptor_proxy(m_ptr)->kind; } + + /// Single-character for dtype's type. + /// For example, ``float`` is 'f', ``double`` 'd', ``int`` 'i', and ``long`` 'l'. + char char_() const { + // Note: The signature, `dtype::char_` follows the naming of NumPy's + // public Python API (i.e., ``dtype.char``), rather than its internal + // C API (``PyArray_Descr::type``). + return detail::array_descriptor_proxy(m_ptr)->type; + } + + /// type number of dtype. + int num() const { + // Note: The signature, `dtype::num` follows the naming of NumPy's public + // Python API (i.e., ``dtype.num``), rather than its internal + // C API (``PyArray_Descr::type_num``). 
+ return detail::array_descriptor_proxy(m_ptr)->type_num; + } + + /// Single character for byteorder + char byteorder() const { return detail::array_descriptor_proxy(m_ptr)->byteorder; } + +/// Alignment of the data type +#ifdef PYBIND11_NUMPY_1_ONLY + int alignment() const { return detail::array_descriptor_proxy(m_ptr)->alignment; } +#else + ssize_t alignment() const { + if (detail::npy_api::get().PyArray_RUNTIME_VERSION_ < 0x12) { + return detail::array_descriptor1_proxy(m_ptr)->alignment; + } + return detail::array_descriptor2_proxy(m_ptr)->alignment; + } +#endif + +/// Flags for the array descriptor +#ifdef PYBIND11_NUMPY_1_ONLY + char flags() const { return detail::array_descriptor_proxy(m_ptr)->flags; } +#else + std::uint64_t flags() const { + if (detail::npy_api::get().PyArray_RUNTIME_VERSION_ < 0x12) { + return (unsigned char) detail::array_descriptor1_proxy(m_ptr)->flags; + } + return detail::array_descriptor2_proxy(m_ptr)->flags; + } +#endif + +private: + static object &_dtype_from_pep3118() { + PYBIND11_CONSTINIT static gil_safe_call_once_and_store storage; + return storage + .call_once_and_store_result([]() { + return detail::import_numpy_core_submodule("_internal") + .attr("_dtype_from_pep3118"); + }) + .get_stored(); + } + + dtype strip_padding(ssize_t itemsize) { + // Recursively strip all void fields with empty names that are generated for + // padding fields (as of NumPy v1.11). 
+ if (!has_fields()) { + return *this; + } + + struct field_descr { + pybind11::str name; + object format; + pybind11::int_ offset; + field_descr(pybind11::str &&name, object &&format, pybind11::int_ &&offset) + : name{std::move(name)}, format{std::move(format)}, offset{std::move(offset)} {}; + }; + auto field_dict = attr("fields").cast(); + std::vector field_descriptors; + field_descriptors.reserve(field_dict.size()); + + for (auto field : field_dict.attr("items")()) { + auto spec = field.cast(); + auto name = spec[0].cast(); + auto spec_fo = spec[1].cast(); + auto format = spec_fo[0].cast(); + auto offset = spec_fo[1].cast(); + if ((len(name) == 0u) && format.kind() == 'V') { + continue; + } + field_descriptors.emplace_back( + std::move(name), format.strip_padding(format.itemsize()), std::move(offset)); + } + + std::sort(field_descriptors.begin(), + field_descriptors.end(), + [](const field_descr &a, const field_descr &b) { + return a.offset.cast() < b.offset.cast(); + }); + + list names, formats, offsets; + for (auto &descr : field_descriptors) { + names.append(std::move(descr.name)); + formats.append(std::move(descr.format)); + offsets.append(std::move(descr.offset)); + } + return dtype(std::move(names), std::move(formats), std::move(offsets), itemsize); + } +}; + +class array : public buffer { +public: + PYBIND11_OBJECT_CVT(array, buffer, detail::npy_api::get().PyArray_Check_, raw_array) + + enum { + c_style = detail::npy_api::NPY_ARRAY_C_CONTIGUOUS_, + f_style = detail::npy_api::NPY_ARRAY_F_CONTIGUOUS_, + forcecast = detail::npy_api::NPY_ARRAY_FORCECAST_ + }; + + array() : array(0, static_cast(nullptr)) {} + + using ShapeContainer = detail::any_container; + using StridesContainer = detail::any_container; + + // Constructs an array taking shape/strides from arbitrary container types + array(const pybind11::dtype &dt, + ShapeContainer shape, + StridesContainer strides, + const void *ptr = nullptr, + handle base = handle()) { + + if (strides->empty()) { + 
*strides = detail::c_strides(*shape, dt.itemsize()); + } + + auto ndim = shape->size(); + if (ndim != strides->size()) { + pybind11_fail("NumPy: shape ndim doesn't match strides ndim"); + } + auto descr = dt; + + int flags = 0; + if (base && ptr) { + if (isinstance(base)) { + /* Copy flags from base (except ownership bit) */ + flags = reinterpret_borrow(base).flags() + & ~detail::npy_api::NPY_ARRAY_OWNDATA_; + } else { + /* Writable by default, easy to downgrade later on if needed */ + flags = detail::npy_api::NPY_ARRAY_WRITEABLE_; + } + } + + auto &api = detail::npy_api::get(); + auto tmp = reinterpret_steal(api.PyArray_NewFromDescr_( + api.PyArray_Type_, + descr.release().ptr(), + (int) ndim, + // Use reinterpret_cast for PyPy on Windows (remove if fixed, checked on 7.3.1) + reinterpret_cast(shape->data()), + reinterpret_cast(strides->data()), + const_cast(ptr), + flags, + nullptr)); + if (!tmp) { + throw error_already_set(); + } + if (ptr) { + if (base) { + api.PyArray_SetBaseObject_(tmp.ptr(), base.inc_ref().ptr()); + } else { + tmp = reinterpret_steal( + api.PyArray_NewCopy_(tmp.ptr(), -1 /* any order */)); + } + } + m_ptr = tmp.release().ptr(); + } + + array(const pybind11::dtype &dt, + ShapeContainer shape, + const void *ptr = nullptr, + handle base = handle()) + : array(dt, std::move(shape), {}, ptr, base) {} + + template ::value && !std::is_same::value>> + array(const pybind11::dtype &dt, T count, const void *ptr = nullptr, handle base = handle()) + : array(dt, {{count}}, ptr, base) {} + + template + array(ShapeContainer shape, StridesContainer strides, const T *ptr, handle base = handle()) + : array(pybind11::dtype::of(), + std::move(shape), + std::move(strides), + reinterpret_cast(ptr), + base) {} + + template + array(ShapeContainer shape, const T *ptr, handle base = handle()) + : array(std::move(shape), {}, ptr, base) {} + + template + explicit array(ssize_t count, const T *ptr, handle base = handle()) + : array({count}, {}, ptr, base) {} + + explicit 
array(const buffer_info &info, handle base = handle()) + : array(pybind11::dtype(info), info.shape, info.strides, info.ptr, base) {} + + /// Array descriptor (dtype) + pybind11::dtype dtype() const { + return reinterpret_borrow(detail::array_proxy(m_ptr)->descr); + } + + /// Total number of elements + ssize_t size() const { + return std::accumulate(shape(), shape() + ndim(), (ssize_t) 1, std::multiplies()); + } + + /// Byte size of a single element + ssize_t itemsize() const { return dtype().itemsize(); } + + /// Total number of bytes + ssize_t nbytes() const { return size() * itemsize(); } + + /// Number of dimensions + ssize_t ndim() const { return detail::array_proxy(m_ptr)->nd; } + + /// Base object + object base() const { return reinterpret_borrow(detail::array_proxy(m_ptr)->base); } + + /// Dimensions of the array + const ssize_t *shape() const { return detail::array_proxy(m_ptr)->dimensions; } + + /// Dimension along a given axis + ssize_t shape(ssize_t dim) const { + if (dim >= ndim()) { + fail_dim_check(dim, "invalid axis"); + } + return shape()[dim]; + } + + /// Strides of the array + const ssize_t *strides() const { return detail::array_proxy(m_ptr)->strides; } + + /// Stride along a given axis + ssize_t strides(ssize_t dim) const { + if (dim >= ndim()) { + fail_dim_check(dim, "invalid axis"); + } + return strides()[dim]; + } + + /// Return the NumPy array flags + int flags() const { return detail::array_proxy(m_ptr)->flags; } + + /// If set, the array is writeable (otherwise the buffer is read-only) + bool writeable() const { + return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_WRITEABLE_); + } + + /// If set, the array owns the data (will be freed when the array is deleted) + bool owndata() const { + return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_OWNDATA_); + } + + /// Pointer to the contained data. If index is not provided, points to the + /// beginning of the buffer. May throw if the index would lead to out of bounds access. 
+ template + const void *data(Ix... index) const { + return static_cast(detail::array_proxy(m_ptr)->data + offset_at(index...)); + } + + /// Mutable pointer to the contained data. If index is not provided, points to the + /// beginning of the buffer. May throw if the index would lead to out of bounds access. + /// May throw if the array is not writeable. + template + void *mutable_data(Ix... index) { + check_writeable(); + return static_cast(detail::array_proxy(m_ptr)->data + offset_at(index...)); + } + + /// Byte offset from beginning of the array to a given index (full or partial). + /// May throw if the index would lead to out of bounds access. + template + ssize_t offset_at(Ix... index) const { + if ((ssize_t) sizeof...(index) > ndim()) { + fail_dim_check(sizeof...(index), "too many indices for an array"); + } + return byte_offset(ssize_t(index)...); + } + + ssize_t offset_at() const { return 0; } + + /// Item count from beginning of the array to a given index (full or partial). + /// May throw if the index would lead to out of bounds access. + template + ssize_t index_at(Ix... index) const { + return offset_at(index...) / itemsize(); + } + + /** + * Returns a proxy object that provides access to the array's data without bounds or + * dimensionality checking. Will throw if the array is missing the `writeable` flag. Use with + * care: the array must not be destroyed or reshaped for the duration of the returned object, + * and the caller must take care not to access invalid dimensions or dimension indices. 
+ */ + template + detail::unchecked_mutable_reference mutable_unchecked() & { + if (Dims >= 0 && ndim() != Dims) { + throw std::domain_error("array has incorrect number of dimensions: " + + std::to_string(ndim()) + "; expected " + + std::to_string(Dims)); + } + return detail::unchecked_mutable_reference( + mutable_data(), shape(), strides(), ndim()); + } + + /** + * Returns a proxy object that provides const access to the array's data without bounds or + * dimensionality checking. Unlike `mutable_unchecked()`, this does not require that the + * underlying array have the `writable` flag. Use with care: the array must not be destroyed + * or reshaped for the duration of the returned object, and the caller must take care not to + * access invalid dimensions or dimension indices. + */ + template + detail::unchecked_reference unchecked() const & { + if (Dims >= 0 && ndim() != Dims) { + throw std::domain_error("array has incorrect number of dimensions: " + + std::to_string(ndim()) + "; expected " + + std::to_string(Dims)); + } + return detail::unchecked_reference(data(), shape(), strides(), ndim()); + } + + /// Return a new view with all of the dimensions of length 1 removed + array squeeze() { + auto &api = detail::npy_api::get(); + return reinterpret_steal(api.PyArray_Squeeze_(m_ptr)); + } + + /// Resize array to given shape + /// If refcheck is true and more that one reference exist to this array + /// then resize will succeed only if it makes a reshape, i.e. 
original size doesn't change + void resize(ShapeContainer new_shape, bool refcheck = true) { + detail::npy_api::PyArray_Dims d + = {// Use reinterpret_cast for PyPy on Windows (remove if fixed, checked on 7.3.1) + reinterpret_cast(new_shape->data()), + int(new_shape->size())}; + // try to resize, set ordering param to -1 cause it's not used anyway + auto new_array = reinterpret_steal( + detail::npy_api::get().PyArray_Resize_(m_ptr, &d, int(refcheck), -1)); + if (!new_array) { + throw error_already_set(); + } + if (isinstance(new_array)) { + *this = std::move(new_array); + } + } + + /// Optional `order` parameter omitted, to be added as needed. + array reshape(ShapeContainer new_shape) { + detail::npy_api::PyArray_Dims d + = {reinterpret_cast(new_shape->data()), int(new_shape->size())}; + auto new_array + = reinterpret_steal(detail::npy_api::get().PyArray_Newshape_(m_ptr, &d, 0)); + if (!new_array) { + throw error_already_set(); + } + return new_array; + } + + /// Create a view of an array in a different data type. + /// This function may fundamentally reinterpret the data in the array. + /// It is the responsibility of the caller to ensure that this is safe. + /// Only supports the `dtype` argument, the `type` argument is omitted, + /// to be added as needed. + array view(const std::string &dtype) { + auto &api = detail::npy_api::get(); + auto new_view = reinterpret_steal(api.PyArray_View_( + m_ptr, dtype::from_args(pybind11::str(dtype)).release().ptr(), nullptr)); + if (!new_view) { + throw error_already_set(); + } + return new_view; + } + + /// Ensure that the argument is a NumPy array + /// In case of an error, nullptr is returned and the Python error is cleared. 
+ static array ensure(handle h, int ExtraFlags = 0) { + auto result = reinterpret_steal(raw_array(h.ptr(), ExtraFlags)); + if (!result) { + PyErr_Clear(); + } + return result; + } + +protected: + template + friend struct detail::npy_format_descriptor; + + void fail_dim_check(ssize_t dim, const std::string &msg) const { + throw index_error(msg + ": " + std::to_string(dim) + " (ndim = " + std::to_string(ndim()) + + ')'); + } + + template + ssize_t byte_offset(Ix... index) const { + check_dimensions(index...); + return detail::byte_offset_unsafe(strides(), ssize_t(index)...); + } + + void check_writeable() const { + if (!writeable()) { + throw std::domain_error("array is not writeable"); + } + } + + template + void check_dimensions(Ix... index) const { + check_dimensions_impl(ssize_t(0), shape(), ssize_t(index)...); + } + + void check_dimensions_impl(ssize_t, const ssize_t *) const {} + + template + void check_dimensions_impl(ssize_t axis, const ssize_t *shape, ssize_t i, Ix... index) const { + if (i >= *shape) { + throw index_error(std::string("index ") + std::to_string(i) + + " is out of bounds for axis " + std::to_string(axis) + + " with size " + std::to_string(*shape)); + } + check_dimensions_impl(axis + 1, shape + 1, index...); + } + + /// Create array from any object -- always returns a new reference + static PyObject *raw_array(PyObject *ptr, int ExtraFlags = 0) { + if (ptr == nullptr) { + set_error(PyExc_ValueError, "cannot create a pybind11::array from a nullptr"); + return nullptr; + } + return detail::npy_api::get().PyArray_FromAny_( + ptr, nullptr, 0, 0, detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr); + } +}; + +template +class array_t : public array { +private: + struct private_ctor {}; + // Delegating constructor needed when both moving and accessing in the same constructor + array_t(private_ctor, + ShapeContainer &&shape, + StridesContainer &&strides, + const T *ptr, + handle base) + : array(std::move(shape), std::move(strides), ptr, 
base) {} + +public: + static_assert(!detail::array_info::is_array, "Array types cannot be used with array_t"); + + using value_type = T; + + array_t() : array(0, static_cast(nullptr)) {} + array_t(handle h, borrowed_t) : array(h, borrowed_t{}) {} + array_t(handle h, stolen_t) : array(h, stolen_t{}) {} + + PYBIND11_DEPRECATED("Use array_t::ensure() instead") + array_t(handle h, bool is_borrowed) : array(raw_array_t(h.ptr()), stolen_t{}) { + if (!m_ptr) { + PyErr_Clear(); + } + if (!is_borrowed) { + Py_XDECREF(h.ptr()); + } + } + + // NOLINTNEXTLINE(google-explicit-constructor) + array_t(const object &o) : array(raw_array_t(o.ptr()), stolen_t{}) { + if (!m_ptr) { + throw error_already_set(); + } + } + + explicit array_t(const buffer_info &info, handle base = handle()) : array(info, base) {} + + array_t(ShapeContainer shape, + StridesContainer strides, + const T *ptr = nullptr, + handle base = handle()) + : array(std::move(shape), std::move(strides), ptr, base) {} + + explicit array_t(ShapeContainer shape, const T *ptr = nullptr, handle base = handle()) + : array_t(private_ctor{}, + std::move(shape), + (ExtraFlags & f_style) != 0 ? detail::f_strides(*shape, itemsize()) + : detail::c_strides(*shape, itemsize()), + ptr, + base) {} + + explicit array_t(ssize_t count, const T *ptr = nullptr, handle base = handle()) + : array({count}, {}, ptr, base) {} + + constexpr ssize_t itemsize() const { return sizeof(T); } + + template + ssize_t index_at(Ix... index) const { + return offset_at(index...) / itemsize(); + } + + template + const T *data(Ix... index) const { + return static_cast(array::data(index...)); + } + + template + T *mutable_data(Ix... index) { + return static_cast(array::mutable_data(index...)); + } + + // Reference to element at a given index + template + const T &at(Ix... 
index) const { + if ((ssize_t) sizeof...(index) != ndim()) { + fail_dim_check(sizeof...(index), "index dimension mismatch"); + } + return *(static_cast(array::data()) + + byte_offset(ssize_t(index)...) / itemsize()); + } + + // Mutable reference to element at a given index + template + T &mutable_at(Ix... index) { + if ((ssize_t) sizeof...(index) != ndim()) { + fail_dim_check(sizeof...(index), "index dimension mismatch"); + } + return *(static_cast(array::mutable_data()) + + byte_offset(ssize_t(index)...) / itemsize()); + } + + /** + * Returns a proxy object that provides access to the array's data without bounds or + * dimensionality checking. Will throw if the array is missing the `writeable` flag. Use with + * care: the array must not be destroyed or reshaped for the duration of the returned object, + * and the caller must take care not to access invalid dimensions or dimension indices. + */ + template + detail::unchecked_mutable_reference mutable_unchecked() & { + return array::mutable_unchecked(); + } + + /** + * Returns a proxy object that provides const access to the array's data without bounds or + * dimensionality checking. Unlike `mutable_unchecked()`, this does not require that the + * underlying array have the `writable` flag. Use with care: the array must not be destroyed + * or reshaped for the duration of the returned object, and the caller must take care not to + * access invalid dimensions or dimension indices. + */ + template + detail::unchecked_reference unchecked() const & { + return array::unchecked(); + } + + /// Ensure that the argument is a NumPy array of the correct dtype (and if not, try to convert + /// it). In case of an error, nullptr is returned and the Python error is cleared. 
+ static array_t ensure(handle h) { + auto result = reinterpret_steal(raw_array_t(h.ptr())); + if (!result) { + PyErr_Clear(); + } + return result; + } + + static bool check_(handle h) { + const auto &api = detail::npy_api::get(); + return api.PyArray_Check_(h.ptr()) + && api.PyArray_EquivTypes_(detail::array_proxy(h.ptr())->descr, + dtype::of().ptr()) + && detail::check_flags(h.ptr(), ExtraFlags & (array::c_style | array::f_style)); + } + +protected: + /// Create array from any object -- always returns a new reference + static PyObject *raw_array_t(PyObject *ptr) { + if (ptr == nullptr) { + set_error(PyExc_ValueError, "cannot create a pybind11::array_t from a nullptr"); + return nullptr; + } + return detail::npy_api::get().PyArray_FromAny_(ptr, + dtype::of().release().ptr(), + 0, + 0, + detail::npy_api::NPY_ARRAY_ENSUREARRAY_ + | ExtraFlags, + nullptr); + } +}; + +template +struct format_descriptor::value>> { + static std::string format() { + return detail::npy_format_descriptor::type>::format(); + } +}; + +template +struct format_descriptor { + static std::string format() { return std::to_string(N) + 's'; } +}; +template +struct format_descriptor> { + static std::string format() { return std::to_string(N) + 's'; } +}; + +template +struct format_descriptor::value>> { + static std::string format() { + return format_descriptor< + typename std::remove_cv::type>::type>::format(); + } +}; + +template +struct format_descriptor::is_array>> { + static std::string format() { + using namespace detail; + static constexpr auto extents = const_name("(") + array_info::extents + const_name(")"); + return extents.text + format_descriptor>::format(); + } +}; + +PYBIND11_NAMESPACE_BEGIN(detail) +template +struct pyobject_caster> { + using type = array_t; + + bool load(handle src, bool convert) { + if (!convert && !type::check_(src)) { + return false; + } + value = type::ensure(src); + return static_cast(value); + } + + static handle cast(const handle &src, return_value_policy /* 
policy */, handle /* parent */) { + return src.inc_ref(); + } + PYBIND11_TYPE_CASTER(type, handle_type_name::name); +}; + +template +struct compare_buffer_info::value>> { + static bool compare(const buffer_info &b) { + return npy_api::get().PyArray_EquivTypes_(dtype::of().ptr(), dtype(b).ptr()); + } +}; + +template +struct npy_format_descriptor_name; + +template +struct npy_format_descriptor_name::value>> { + static constexpr auto name = const_name::value>( + const_name("bool"), + const_name::value>("numpy.int", "numpy.uint") + + const_name()); +}; + +template +struct npy_format_descriptor_name::value>> { + static constexpr auto name = const_name < std::is_same::value + || std::is_same::value + || std::is_same::value + || std::is_same::value + > (const_name("numpy.float") + const_name(), + const_name("numpy.longdouble")); +}; + +template +struct npy_format_descriptor_name::value>> { + static constexpr auto name = const_name < std::is_same::value + || std::is_same::value + || std::is_same::value + || std::is_same::value + > (const_name("numpy.complex") + + const_name(), + const_name("numpy.longcomplex")); +}; + +template +struct npy_format_descriptor< + T, + enable_if_t::value>> + : npy_format_descriptor_name { +private: + // NB: the order here must match the one in common.h + constexpr static const int values[15] = {npy_api::NPY_BOOL_, + npy_api::NPY_BYTE_, + npy_api::NPY_UBYTE_, + npy_api::NPY_INT16_, + npy_api::NPY_UINT16_, + npy_api::NPY_INT32_, + npy_api::NPY_UINT32_, + npy_api::NPY_INT64_, + npy_api::NPY_UINT64_, + npy_api::NPY_FLOAT_, + npy_api::NPY_DOUBLE_, + npy_api::NPY_LONGDOUBLE_, + npy_api::NPY_CFLOAT_, + npy_api::NPY_CDOUBLE_, + npy_api::NPY_CLONGDOUBLE_}; + +public: + static constexpr int value = values[detail::is_fmt_numeric::index]; + + static pybind11::dtype dtype() { return pybind11::dtype(/*typenum*/ value); } +}; + +template +struct npy_format_descriptor::value>> { + static constexpr auto name = const_name("object"); + + static constexpr int 
value = npy_api::NPY_OBJECT_; + + static pybind11::dtype dtype() { return pybind11::dtype(/*typenum*/ value); } +}; + +#define PYBIND11_DECL_CHAR_FMT \ + static constexpr auto name = const_name("S") + const_name(); \ + static pybind11::dtype dtype() { \ + return pybind11::dtype(std::string("S") + std::to_string(N)); \ + } +template +struct npy_format_descriptor { + PYBIND11_DECL_CHAR_FMT +}; +template +struct npy_format_descriptor> { + PYBIND11_DECL_CHAR_FMT +}; +#undef PYBIND11_DECL_CHAR_FMT + +template +struct npy_format_descriptor::is_array>> { +private: + using base_descr = npy_format_descriptor::type>; + +public: + static_assert(!array_info::is_empty, "Zero-sized arrays are not supported"); + + static constexpr auto name + = const_name("(") + array_info::extents + const_name(")") + base_descr::name; + static pybind11::dtype dtype() { + list shape; + array_info::append_extents(shape); + return pybind11::dtype::from_args( + pybind11::make_tuple(base_descr::dtype(), std::move(shape))); + } +}; + +template +struct npy_format_descriptor::value>> { +private: + using base_descr = npy_format_descriptor::type>; + +public: + static constexpr auto name = base_descr::name; + static pybind11::dtype dtype() { return base_descr::dtype(); } +}; + +struct field_descriptor { + const char *name; + ssize_t offset; + ssize_t size; + std::string format; + dtype descr; +}; + +PYBIND11_NOINLINE void register_structured_dtype(any_container fields, + const std::type_info &tinfo, + ssize_t itemsize, + bool (*direct_converter)(PyObject *, void *&)) { + + auto &numpy_internals = get_numpy_internals(); + if (numpy_internals.get_type_info(tinfo, false)) { + pybind11_fail("NumPy: dtype is already registered"); + } + + // Use ordered fields because order matters as of NumPy 1.14: + // https://docs.scipy.org/doc/numpy/release.html#multiple-field-indexing-assignment-of-structured-arrays + std::vector ordered_fields(std::move(fields)); + std::sort( + ordered_fields.begin(), + 
ordered_fields.end(), + [](const field_descriptor &a, const field_descriptor &b) { return a.offset < b.offset; }); + + list names, formats, offsets; + for (auto &field : ordered_fields) { + if (!field.descr) { + pybind11_fail(std::string("NumPy: unsupported field dtype: `") + field.name + "` @ " + + tinfo.name()); + } + names.append(pybind11::str(field.name)); + formats.append(field.descr); + offsets.append(pybind11::int_(field.offset)); + } + auto *dtype_ptr + = pybind11::dtype(std::move(names), std::move(formats), std::move(offsets), itemsize) + .release() + .ptr(); + + // There is an existing bug in NumPy (as of v1.11): trailing bytes are + // not encoded explicitly into the format string. This will supposedly + // get fixed in v1.12; for further details, see these: + // - https://github.com/numpy/numpy/issues/7797 + // - https://github.com/numpy/numpy/pull/7798 + // Because of this, we won't use numpy's logic to generate buffer format + // strings and will just do it ourselves. + ssize_t offset = 0; + std::ostringstream oss; + // mark the structure as unaligned with '^', because numpy and C++ don't + // always agree about alignment (particularly for complex), and we're + // explicitly listing all our padding. This depends on none of the fields + // overriding the endianness. 
Putting the ^ in front of individual fields + // isn't guaranteed to work due to https://github.com/numpy/numpy/issues/9049 + oss << "^T{"; + for (auto &field : ordered_fields) { + if (field.offset > offset) { + oss << (field.offset - offset) << 'x'; + } + oss << field.format << ':' << field.name << ':'; + offset = field.offset + field.size; + } + if (itemsize > offset) { + oss << (itemsize - offset) << 'x'; + } + oss << '}'; + auto format_str = oss.str(); + + // Smoke test: verify that NumPy properly parses our buffer format string + auto &api = npy_api::get(); + auto arr = array(buffer_info(nullptr, itemsize, format_str, 1)); + if (!api.PyArray_EquivTypes_(dtype_ptr, arr.dtype().ptr())) { + pybind11_fail("NumPy: invalid buffer descriptor!"); + } + + auto tindex = std::type_index(tinfo); + numpy_internals.registered_dtypes[tindex] = {dtype_ptr, std::move(format_str)}; + with_internals([tindex, &direct_converter](internals &internals) { + internals.direct_conversions[tindex].push_back(direct_converter); + }); +} + +template +struct npy_format_descriptor { + static_assert(is_pod_struct::value, + "Attempt to use a non-POD or unimplemented POD type as a numpy dtype"); + + static constexpr auto name = make_caster::name; + + static pybind11::dtype dtype() { return reinterpret_borrow(dtype_ptr()); } + + static std::string format() { + static auto format_str = get_numpy_internals().get_type_info(true)->format_str; + return format_str; + } + + static void register_dtype(any_container fields) { + register_structured_dtype(std::move(fields), + typeid(typename std::remove_cv::type), + sizeof(T), + &direct_converter); + } + +private: + static PyObject *dtype_ptr() { + static PyObject *ptr = get_numpy_internals().get_type_info(true)->dtype_ptr; + return ptr; + } + + static bool direct_converter(PyObject *obj, void *&value) { + auto &api = npy_api::get(); + if (!PyObject_TypeCheck(obj, api.PyVoidArrType_Type_)) { + return false; + } + if (auto descr = 
reinterpret_steal(api.PyArray_DescrFromScalar_(obj))) { + if (api.PyArray_EquivTypes_(dtype_ptr(), descr.ptr())) { + value = ((PyVoidScalarObject_Proxy *) obj)->obval; + return true; + } + } + return false; + } +}; + +#ifdef __CLION_IDE__ // replace heavy macro with dummy code for the IDE (doesn't affect code) +# define PYBIND11_NUMPY_DTYPE(Type, ...) ((void) 0) +# define PYBIND11_NUMPY_DTYPE_EX(Type, ...) ((void) 0) +#else + +# define PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, Name) \ + ::pybind11::detail::field_descriptor { \ + Name, offsetof(T, Field), sizeof(decltype(std::declval().Field)), \ + ::pybind11::format_descriptor().Field)>::format(), \ + ::pybind11::detail::npy_format_descriptor< \ + decltype(std::declval().Field)>::dtype() \ + } + +// Extract name, offset and format descriptor for a struct field +# define PYBIND11_FIELD_DESCRIPTOR(T, Field) PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, #Field) + +// The main idea of this macro is borrowed from https://github.com/swansontec/map-macro +// (C) William Swanson, Paul Fultz +# define PYBIND11_EVAL0(...) __VA_ARGS__ +# define PYBIND11_EVAL1(...) PYBIND11_EVAL0(PYBIND11_EVAL0(PYBIND11_EVAL0(__VA_ARGS__))) +# define PYBIND11_EVAL2(...) PYBIND11_EVAL1(PYBIND11_EVAL1(PYBIND11_EVAL1(__VA_ARGS__))) +# define PYBIND11_EVAL3(...) PYBIND11_EVAL2(PYBIND11_EVAL2(PYBIND11_EVAL2(__VA_ARGS__))) +# define PYBIND11_EVAL4(...) PYBIND11_EVAL3(PYBIND11_EVAL3(PYBIND11_EVAL3(__VA_ARGS__))) +# define PYBIND11_EVAL(...) PYBIND11_EVAL4(PYBIND11_EVAL4(PYBIND11_EVAL4(__VA_ARGS__))) +# define PYBIND11_MAP_END(...) +# define PYBIND11_MAP_OUT +# define PYBIND11_MAP_COMMA , +# define PYBIND11_MAP_GET_END() 0, PYBIND11_MAP_END +# define PYBIND11_MAP_NEXT0(test, next, ...) 
next PYBIND11_MAP_OUT +# define PYBIND11_MAP_NEXT1(test, next) PYBIND11_MAP_NEXT0(test, next, 0) +# define PYBIND11_MAP_NEXT(test, next) PYBIND11_MAP_NEXT1(PYBIND11_MAP_GET_END test, next) +# if defined(_MSC_VER) \ + && !defined(__clang__) // MSVC is not as eager to expand macros, hence this workaround +# define PYBIND11_MAP_LIST_NEXT1(test, next) \ + PYBIND11_EVAL0(PYBIND11_MAP_NEXT0(test, PYBIND11_MAP_COMMA next, 0)) +# else +# define PYBIND11_MAP_LIST_NEXT1(test, next) \ + PYBIND11_MAP_NEXT0(test, PYBIND11_MAP_COMMA next, 0) +# endif +# define PYBIND11_MAP_LIST_NEXT(test, next) \ + PYBIND11_MAP_LIST_NEXT1(PYBIND11_MAP_GET_END test, next) +# define PYBIND11_MAP_LIST0(f, t, x, peek, ...) \ + f(t, x) PYBIND11_MAP_LIST_NEXT(peek, PYBIND11_MAP_LIST1)(f, t, peek, __VA_ARGS__) +# define PYBIND11_MAP_LIST1(f, t, x, peek, ...) \ + f(t, x) PYBIND11_MAP_LIST_NEXT(peek, PYBIND11_MAP_LIST0)(f, t, peek, __VA_ARGS__) +// PYBIND11_MAP_LIST(f, t, a1, a2, ...) expands to f(t, a1), f(t, a2), ... +# define PYBIND11_MAP_LIST(f, t, ...) \ + PYBIND11_EVAL(PYBIND11_MAP_LIST1(f, t, __VA_ARGS__, (), 0)) + +# define PYBIND11_NUMPY_DTYPE(Type, ...) \ + ::pybind11::detail::npy_format_descriptor::register_dtype( \ + ::std::vector<::pybind11::detail::field_descriptor>{ \ + PYBIND11_MAP_LIST(PYBIND11_FIELD_DESCRIPTOR, Type, __VA_ARGS__)}) + +# if defined(_MSC_VER) && !defined(__clang__) +# define PYBIND11_MAP2_LIST_NEXT1(test, next) \ + PYBIND11_EVAL0(PYBIND11_MAP_NEXT0(test, PYBIND11_MAP_COMMA next, 0)) +# else +# define PYBIND11_MAP2_LIST_NEXT1(test, next) \ + PYBIND11_MAP_NEXT0(test, PYBIND11_MAP_COMMA next, 0) +# endif +# define PYBIND11_MAP2_LIST_NEXT(test, next) \ + PYBIND11_MAP2_LIST_NEXT1(PYBIND11_MAP_GET_END test, next) +# define PYBIND11_MAP2_LIST0(f, t, x1, x2, peek, ...) \ + f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT(peek, PYBIND11_MAP2_LIST1)(f, t, peek, __VA_ARGS__) +# define PYBIND11_MAP2_LIST1(f, t, x1, x2, peek, ...) 
\ + f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT(peek, PYBIND11_MAP2_LIST0)(f, t, peek, __VA_ARGS__) +// PYBIND11_MAP2_LIST(f, t, a1, a2, ...) expands to f(t, a1, a2), f(t, a3, a4), ... +# define PYBIND11_MAP2_LIST(f, t, ...) \ + PYBIND11_EVAL(PYBIND11_MAP2_LIST1(f, t, __VA_ARGS__, (), 0)) + +# define PYBIND11_NUMPY_DTYPE_EX(Type, ...) \ + ::pybind11::detail::npy_format_descriptor::register_dtype( \ + ::std::vector<::pybind11::detail::field_descriptor>{ \ + PYBIND11_MAP2_LIST(PYBIND11_FIELD_DESCRIPTOR_EX, Type, __VA_ARGS__)}) + +#endif // __CLION_IDE__ + +class common_iterator { +public: + using container_type = std::vector; + using value_type = container_type::value_type; + using size_type = container_type::size_type; + + common_iterator() : m_strides() {} + + common_iterator(void *ptr, const container_type &strides, const container_type &shape) + : p_ptr(reinterpret_cast(ptr)), m_strides(strides.size()) { + m_strides.back() = static_cast(strides.back()); + for (size_type i = m_strides.size() - 1; i != 0; --i) { + size_type j = i - 1; + auto s = static_cast(shape[i]); + m_strides[j] = strides[j] + m_strides[i] - strides[i] * s; + } + } + + void increment(size_type dim) { p_ptr += m_strides[dim]; } + + void *data() const { return p_ptr; } + +private: + char *p_ptr{nullptr}; + container_type m_strides; +}; + +template +class multi_array_iterator { +public: + using container_type = std::vector; + + multi_array_iterator(const std::array &buffers, const container_type &shape) + : m_shape(shape.size()), m_index(shape.size(), 0), m_common_iterator() { + + // Manual copy to avoid conversion warning if using std::copy + for (size_t i = 0; i < shape.size(); ++i) { + m_shape[i] = shape[i]; + } + + container_type strides(shape.size()); + for (size_t i = 0; i < N; ++i) { + init_common_iterator(buffers[i], shape, m_common_iterator[i], strides); + } + } + + multi_array_iterator &operator++() { + for (size_t j = m_index.size(); j != 0; --j) { + size_t i = j - 1; + if (++m_index[i] != 
m_shape[i]) { + increment_common_iterator(i); + break; + } + m_index[i] = 0; + } + return *this; + } + + template + T *data() const { + return reinterpret_cast(m_common_iterator[K].data()); + } + +private: + using common_iter = common_iterator; + + void init_common_iterator(const buffer_info &buffer, + const container_type &shape, + common_iter &iterator, + container_type &strides) { + auto buffer_shape_iter = buffer.shape.rbegin(); + auto buffer_strides_iter = buffer.strides.rbegin(); + auto shape_iter = shape.rbegin(); + auto strides_iter = strides.rbegin(); + + while (buffer_shape_iter != buffer.shape.rend()) { + if (*shape_iter == *buffer_shape_iter) { + *strides_iter = *buffer_strides_iter; + } else { + *strides_iter = 0; + } + + ++buffer_shape_iter; + ++buffer_strides_iter; + ++shape_iter; + ++strides_iter; + } + + std::fill(strides_iter, strides.rend(), 0); + iterator = common_iter(buffer.ptr, strides, shape); + } + + void increment_common_iterator(size_t dim) { + for (auto &iter : m_common_iterator) { + iter.increment(dim); + } + } + + container_type m_shape; + container_type m_index; + std::array m_common_iterator; +}; + +enum class broadcast_trivial { non_trivial, c_trivial, f_trivial }; + +// Populates the shape and number of dimensions for the set of buffers. Returns a +// broadcast_trivial enum value indicating whether the broadcast is "trivial"--that is, has each +// buffer being either a singleton or a full-size, C-contiguous (`c_trivial`) or Fortran-contiguous +// (`f_trivial`) storage buffer; returns `non_trivial` otherwise. +template +broadcast_trivial +broadcast(const std::array &buffers, ssize_t &ndim, std::vector &shape) { + ndim = std::accumulate( + buffers.begin(), buffers.end(), ssize_t(0), [](ssize_t res, const buffer_info &buf) { + return std::max(res, buf.ndim); + }); + + shape.clear(); + shape.resize((size_t) ndim, 1); + + // Figure out the output size, and make sure all input arrays conform (i.e. 
are either size 1 + // or the full size). + for (size_t i = 0; i < N; ++i) { + auto res_iter = shape.rbegin(); + auto end = buffers[i].shape.rend(); + for (auto shape_iter = buffers[i].shape.rbegin(); shape_iter != end; + ++shape_iter, ++res_iter) { + const auto &dim_size_in = *shape_iter; + auto &dim_size_out = *res_iter; + + // Each input dimension can either be 1 or `n`, but `n` values must match across + // buffers + if (dim_size_out == 1) { + dim_size_out = dim_size_in; + } else if (dim_size_in != 1 && dim_size_in != dim_size_out) { + pybind11_fail("pybind11::vectorize: incompatible size/dimension of inputs!"); + } + } + } + + bool trivial_broadcast_c = true; + bool trivial_broadcast_f = true; + for (size_t i = 0; i < N && (trivial_broadcast_c || trivial_broadcast_f); ++i) { + if (buffers[i].size == 1) { + continue; + } + + // Require the same number of dimensions: + if (buffers[i].ndim != ndim) { + return broadcast_trivial::non_trivial; + } + + // Require all dimensions be full-size: + if (!std::equal(buffers[i].shape.cbegin(), buffers[i].shape.cend(), shape.cbegin())) { + return broadcast_trivial::non_trivial; + } + + // Check for C contiguity (but only if previous inputs were also C contiguous) + if (trivial_broadcast_c) { + ssize_t expect_stride = buffers[i].itemsize; + auto end = buffers[i].shape.crend(); + for (auto shape_iter = buffers[i].shape.crbegin(), + stride_iter = buffers[i].strides.crbegin(); + trivial_broadcast_c && shape_iter != end; + ++shape_iter, ++stride_iter) { + if (expect_stride == *stride_iter) { + expect_stride *= *shape_iter; + } else { + trivial_broadcast_c = false; + } + } + } + + // Check for Fortran contiguity (if previous inputs were also F contiguous) + if (trivial_broadcast_f) { + ssize_t expect_stride = buffers[i].itemsize; + auto end = buffers[i].shape.cend(); + for (auto shape_iter = buffers[i].shape.cbegin(), + stride_iter = buffers[i].strides.cbegin(); + trivial_broadcast_f && shape_iter != end; + ++shape_iter, 
++stride_iter) { + if (expect_stride == *stride_iter) { + expect_stride *= *shape_iter; + } else { + trivial_broadcast_f = false; + } + } + } + } + + return trivial_broadcast_c ? broadcast_trivial::c_trivial + : trivial_broadcast_f ? broadcast_trivial::f_trivial + : broadcast_trivial::non_trivial; +} + +template +struct vectorize_arg { + static_assert(!std::is_rvalue_reference::value, + "Functions with rvalue reference arguments cannot be vectorized"); + // The wrapped function gets called with this type: + using call_type = remove_reference_t; + // Is this a vectorized argument? + static constexpr bool vectorize + = satisfies_any_of::value + && satisfies_none_of::value + && (!std::is_reference::value + || (std::is_lvalue_reference::value && std::is_const::value)); + // Accept this type: an array for vectorized types, otherwise the type as-is: + using type = conditional_t, array::forcecast>, T>; +}; + +// py::vectorize when a return type is present +template +struct vectorize_returned_array { + using Type = array_t; + + static Type create(broadcast_trivial trivial, const std::vector &shape) { + if (trivial == broadcast_trivial::f_trivial) { + return array_t(shape); + } + return array_t(shape); + } + + static Return *mutable_data(Type &array) { return array.mutable_data(); } + + static Return call(Func &f, Args &...args) { return f(args...); } + + static void call(Return *out, size_t i, Func &f, Args &...args) { out[i] = f(args...); } +}; + +// py::vectorize when a return type is not present +template +struct vectorize_returned_array { + using Type = none; + + static Type create(broadcast_trivial, const std::vector &) { return none(); } + + static void *mutable_data(Type &) { return nullptr; } + + static detail::void_type call(Func &f, Args &...args) { + f(args...); + return {}; + } + + static void call(void *, size_t, Func &f, Args &...args) { f(args...); } +}; + +template +struct vectorize_helper { + +// NVCC for some reason breaks if NVectorized is private 
+#ifdef __CUDACC__ +public: +#else +private: +#endif + + static constexpr size_t N = sizeof...(Args); + static constexpr size_t NVectorized = constexpr_sum(vectorize_arg::vectorize...); + static_assert( + NVectorized >= 1, + "pybind11::vectorize(...) requires a function with at least one vectorizable argument"); + +public: + template ::type>::value>> + explicit vectorize_helper(T &&f) : f(std::forward(f)) {} + + object operator()(typename vectorize_arg::type... args) { + return run(args..., + make_index_sequence(), + select_indices::vectorize...>(), + make_index_sequence()); + } + +private: + remove_reference_t f; + + // Internal compiler error in MSVC 19.16.27025.1 (Visual Studio 2017 15.9.4), when compiling + // with "/permissive-" flag when arg_call_types is manually inlined. + using arg_call_types = std::tuple::call_type...>; + template + using param_n_t = typename std::tuple_element::type; + + using returned_array = vectorize_returned_array; + + // Runs a vectorized function given arguments tuple and three index sequences: + // - Index is the full set of 0 ... (N-1) argument indices; + // - VIndex is the subset of argument indices with vectorized parameters, letting us access + // vectorized arguments (anything not in this sequence is passed through) + // - BIndex is a incremental sequence (beginning at 0) of the same size as VIndex, so that + // we can store vectorized buffer_infos in an array (argument VIndex has its buffer at + // index BIndex in the array). + template + object run(typename vectorize_arg::type &...args, + index_sequence i_seq, + index_sequence vi_seq, + index_sequence bi_seq) { + + // Pointers to values the function was called with; the vectorized ones set here will start + // out as array_t pointers, but they will be changed them to T pointers before we make + // call the wrapped function. Non-vectorized pointers are left as-is. 
+ std::array params{{reinterpret_cast(&args)...}}; + + // The array of `buffer_info`s of vectorized arguments: + std::array buffers{ + {reinterpret_cast(params[VIndex])->request()...}}; + + /* Determine dimensions parameters of output array */ + ssize_t nd = 0; + std::vector shape(0); + auto trivial = broadcast(buffers, nd, shape); + auto ndim = (size_t) nd; + + size_t size + = std::accumulate(shape.begin(), shape.end(), (size_t) 1, std::multiplies()); + + // If all arguments are 0-dimension arrays (i.e. single values) return a plain value (i.e. + // not wrapped in an array). + if (size == 1 && ndim == 0) { + PYBIND11_EXPAND_SIDE_EFFECTS(params[VIndex] = buffers[BIndex].ptr); + return cast( + returned_array::call(f, *reinterpret_cast *>(params[Index])...)); + } + + auto result = returned_array::create(trivial, shape); + + PYBIND11_WARNING_PUSH +#ifdef PYBIND11_DETECTED_CLANG_WITH_MISLEADING_CALL_STD_MOVE_EXPLICITLY_WARNING + PYBIND11_WARNING_DISABLE_CLANG("-Wreturn-std-move") +#endif + + if (size == 0) { + return result; + } + + /* Call the function */ + auto *mutable_data = returned_array::mutable_data(result); + if (trivial == broadcast_trivial::non_trivial) { + apply_broadcast(buffers, params, mutable_data, size, shape, i_seq, vi_seq, bi_seq); + } else { + apply_trivial(buffers, params, mutable_data, size, i_seq, vi_seq, bi_seq); + } + + return result; + PYBIND11_WARNING_POP + } + + template + void apply_trivial(std::array &buffers, + std::array ¶ms, + Return *out, + size_t size, + index_sequence, + index_sequence, + index_sequence) { + + // Initialize an array of mutable byte references and sizes with references set to the + // appropriate pointer in `params`; as we iterate, we'll increment each pointer by its size + // (except for singletons, which get an increment of 0). + std::array, NVectorized> vecparams{ + {std::pair( + reinterpret_cast(params[VIndex] = buffers[BIndex].ptr), + buffers[BIndex].size == 1 ? 
0 : sizeof(param_n_t))...}}; + + for (size_t i = 0; i < size; ++i) { + returned_array::call( + out, i, f, *reinterpret_cast *>(params[Index])...); + for (auto &x : vecparams) { + x.first += x.second; + } + } + } + + template + void apply_broadcast(std::array &buffers, + std::array ¶ms, + Return *out, + size_t size, + const std::vector &output_shape, + index_sequence, + index_sequence, + index_sequence) { + + multi_array_iterator input_iter(buffers, output_shape); + + for (size_t i = 0; i < size; ++i, ++input_iter) { + PYBIND11_EXPAND_SIDE_EFFECTS((params[VIndex] = input_iter.template data())); + returned_array::call( + out, i, f, *reinterpret_cast *>(std::get(params))...); + } + } +}; + +template +vectorize_helper vectorize_extractor(const Func &f, Return (*)(Args...)) { + return detail::vectorize_helper(f); +} + +template +struct handle_type_name> { + static constexpr auto name + = const_name("numpy.ndarray[") + npy_format_descriptor::name + const_name("]"); +}; + +PYBIND11_NAMESPACE_END(detail) + +// Vanilla pointer vectorizer: +template +detail::vectorize_helper vectorize(Return (*f)(Args...)) { + return detail::vectorize_helper(f); +} + +// lambda vectorizer: +template ::value, int> = 0> +auto vectorize(Func &&f) + -> decltype(detail::vectorize_extractor(std::forward(f), + (detail::function_signature_t *) nullptr)) { + return detail::vectorize_extractor(std::forward(f), + (detail::function_signature_t *) nullptr); +} + +// Vectorize a class method (non-const): +template ())), + Return, + Class *, + Args...>> +Helper vectorize(Return (Class::*f)(Args...)) { + return Helper(std::mem_fn(f)); +} + +// Vectorize a class method (const): +template ())), + Return, + const Class *, + Args...>> +Helper vectorize(Return (Class::*f)(Args...) 
const) { + return Helper(std::mem_fn(f)); +} + +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/stl/filesystem.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/stl/filesystem.h new file mode 100644 index 0000000000000000000000000000000000000000..c16a9ae5c2b076cc93671772de04e70c9075893d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/stl/filesystem.h @@ -0,0 +1,124 @@ +// Copyright (c) 2021 The Pybind Development Team. +// All rights reserved. Use of this source code is governed by a +// BSD-style license that can be found in the LICENSE file. + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#ifdef __has_include +# if defined(PYBIND11_CPP17) +# if __has_include() +# include +# define PYBIND11_HAS_FILESYSTEM 1 +# elif __has_include() +# include +# define PYBIND11_HAS_EXPERIMENTAL_FILESYSTEM 1 +# endif +# endif +#endif + +#if !defined(PYBIND11_HAS_FILESYSTEM) && !defined(PYBIND11_HAS_EXPERIMENTAL_FILESYSTEM) \ + && !defined(PYBIND11_HAS_FILESYSTEM_IS_OPTIONAL) +# error \ + "Neither #include nor #include (__VA_ARGS__)) +#endif + +#if defined(PYBIND11_HAS_FILESYSTEM) || defined(PYBIND11_HAS_EXPERIMENTAL_FILESYSTEM) +template +struct path_caster { + +private: + static PyObject *unicode_from_fs_native(const std::string &w) { +# if !defined(PYPY_VERSION) + return PyUnicode_DecodeFSDefaultAndSize(w.c_str(), ssize_t(w.size())); +# else + // PyPy mistakenly declares the first parameter as non-const. 
+ return PyUnicode_DecodeFSDefaultAndSize(const_cast(w.c_str()), ssize_t(w.size())); +# endif + } + + static PyObject *unicode_from_fs_native(const std::wstring &w) { + return PyUnicode_FromWideChar(w.c_str(), ssize_t(w.size())); + } + +public: + static handle cast(const T &path, return_value_policy, handle) { + if (auto py_str = unicode_from_fs_native(path.native())) { + return module_::import("pathlib") + .attr("Path")(reinterpret_steal(py_str)) + .release(); + } + return nullptr; + } + + bool load(handle handle, bool) { + // PyUnicode_FSConverter and PyUnicode_FSDecoder normally take care of + // calling PyOS_FSPath themselves, but that's broken on PyPy (PyPy + // issue #3168) so we do it ourselves instead. + PyObject *buf = PyOS_FSPath(handle.ptr()); + if (!buf) { + PyErr_Clear(); + return false; + } + PyObject *native = nullptr; + if constexpr (std::is_same_v) { + if (PyUnicode_FSConverter(buf, PYBIND11_REINTERPRET_CAST_VOID_PTR_IF_NOT_PYPY(&native)) + != 0) { + if (auto *c_str = PyBytes_AsString(native)) { + // AsString returns a pointer to the internal buffer, which + // must not be free'd. + value = c_str; + } + } + } else if constexpr (std::is_same_v) { + if (PyUnicode_FSDecoder(buf, PYBIND11_REINTERPRET_CAST_VOID_PTR_IF_NOT_PYPY(&native)) + != 0) { + if (auto *c_str = PyUnicode_AsWideCharString(native, nullptr)) { + // AsWideCharString returns a new string that must be free'd. + value = c_str; // Copies the string. 
+ PyMem_Free(c_str); + } + } + } + Py_XDECREF(native); + Py_DECREF(buf); + if (PyErr_Occurred()) { + PyErr_Clear(); + return false; + } + return true; + } + + PYBIND11_TYPE_CASTER(T, const_name("os.PathLike")); +}; + +#endif // PYBIND11_HAS_FILESYSTEM || defined(PYBIND11_HAS_EXPERIMENTAL_FILESYSTEM) + +#if defined(PYBIND11_HAS_FILESYSTEM) +template <> +struct type_caster : public path_caster {}; +#elif defined(PYBIND11_HAS_EXPERIMENTAL_FILESYSTEM) +template <> +struct type_caster + : public path_caster {}; +#endif + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/share/cmake/pybind11/pybind11Common.cmake b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/share/cmake/pybind11/pybind11Common.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7d8d94b11d1d627a604dace61a8b0bb546f77d7b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/share/cmake/pybind11/pybind11Common.cmake @@ -0,0 +1,455 @@ +#[======================================================[.rst + +Adds the following targets:: + + pybind11::pybind11 - link to Python headers and pybind11::headers + pybind11::module - Adds module links + pybind11::embed - Adds embed links + pybind11::lto - Link time optimizations (only if CMAKE_INTERPROCEDURAL_OPTIMIZATION is not set) + pybind11::thin_lto - Link time optimizations (only if CMAKE_INTERPROCEDURAL_OPTIMIZATION is not set) + pybind11::python_link_helper - Adds link to Python libraries + pybind11::windows_extras - MSVC bigobj and mp for building multithreaded + pybind11::opt_size - avoid optimizations that increase code size + +Adds the following functions:: + + pybind11_strip(target) - strip target after building on linux/macOS + pybind11_find_import(module) - See if a module is installed. 
+ +#]======================================================] + +# CMake 3.10 has an include_guard command, but we can't use that yet +# include_guard(global) (pre-CMake 3.10) +if(TARGET pybind11::pybind11) + return() +endif() + +# If we are in subdirectory mode, all IMPORTED targets must be GLOBAL. If we +# are in CONFIG mode, they should be "normal" targets instead. +# In CMake 3.11+ you can promote a target to global after you create it, +# which might be simpler than this check. +get_property( + is_config + TARGET pybind11::headers + PROPERTY IMPORTED) +if(NOT is_config) + set(optional_global GLOBAL) +endif() + +# If not run in Python mode, we still would like this to at least +# include pybind11's include directory: +set(pybind11_INCLUDE_DIRS + "${pybind11_INCLUDE_DIR}" + CACHE INTERNAL "Include directory for pybind11 (Python not requested)") + +if(CMAKE_CROSSCOMPILING AND PYBIND11_USE_CROSSCOMPILING) + set(_PYBIND11_CROSSCOMPILING + ON + CACHE INTERNAL "") +else() + set(_PYBIND11_CROSSCOMPILING + OFF + CACHE INTERNAL "") +endif() + +# --------------------- Shared targets ---------------------------- + +# Build an interface library target: +add_library(pybind11::pybind11 IMPORTED INTERFACE ${optional_global}) +set_property( + TARGET pybind11::pybind11 + APPEND + PROPERTY INTERFACE_LINK_LIBRARIES pybind11::headers) + +# Build a module target: +add_library(pybind11::module IMPORTED INTERFACE ${optional_global}) +set_property( + TARGET pybind11::module + APPEND + PROPERTY INTERFACE_LINK_LIBRARIES pybind11::pybind11) + +# Build an embed library target: +add_library(pybind11::embed IMPORTED INTERFACE ${optional_global}) +set_property( + TARGET pybind11::embed + APPEND + PROPERTY INTERFACE_LINK_LIBRARIES pybind11::pybind11) + +# -------------- emscripten requires exceptions enabled ------------- +# _pybind11_no_exceptions is a private mechanism to disable this addition. +# Please open an issue if you need to use it; it will be removed if no one +# needs it. 
if(CMAKE_SYSTEM_NAME MATCHES Emscripten AND NOT _pybind11_no_exceptions)
  if(CMAKE_VERSION VERSION_LESS 3.13)
    message(WARNING "CMake 3.13+ is required to build for Emscripten. Some flags will be missing")
  else()
    # The headers target has a namespaced name in CONFIG mode and a plain
    # name in subdirectory mode.
    if(is_config)
      set(_tmp_config_target pybind11::pybind11_headers)
    else()
      set(_tmp_config_target pybind11_headers)
    endif()

    set_property(
      TARGET ${_tmp_config_target}
      APPEND
      PROPERTY INTERFACE_LINK_OPTIONS -fexceptions)
    set_property(
      TARGET ${_tmp_config_target}
      APPEND
      PROPERTY INTERFACE_COMPILE_OPTIONS -fexceptions)
    unset(_tmp_config_target)
  endif()
endif()

# --------------------------- link helper ---------------------------

add_library(pybind11::python_link_helper IMPORTED INTERFACE ${optional_global})

if(CMAKE_VERSION VERSION_LESS 3.13)
  # In CMake 3.11+, you can set INTERFACE properties via the normal methods, and
  # this would be simpler.
  set_property(
    TARGET pybind11::python_link_helper
    APPEND
    PROPERTY INTERFACE_LINK_LIBRARIES "$<$<PLATFORM_ID:Darwin>:-undefined dynamic_lookup>")
else()
  # link_options was added in 3.13+
  # This is safer, because you are ensured the deduplication pass in CMake will not consider
  # these separate and remove one but not the other.
  set_property(
    TARGET pybind11::python_link_helper
    APPEND
    PROPERTY INTERFACE_LINK_OPTIONS "$<$<PLATFORM_ID:Darwin>:LINKER:-undefined,dynamic_lookup>")
endif()

# ------------------------ Windows extras -------------------------

add_library(pybind11::windows_extras IMPORTED INTERFACE ${optional_global})

if(MSVC) # That's also clang-cl
  # /bigobj is needed for bigger binding projects due to the limit to 64k
  # addressable sections
  set_property(
    TARGET pybind11::windows_extras
    APPEND
    PROPERTY INTERFACE_COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CXX>:/bigobj>)

  # /MP enables multithreaded builds (relevant when there are many files) for MSVC
  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") # no Clang no Intel
    if(CMAKE_VERSION VERSION_LESS 3.11)
      set_property(
        TARGET pybind11::windows_extras
        APPEND
        PROPERTY INTERFACE_COMPILE_OPTIONS $<$<NOT:$<CONFIG:Debug>>:/MP>)
    else()
      # Only set these options for C++ files. This is important so that, for
      # instance, projects that include other types of source files like CUDA
      # .cu files don't get these options propagated to nvcc since that would
      # cause the build to fail.
      set_property(
        TARGET pybind11::windows_extras
        APPEND
        PROPERTY INTERFACE_COMPILE_OPTIONS
                 $<$<NOT:$<CONFIG:Debug>>:$<$<COMPILE_LANGUAGE:CXX>:/MP>>)
    endif()
  endif()
endif()

# ----------------------- Optimize binary size --------------------------

add_library(pybind11::opt_size IMPORTED INTERFACE ${optional_global})

if(MSVC)
  set(PYBIND11_OPT_SIZE /Os)
else()
  set(PYBIND11_OPT_SIZE -Os)
endif()

# Apply the size-optimization flag in every optimized configuration.
set_property(
  TARGET pybind11::opt_size
  APPEND
  PROPERTY INTERFACE_COMPILE_OPTIONS $<$<CONFIG:Release>:${PYBIND11_OPT_SIZE}>
           $<$<CONFIG:MinSizeRel>:${PYBIND11_OPT_SIZE}>
           $<$<CONFIG:RelWithDebInfo>:${PYBIND11_OPT_SIZE}>)

# ----------------------- Legacy option --------------------------

# Warn or error if old variable name used
if(PYBIND11_CPP_STANDARD)
  # Only the last two characters of the legacy value are kept as the standard.
  string(REGEX MATCH [[..$]] VAL "${PYBIND11_CPP_STANDARD}")
  if(CMAKE_CXX_STANDARD)
    if(NOT CMAKE_CXX_STANDARD STREQUAL VAL)
      message(WARNING "CMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} does not match "
                      "PYBIND11_CPP_STANDARD=${PYBIND11_CPP_STANDARD}, "
                      "please remove PYBIND11_CPP_STANDARD from your cache")
    endif()
  else()
    set(supported_standards 11 14 17 20)
    if("${VAL}" IN_LIST supported_standards)
      message(WARNING "USE -DCMAKE_CXX_STANDARD=${VAL} instead of PYBIND11_CPP_STANDARD")
      set(CMAKE_CXX_STANDARD
          ${VAL}
          CACHE STRING "From PYBIND11_CPP_STANDARD")
    else()
      message(FATAL_ERROR "PYBIND11_CPP_STANDARD should be replaced with CMAKE_CXX_STANDARD "
                          "(last two chars: ${VAL} not understood as a valid CXX std)")
    endif()
  endif()
endif()

# --------------------- Python specifics -------------------------

# CMake 3.27 removes the classic FindPythonInterp if CMP0148 is NEW
if(CMAKE_VERSION VERSION_LESS "3.27")
  set(_pybind11_missing_old_python "OLD")
else()
  cmake_policy(GET CMP0148 _pybind11_missing_old_python)
endif()

# Check to see which Python mode we are in, new, old, or no python
if(PYBIND11_NOPYTHON)
  set(_pybind11_nopython ON)
  # We won't use new FindPython if PYBIND11_FINDPYTHON is defined and falselike
  # Otherwise, we use it if FindPythonLibs is missing or if FindPython was already used
elseif(
  (NOT DEFINED PYBIND11_FINDPYTHON OR PYBIND11_FINDPYTHON)
  AND (_pybind11_missing_old_python STREQUAL "NEW"
       OR PYBIND11_FINDPYTHON
       OR Python_FOUND
       OR Python3_FOUND
      ))

  # New mode
  include("${CMAKE_CURRENT_LIST_DIR}/pybind11NewTools.cmake")

else()

  # Classic mode
  include("${CMAKE_CURRENT_LIST_DIR}/pybind11Tools.cmake")

endif()

# --------------------- pybind11_find_import -------------------------------

if(NOT _pybind11_nopython AND NOT _PYBIND11_CROSSCOMPILING)
  # Check to see if modules are importable. Use REQUIRED to force an error if
  # one of the modules is not found. <PYPI_NAME>_FOUND will be set if the
  # package was found (underscores replace dashes if present). QUIET will hide
  # the found message, and VERSION will require a minimum version. A successful
  # find will cache the result.
  #
  # Arguments:
  #   PYPI_NAME       - distribution name as known to PyPI
  #   REQUIRED        - report problems with FATAL_ERROR instead of WARNING
  #   QUIET           - suppress the "Found ..." status message
  #   VERSION <min>   - minimum acceptable version
  #
  # Cache results (INTERNAL):
  #   <NORM_NAME>_FOUND   - YES on success, <NORM_NAME>-NOTFOUND if the
  #                         version query failed
  #   <NORM_NAME>_VERSION - the detected version, on success only
  function(pybind11_find_import PYPI_NAME)
    # CMake variables need underscores (PyPI doesn't care)
    string(REPLACE "-" "_" NORM_PYPI_NAME "${PYPI_NAME}")

    # Return if found previously
    if(${NORM_PYPI_NAME}_FOUND)
      return()
    endif()

    set(options "REQUIRED;QUIET")
    set(oneValueArgs "VERSION")
    cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "" ${ARGN})

    if(ARG_REQUIRED)
      set(status_level FATAL_ERROR)
    else()
      set(status_level WARNING)
    endif()

    # Ask the Python interpreter for the installed version; a non-zero exit
    # status means the distribution could not be queried.
    execute_process(
      COMMAND
        ${${_Python}_EXECUTABLE} -c "
try:
    from importlib.metadata import version
except ImportError:
    from pkg_resources import get_distribution
    def version(s):
        return get_distribution(s).version
print(version('${PYPI_NAME}'))"
      RESULT_VARIABLE RESULT_PRESENT
      OUTPUT_VARIABLE PKG_VERSION
      ERROR_QUIET)

    string(STRIP "${PKG_VERSION}" PKG_VERSION)

    # If a result is present, this failed
    if(RESULT_PRESENT)
      set(${NORM_PYPI_NAME}_FOUND
          ${NORM_PYPI_NAME}-NOTFOUND
          CACHE INTERNAL "")
      # Always warn or error
      message(
        ${status_level}
        "Missing: ${PYPI_NAME} ${ARG_VERSION}\nTry: ${${_Python}_EXECUTABLE} -m pip install ${PYPI_NAME}"
      )
    elseif(ARG_VERSION AND PKG_VERSION VERSION_LESS ARG_VERSION)
      # Installed, but too old: report it and do not cache, so a later call
      # re-checks after an upgrade.
      message(
        ${status_level}
        "Version incorrect: ${PYPI_NAME} ${PKG_VERSION} found, ${ARG_VERSION} required - try upgrading"
      )
    else()
      # We have successfully found a good version, cache to avoid calling again.
      set(${NORM_PYPI_NAME}_FOUND
          YES
          CACHE INTERNAL "")
      set(${NORM_PYPI_NAME}_VERSION
          ${PKG_VERSION}
          CACHE INTERNAL "")
      # Only announce "Found" for an acceptable version.
      if(NOT ARG_QUIET)
        message(STATUS "Found ${PYPI_NAME} ${PKG_VERSION}")
      endif()
    endif()
  endfunction()
endif()

# --------------------- LTO -------------------------------

include(CheckCXXCompilerFlag)

# Checks whether the given CXX/linker flags can compile and link a cxx file.
# cxxflags and linkerflags are lists of flags to use. The result variable is a
# unique variable name for each set of flags: the compilation result will be
# cached based on the result variable. If the flags work, the variables named
# by cxxflags_out/linkerflags_out are set to those flags in the caller's scope.
function(_pybind11_return_if_cxx_and_linker_flags_work result cxxflags linkerflags cxxflags_out
         linkerflags_out)
  # The linker flags are passed to the check through CMAKE_REQUIRED_LIBRARIES.
  # This assignment is local to the function scope.
  set(CMAKE_REQUIRED_LIBRARIES ${linkerflags})
  check_cxx_compiler_flag("${cxxflags}" ${result})
  if(${result})
    # Hand the working flags back to the caller under the requested names.
    set(${cxxflags_out}
        "${cxxflags}"
        PARENT_SCOPE)
    set(${linkerflags_out}
        "${linkerflags}"
        PARENT_SCOPE)
  endif()
endfunction()

# Attach link-time-optimization compile/link options to an existing target.
#
# Arguments:
#   target          - target that receives the INTERFACE options
#   prefer_thin_lto - when true, "-flto=thin" is attempted before "-flto"
#
# The flags are only applied outside of Debug/RelWithDebInfo configurations.
# Nothing is added for MinGW, or when no working flag combination is found.
function(_pybind11_generate_lto target prefer_thin_lto)
  if(MINGW)
    message(STATUS "${target} disabled (problems with undefined symbols for MinGW for now)")
    return()
  endif()

  if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
    set(cxx_append "")
    set(linker_append "")
    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT APPLE)
      # Clang Gold plugin does not support -Os; append -O3 to MinSizeRel builds to override it
      set(linker_append ";$<$<CONFIG:MinSizeRel>:-O3>")
    elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND NOT MINGW)
      set(cxx_append ";-fno-fat-lto-objects")
    endif()

    if(prefer_thin_lto)
      set(thin "=thin")
    else()
      set(thin "")
    endif()

    # Set when the flags are assumed rather than probed, so that the fallback
    # probe below is not run on top of them.
    set(lto_flags_preset FALSE)

    if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le" OR CMAKE_SYSTEM_PROCESSOR MATCHES "mips64")
      # Do nothing
    elseif(CMAKE_SYSTEM_NAME MATCHES Emscripten)
      # This compile is very costly when cross-compiling, so set this without checking
      set(PYBIND11_LTO_CXX_FLAGS "-flto${thin}${cxx_append}")
      set(PYBIND11_LTO_LINKER_FLAGS "-flto${thin}${linker_append}")
      set(lto_flags_preset TRUE)
    elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
      _pybind11_return_if_cxx_and_linker_flags_work(
        HAS_FLTO_THIN "-flto${thin}${cxx_append}" "-flto${thin}${linker_append}"
        PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
    endif()

    # Fall back to plain -flto unless the first probe succeeded or the flags
    # were preset without a probe.
    if(NOT HAS_FLTO_THIN AND NOT lto_flags_preset)
      _pybind11_return_if_cxx_and_linker_flags_work(
        HAS_FLTO "-flto${cxx_append}" "-flto${linker_append}" PYBIND11_LTO_CXX_FLAGS
        PYBIND11_LTO_LINKER_FLAGS)
    endif()
  elseif(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
    # IntelLLVM equivalent to LTO is called IPO; also IntelLLVM is WIN32/UNIX
    # WARNING/HELP WANTED: This block of code is currently not covered by pybind11 GitHub Actions!
    if(WIN32)
      _pybind11_return_if_cxx_and_linker_flags_work(
        HAS_INTEL_IPO "-Qipo" "-Qipo" PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
    else()
      _pybind11_return_if_cxx_and_linker_flags_work(
        HAS_INTEL_IPO "-ipo" "-ipo" PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
    endif()
  elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
    # Intel equivalent to LTO is called IPO
    _pybind11_return_if_cxx_and_linker_flags_work(HAS_INTEL_IPO "-ipo" "-ipo"
                                                  PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
  elseif(MSVC)
    # cmake only interprets libraries as linker flags when they start with a - (otherwise it
    # converts /LTCG to \LTCG as if it was a Windows path).  Luckily MSVC supports passing flags
    # with - instead of /, even if it is a bit non-standard:
    _pybind11_return_if_cxx_and_linker_flags_work(HAS_MSVC_GL_LTCG "/GL" "-LTCG"
                                                  PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
  endif()

  # Generator expressions shared by the compile and link options below.
  # CONFIG takes multiple values in CMake 3.19+, until then we have to use OR
  set(is_debug "$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>")
  set(not_debug "$<NOT:${is_debug}>")
  set(cxx_lang "$<COMPILE_LANGUAGE:CXX>")

  # Enable LTO flags if found, except for Debug builds
  if(PYBIND11_LTO_CXX_FLAGS)
    if(MSVC AND CMAKE_VERSION VERSION_LESS 3.11)
      set(genex "${not_debug}")
    else()
      set(genex "$<AND:${not_debug},${cxx_lang}>")
    endif()
    set_property(
      TARGET ${target}
      APPEND
      PROPERTY INTERFACE_COMPILE_OPTIONS "$<${genex}:${PYBIND11_LTO_CXX_FLAGS}>")
    if(CMAKE_PROJECT_NAME STREQUAL "pybind11")
      message(STATUS "${target} enabled")
    endif()
  else()
    if(CMAKE_PROJECT_NAME STREQUAL "pybind11")
      message(STATUS "${target} disabled (not supported by the compiler and/or linker)")
    endif()
  endif()

  if(PYBIND11_LTO_LINKER_FLAGS)
    if(CMAKE_VERSION VERSION_LESS 3.11)
      set_property(
        TARGET ${target}
        APPEND
        PROPERTY INTERFACE_LINK_LIBRARIES "$<${not_debug}:${PYBIND11_LTO_LINKER_FLAGS}>")
    else()
      set_property(
        TARGET ${target}
        APPEND
        PROPERTY INTERFACE_LINK_OPTIONS "$<${not_debug}:${PYBIND11_LTO_LINKER_FLAGS}>")
    endif()
  endif()
endfunction()

# The LTO helper targets are only created when the project has not defined
# CMAKE_INTERPROCEDURAL_OPTIMIZATION.
if(NOT DEFINED CMAKE_INTERPROCEDURAL_OPTIMIZATION)
  add_library(pybind11::lto IMPORTED INTERFACE ${optional_global})
  _pybind11_generate_lto(pybind11::lto FALSE)

  add_library(pybind11::thin_lto IMPORTED INTERFACE ${optional_global})
  _pybind11_generate_lto(pybind11::thin_lto TRUE)
endif()

# ---------------------- pybind11_strip -----------------------------

# Add a POST_BUILD step that runs CMAKE_STRIP on the file built for
# target_name. Does nothing when CMAKE_STRIP is not set.
function(pybind11_strip target_name)
  # Strip unnecessary sections of the binary on Linux/macOS
  if(CMAKE_STRIP)
    # Start from an empty value so a same-named variable in the caller's
    # scope cannot end up on the strip command line.
    set(x_opt "")
    if(APPLE)
      set(x_opt -x)
    endif()

    add_custom_command(
      TARGET ${target_name}
      POST_BUILD
      COMMAND ${CMAKE_STRIP} ${x_opt} $<TARGET_FILE:${target_name}>
      VERBATIM)
  endif()
endfunction()