Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/builtin_types.h +64 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h +588 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/common_functions.h +65 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h +1730 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h +95 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h +212 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h +693 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/memcpy_async.h +62 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/reduce.h +63 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/scan.h +63 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h +348 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda.h +0 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGL.h +659 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGLTypedefs.h +96 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaGL.h +608 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaGLTypedefs.h +123 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaTypedefs.h +0 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAUTypedefs.h +90 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h +280 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h +365 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h +109 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.h +0 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp +0 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_device_runtime_api.h +889 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_egl_interop.h +642 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.h +0 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.hpp +0 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.h +367 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.hpp +1750 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_gl_interop.h +514 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h +1958 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h +224 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_helpers.h +373 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_primitives.h +148 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h +2374 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime_api.h +0 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_surface_types.h +76 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h +76 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_vdpau_interop.h +201 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudart_platform.h +57 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.h +193 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp +254 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h +65 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_functions.h +65 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h +118 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_types.h +81 -0
- .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h +145 -0
.gitattributes
CHANGED
|
@@ -119,3 +119,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
|
|
| 119 |
.venv/lib/python3.11/site-packages/opencv_python_headless.libs/libgfortran-91cc3cb1.so.3.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 120 |
.venv/lib/python3.11/site-packages/click/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 121 |
.venv/lib/python3.11/site-packages/pyasn1/type/__pycache__/univ.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 119 |
.venv/lib/python3.11/site-packages/opencv_python_headless.libs/libgfortran-91cc3cb1.so.3.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 120 |
.venv/lib/python3.11/site-packages/click/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 121 |
.venv/lib/python3.11/site-packages/pyasn1/type/__pycache__/univ.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 122 |
+
.venv/lib/python3.11/site-packages/opencv_python_headless.libs/libvpx-9f572e11.so.9.1.0 filter=lfs diff=lfs merge=lfs -text
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (200 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/builtin_types.h
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/*******************************************************************************
|
| 51 |
+
* *
|
| 52 |
+
* *
|
| 53 |
+
* *
|
| 54 |
+
*******************************************************************************/
|
| 55 |
+
|
| 56 |
+
#include "device_types.h"
|
| 57 |
+
#if !defined(__CUDACC_RTC__)
|
| 58 |
+
#define EXCLUDE_FROM_RTC
|
| 59 |
+
#include "driver_types.h"
|
| 60 |
+
#undef EXCLUDE_FROM_RTC
|
| 61 |
+
#endif /* !__CUDACC_RTC__ */
|
| 62 |
+
#include "surface_types.h"
|
| 63 |
+
#include "texture_types.h"
|
| 64 |
+
#include "vector_types.h"
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h
ADDED
|
@@ -0,0 +1,588 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CHANNEL_DESCRIPTOR_H__)
|
| 51 |
+
#define __CHANNEL_DESCRIPTOR_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__cplusplus)
|
| 54 |
+
|
| 55 |
+
/*******************************************************************************
|
| 56 |
+
* *
|
| 57 |
+
* *
|
| 58 |
+
* *
|
| 59 |
+
*******************************************************************************/
|
| 60 |
+
|
| 61 |
+
#include "cuda_runtime_api.h"
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
/**
|
| 70 |
+
* \addtogroup CUDART_HIGHLEVEL
|
| 71 |
+
*
|
| 72 |
+
* @{
|
| 73 |
+
*/
|
| 74 |
+
|
| 75 |
+
/**
|
| 76 |
+
* \brief \hl Returns a channel descriptor using the specified format
|
| 77 |
+
*
|
| 78 |
+
* Returns a channel descriptor with format \p f and number of bits of each
|
| 79 |
+
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
|
| 80 |
+
* defined as:
|
| 81 |
+
* \code
|
| 82 |
+
struct cudaChannelFormatDesc {
|
| 83 |
+
int x, y, z, w;
|
| 84 |
+
enum cudaChannelFormatKind f;
|
| 85 |
+
};
|
| 86 |
+
* \endcode
|
| 87 |
+
*
|
| 88 |
+
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
|
| 89 |
+
* ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat,
|
| 90 |
+
* ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
|
| 91 |
+
* ::cudaChannelFormatKindSignedNormalized8X4,
|
| 92 |
+
* ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
|
| 93 |
+
* ::cudaChannelFormatKindUnsignedNormalized8X4,
|
| 94 |
+
* ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
|
| 95 |
+
* ::cudaChannelFormatKindSignedNormalized16X4,
|
| 96 |
+
* ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
|
| 97 |
+
* ::cudaChannelFormatKindUnsignedNormalized16X4
|
| 98 |
+
* or ::cudaChannelFormatKindNV12.
|
| 99 |
+
*
|
| 100 |
+
* The format is specified by the template specialization.
|
| 101 |
+
*
|
| 102 |
+
* The template function specializes for the following scalar types:
|
| 103 |
+
* char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
|
| 104 |
+
* The template function specializes for the following vector types:
|
| 105 |
+
* char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}.
|
| 106 |
+
* The template function specializes for following cudaChannelFormatKind enum values:
|
| 107 |
+
* ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12.
|
| 108 |
+
*
|
| 109 |
+
* Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
|
| 110 |
+
*
|
| 111 |
+
* \return
|
| 112 |
+
* Channel descriptor with format \p f
|
| 113 |
+
*
|
| 114 |
+
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
|
| 115 |
+
* ::cudaGetChannelDesc,
|
| 116 |
+
*/
|
| 117 |
+
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
|
| 118 |
+
{
|
| 119 |
+
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
|
| 123 |
+
{
|
| 124 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 125 |
+
|
| 126 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
|
| 130 |
+
{
|
| 131 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 132 |
+
|
| 133 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
|
| 137 |
+
{
|
| 138 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 139 |
+
|
| 140 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
|
| 144 |
+
{
|
| 145 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 146 |
+
|
| 147 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
|
| 151 |
+
{
|
| 152 |
+
int e = (int)sizeof(char) * 8;
|
| 153 |
+
|
| 154 |
+
#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
|
| 155 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 156 |
+
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
|
| 157 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 158 |
+
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
|
| 162 |
+
{
|
| 163 |
+
int e = (int)sizeof(signed char) * 8;
|
| 164 |
+
|
| 165 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
|
| 169 |
+
{
|
| 170 |
+
int e = (int)sizeof(unsigned char) * 8;
|
| 171 |
+
|
| 172 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
|
| 176 |
+
{
|
| 177 |
+
int e = (int)sizeof(signed char) * 8;
|
| 178 |
+
|
| 179 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
|
| 183 |
+
{
|
| 184 |
+
int e = (int)sizeof(unsigned char) * 8;
|
| 185 |
+
|
| 186 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
|
| 190 |
+
{
|
| 191 |
+
int e = (int)sizeof(signed char) * 8;
|
| 192 |
+
|
| 193 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
|
| 197 |
+
{
|
| 198 |
+
int e = (int)sizeof(unsigned char) * 8;
|
| 199 |
+
|
| 200 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
|
| 204 |
+
{
|
| 205 |
+
int e = (int)sizeof(signed char) * 8;
|
| 206 |
+
|
| 207 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
|
| 211 |
+
{
|
| 212 |
+
int e = (int)sizeof(unsigned char) * 8;
|
| 213 |
+
|
| 214 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
|
| 218 |
+
{
|
| 219 |
+
int e = (int)sizeof(short) * 8;
|
| 220 |
+
|
| 221 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
|
| 225 |
+
{
|
| 226 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 227 |
+
|
| 228 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
|
| 232 |
+
{
|
| 233 |
+
int e = (int)sizeof(short) * 8;
|
| 234 |
+
|
| 235 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
|
| 239 |
+
{
|
| 240 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 241 |
+
|
| 242 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
|
| 246 |
+
{
|
| 247 |
+
int e = (int)sizeof(short) * 8;
|
| 248 |
+
|
| 249 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
|
| 253 |
+
{
|
| 254 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 255 |
+
|
| 256 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
|
| 260 |
+
{
|
| 261 |
+
int e = (int)sizeof(short) * 8;
|
| 262 |
+
|
| 263 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
|
| 267 |
+
{
|
| 268 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 269 |
+
|
| 270 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
|
| 274 |
+
{
|
| 275 |
+
int e = (int)sizeof(int) * 8;
|
| 276 |
+
|
| 277 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
|
| 281 |
+
{
|
| 282 |
+
int e = (int)sizeof(unsigned int) * 8;
|
| 283 |
+
|
| 284 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
|
| 288 |
+
{
|
| 289 |
+
int e = (int)sizeof(int) * 8;
|
| 290 |
+
|
| 291 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
|
| 295 |
+
{
|
| 296 |
+
int e = (int)sizeof(unsigned int) * 8;
|
| 297 |
+
|
| 298 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
|
| 302 |
+
{
|
| 303 |
+
int e = (int)sizeof(int) * 8;
|
| 304 |
+
|
| 305 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
|
| 309 |
+
{
|
| 310 |
+
int e = (int)sizeof(unsigned int) * 8;
|
| 311 |
+
|
| 312 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
|
| 316 |
+
{
|
| 317 |
+
int e = (int)sizeof(int) * 8;
|
| 318 |
+
|
| 319 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
|
| 323 |
+
{
|
| 324 |
+
int e = (int)sizeof(unsigned int) * 8;
|
| 325 |
+
|
| 326 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 327 |
+
}
|
| 328 |
+
|
| 329 |
+
#if !defined(__LP64__)
|
| 330 |
+
|
| 331 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
|
| 332 |
+
{
|
| 333 |
+
int e = (int)sizeof(long) * 8;
|
| 334 |
+
|
| 335 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
|
| 339 |
+
{
|
| 340 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 341 |
+
|
| 342 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
|
| 346 |
+
{
|
| 347 |
+
int e = (int)sizeof(long) * 8;
|
| 348 |
+
|
| 349 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
|
| 353 |
+
{
|
| 354 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 355 |
+
|
| 356 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
|
| 360 |
+
{
|
| 361 |
+
int e = (int)sizeof(long) * 8;
|
| 362 |
+
|
| 363 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
|
| 367 |
+
{
|
| 368 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 369 |
+
|
| 370 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
| 371 |
+
}
|
| 372 |
+
|
| 373 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
|
| 374 |
+
{
|
| 375 |
+
int e = (int)sizeof(long) * 8;
|
| 376 |
+
|
| 377 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
|
| 381 |
+
{
|
| 382 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 383 |
+
|
| 384 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
#endif /* !__LP64__ */
|
| 388 |
+
|
| 389 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
|
| 390 |
+
{
|
| 391 |
+
int e = (int)sizeof(float) * 8;
|
| 392 |
+
|
| 393 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
|
| 397 |
+
{
|
| 398 |
+
int e = (int)sizeof(float) * 8;
|
| 399 |
+
|
| 400 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 401 |
+
}
|
| 402 |
+
|
| 403 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
|
| 404 |
+
{
|
| 405 |
+
int e = (int)sizeof(float) * 8;
|
| 406 |
+
|
| 407 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
/* float4 specialization: the bit width of `float` is passed for all four size
 * arguments and the kind is Float. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
{
  const int floatBits = static_cast<int>(sizeof(float)) * 8;

  return cudaCreateChannelDesc(floatBits, floatBits, floatBits, floatBits, cudaChannelFormatKindFloat);
}
|
| 416 |
+
|
| 417 |
+
/* Builds the NV12 descriptor: the bit width of `char` is passed as the first
 * three size arguments, the fourth is 0, and the kind is NV12. */
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
{
  const int charBits = static_cast<int>(sizeof(char)) * 8;

  return cudaCreateChannelDesc(charBits, charBits, charBits, 0, cudaChannelFormatKindNV12);
}
|
| 423 |
+
|
| 424 |
+
/* Primary template for the kind-parameterised overload: any kind without an
 * explicit specialization yields all-zero sizes with kind None. */
template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
  const cudaChannelFormatKind fallbackKind = cudaChannelFormatKindNone;

  return cudaCreateChannelDesc(0, 0, 0, 0, fallbackKind);
}
|
| 428 |
+
|
| 429 |
+
/* Signed 8-bit normalized integer formats */
|
| 430 |
+
/* SignedNormalized8X1: 8 is passed as the first size argument, the other
 * three are 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindSignedNormalized8X1;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, 0, 0, 0, kind);
}
|
| 434 |
+
|
| 435 |
+
/* SignedNormalized8X2: 8 is passed as the first two size arguments, the last
 * two are 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindSignedNormalized8X2;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, 0, 0, kind);
}
|
| 439 |
+
|
| 440 |
+
/* SignedNormalized8X4: 8 is passed for all four size arguments and the kind
 * is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindSignedNormalized8X4;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, bits, bits, kind);
}
|
| 444 |
+
|
| 445 |
+
/* Unsigned 8-bit normalized integer formats */
|
| 446 |
+
/* UnsignedNormalized8X1: 8 is passed as the first size argument, the other
 * three are 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedNormalized8X1;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, 0, 0, 0, kind);
}
|
| 450 |
+
|
| 451 |
+
/* UnsignedNormalized8X2: 8 is passed as the first two size arguments, the
 * last two are 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedNormalized8X2;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, 0, 0, kind);
}
|
| 455 |
+
|
| 456 |
+
/* UnsignedNormalized8X4: 8 is passed for all four size arguments and the
 * kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedNormalized8X4;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, bits, bits, kind);
}
|
| 460 |
+
|
| 461 |
+
/* Signed 16-bit normalized integer formats */
|
| 462 |
+
/* SignedNormalized16X1: 16 is passed as the first size argument, the other
 * three are 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindSignedNormalized16X1;
  const int bits = 16;

  return cudaCreateChannelDesc(bits, 0, 0, 0, kind);
}
|
| 466 |
+
|
| 467 |
+
/* SignedNormalized16X2: 16 is passed as the first two size arguments, the
 * last two are 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindSignedNormalized16X2;
  const int bits = 16;

  return cudaCreateChannelDesc(bits, bits, 0, 0, kind);
}
|
| 471 |
+
|
| 472 |
+
/* SignedNormalized16X4: 16 is passed for all four size arguments and the
 * kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindSignedNormalized16X4;
  const int bits = 16;

  return cudaCreateChannelDesc(bits, bits, bits, bits, kind);
}
|
| 476 |
+
|
| 477 |
+
/* Unsigned 16-bit normalized integer formats */
|
| 478 |
+
/* UnsignedNormalized16X1: 16 is passed as the first size argument, the other
 * three are 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedNormalized16X1;
  const int bits = 16;

  return cudaCreateChannelDesc(bits, 0, 0, 0, kind);
}
|
| 482 |
+
|
| 483 |
+
/* UnsignedNormalized16X2: 16 is passed as the first two size arguments, the
 * last two are 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedNormalized16X2;
  const int bits = 16;

  return cudaCreateChannelDesc(bits, bits, 0, 0, kind);
}
|
| 487 |
+
|
| 488 |
+
/* UnsignedNormalized16X4: 16 is passed for all four size arguments and the
 * kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedNormalized16X4;
  const int bits = 16;

  return cudaCreateChannelDesc(bits, bits, bits, bits, kind);
}
|
| 492 |
+
|
| 493 |
+
/* NV12 format */
|
| 494 |
+
/* NV12 kind specialization: 8 is passed as the first three size arguments,
 * the fourth is 0, and the kind is NV12. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindNV12;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, bits, 0, kind);
}
|
| 498 |
+
|
| 499 |
+
/* BC1 format */
|
| 500 |
+
/* UnsignedBlockCompressed1: 8 is passed for all four size arguments and the
 * kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedBlockCompressed1;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, bits, bits, kind);
}
|
| 504 |
+
|
| 505 |
+
/* BC1sRGB format */
|
| 506 |
+
/* UnsignedBlockCompressed1SRGB: 8 is passed for all four size arguments and
 * the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedBlockCompressed1SRGB;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, bits, bits, kind);
}
|
| 510 |
+
|
| 511 |
+
/* BC2 format */
|
| 512 |
+
/* UnsignedBlockCompressed2: 8 is passed for all four size arguments and the
 * kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedBlockCompressed2;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, bits, bits, kind);
}
|
| 516 |
+
|
| 517 |
+
/* BC2sRGB format */
|
| 518 |
+
/* UnsignedBlockCompressed2SRGB: 8 is passed for all four size arguments and
 * the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedBlockCompressed2SRGB;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, bits, bits, kind);
}
|
| 522 |
+
|
| 523 |
+
/* BC3 format */
|
| 524 |
+
/* UnsignedBlockCompressed3: 8 is passed for all four size arguments and the
 * kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedBlockCompressed3;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, bits, bits, kind);
}
|
| 528 |
+
|
| 529 |
+
/* BC3sRGB format */
|
| 530 |
+
/* UnsignedBlockCompressed3SRGB: 8 is passed for all four size arguments and
 * the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedBlockCompressed3SRGB;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, bits, bits, kind);
}
|
| 534 |
+
|
| 535 |
+
/* BC4 unsigned format */
|
| 536 |
+
/* UnsignedBlockCompressed4: 8 is passed as the first size argument, the other
 * three are 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedBlockCompressed4;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, 0, 0, 0, kind);
}
|
| 540 |
+
|
| 541 |
+
/* BC4 signed format */
|
| 542 |
+
/* SignedBlockCompressed4: 8 is passed as the first size argument, the other
 * three are 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindSignedBlockCompressed4;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, 0, 0, 0, kind);
}
|
| 546 |
+
|
| 547 |
+
/* BC5 unsigned format */
|
| 548 |
+
/* UnsignedBlockCompressed5: 8 is passed as the first two size arguments, the
 * last two are 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedBlockCompressed5;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, 0, 0, kind);
}
|
| 552 |
+
|
| 553 |
+
/* BC5 signed format */
|
| 554 |
+
/* SignedBlockCompressed5: 8 is passed as the first two size arguments, the
 * last two are 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindSignedBlockCompressed5;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, 0, 0, kind);
}
|
| 558 |
+
|
| 559 |
+
/* BC6H unsigned format */
|
| 560 |
+
/* UnsignedBlockCompressed6H: 16 is passed as the first three size arguments,
 * the fourth is 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedBlockCompressed6H;
  const int bits = 16;

  return cudaCreateChannelDesc(bits, bits, bits, 0, kind);
}
|
| 564 |
+
|
| 565 |
+
/* BC6H signed format */
|
| 566 |
+
/* SignedBlockCompressed6H: 16 is passed as the first three size arguments,
 * the fourth is 0, and the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindSignedBlockCompressed6H;
  const int bits = 16;

  return cudaCreateChannelDesc(bits, bits, bits, 0, kind);
}
|
| 570 |
+
|
| 571 |
+
/* BC7 format */
|
| 572 |
+
/* UnsignedBlockCompressed7: 8 is passed for all four size arguments and the
 * kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedBlockCompressed7;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, bits, bits, kind);
}
|
| 576 |
+
|
| 577 |
+
/* BC7sRGB format */
|
| 578 |
+
/* UnsignedBlockCompressed7SRGB: 8 is passed for all four size arguments and
 * the kind is forwarded unchanged. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
{
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsignedBlockCompressed7SRGB;
  const int bits = 8;

  return cudaCreateChannelDesc(bits, bits, bits, bits, kind);
}
|
| 582 |
+
|
| 583 |
+
#endif /* __cplusplus */
|
| 584 |
+
|
| 585 |
+
/** @} */
|
| 586 |
+
/** @} */ /* END CUDART_TEXTURE_HL */
|
| 587 |
+
|
| 588 |
+
#endif /* !__CHANNEL_DESCRIPTOR_H__ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/common_functions.h
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#include "crt/common_functions.h"
|
| 61 |
+
|
| 62 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__)
|
| 63 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 64 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
|
| 65 |
+
#endif
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h
ADDED
|
@@ -0,0 +1,1730 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _COOPERATIVE_GROUPS_H_
|
| 51 |
+
#define _COOPERATIVE_GROUPS_H_
|
| 52 |
+
|
| 53 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 54 |
+
|
| 55 |
+
#include "cooperative_groups/details/info.h"
|
| 56 |
+
#include "cooperative_groups/details/driver_abi.h"
|
| 57 |
+
#include "cooperative_groups/details/helpers.h"
|
| 58 |
+
#include "cooperative_groups/details/memory.h"
|
| 59 |
+
|
| 60 |
+
/* _CG_THREAD_SCOPE(scope): when _CG_HAS_STL_ATOMICS is defined, pulls in
 * <cuda/atomic> and expands to a static constant named `thread_scope` of type
 * cuda::thread_scope initialised to `scope`; otherwise it expands to nothing. */
#if defined(_CG_HAS_STL_ATOMICS)
|
| 61 |
+
#include <cuda/atomic>
|
| 62 |
+
#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
|
| 63 |
+
#else
|
| 64 |
+
#define _CG_THREAD_SCOPE(scope)
|
| 65 |
+
#endif
|
| 66 |
+
|
| 67 |
+
_CG_BEGIN_NAMESPACE
|
| 68 |
+
|
| 69 |
+
// Numeric type tags for the group kinds. thread_group_base<TyId> forwards its
// template argument to thread_group, which stores it in the 7-bit `type` field;
// multi_grid_group, for example, derives from thread_group_base<multi_grid_group_id>.
// NOTE(review): the other ids are presumably consumed the same way by group
// classes outside this chunk — confirm.
namespace details {
|
| 70 |
+
_CG_CONST_DECL unsigned int coalesced_group_id = 1;
|
| 71 |
+
_CG_CONST_DECL unsigned int multi_grid_group_id = 2;
|
| 72 |
+
_CG_CONST_DECL unsigned int grid_group_id = 3;
|
| 73 |
+
_CG_CONST_DECL unsigned int thread_block_id = 4;
|
| 74 |
+
_CG_CONST_DECL unsigned int multi_tile_group_id = 5;
|
| 75 |
+
_CG_CONST_DECL unsigned int cluster_group_id = 6;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
/**
|
| 79 |
+
* class thread_group;
|
| 80 |
+
*
|
| 81 |
+
* Generic thread group type, into which all groups are convertible.
|
| 82 |
+
* It acts as a container for all storage necessary for the derived groups,
|
| 83 |
+
* and will dispatch the API calls to the correct derived group. This means
|
| 84 |
+
* that all derived groups must implement the same interface as thread_group.
|
| 85 |
+
*/
|
| 86 |
+
class thread_group
|
| 87 |
+
{
|
| 88 |
+
protected:
|
| 89 |
+
struct group_data {
|
| 90 |
+
unsigned int _unused : 1;
|
| 91 |
+
unsigned int type : 7, : 0;
|
| 92 |
+
};
|
| 93 |
+
|
| 94 |
+
struct gg_data {
|
| 95 |
+
details::grid_workspace *gridWs;
|
| 96 |
+
};
|
| 97 |
+
|
| 98 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 99 |
+
struct mg_data {
|
| 100 |
+
unsigned long long _unused : 1;
|
| 101 |
+
unsigned long long type : 7;
|
| 102 |
+
unsigned long long handle : 56;
|
| 103 |
+
const details::multi_grid::multi_grid_functions *functions;
|
| 104 |
+
};
|
| 105 |
+
#endif
|
| 106 |
+
|
| 107 |
+
struct tg_data {
|
| 108 |
+
unsigned int is_tiled : 1;
|
| 109 |
+
unsigned int type : 7;
|
| 110 |
+
unsigned int size : 24;
|
| 111 |
+
// packed to 4b
|
| 112 |
+
unsigned int metaGroupSize : 16;
|
| 113 |
+
unsigned int metaGroupRank : 16;
|
| 114 |
+
// packed to 8b
|
| 115 |
+
unsigned int mask;
|
| 116 |
+
// packed to 12b
|
| 117 |
+
unsigned int _res;
|
| 118 |
+
};
|
| 119 |
+
|
| 120 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
|
| 121 |
+
friend class thread_block;
|
| 122 |
+
|
| 123 |
+
union __align__(8) {
|
| 124 |
+
group_data group;
|
| 125 |
+
tg_data coalesced;
|
| 126 |
+
gg_data grid;
|
| 127 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 128 |
+
mg_data multi_grid;
|
| 129 |
+
#endif
|
| 130 |
+
} _data;
|
| 131 |
+
|
| 132 |
+
_CG_QUALIFIER thread_group operator=(const thread_group& src);
|
| 133 |
+
|
| 134 |
+
_CG_QUALIFIER thread_group(unsigned int type) {
|
| 135 |
+
_data.group.type = type;
|
| 136 |
+
_data.group._unused = false;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
#ifdef _CG_CPP11_FEATURES
|
| 140 |
+
static_assert(sizeof(tg_data) <= 16, "Failed size check");
|
| 141 |
+
static_assert(sizeof(gg_data) <= 16, "Failed size check");
|
| 142 |
+
# ifdef _CG_ABI_EXPERIMENTAL
|
| 143 |
+
static_assert(sizeof(mg_data) <= 16, "Failed size check");
|
| 144 |
+
# endif
|
| 145 |
+
#endif
|
| 146 |
+
|
| 147 |
+
public:
|
| 148 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
|
| 149 |
+
|
| 150 |
+
_CG_QUALIFIER unsigned long long size() const;
|
| 151 |
+
_CG_QUALIFIER unsigned long long num_threads() const;
|
| 152 |
+
_CG_QUALIFIER unsigned long long thread_rank() const;
|
| 153 |
+
_CG_QUALIFIER void sync() const;
|
| 154 |
+
_CG_QUALIFIER unsigned int get_type() const {
|
| 155 |
+
return _data.group.type;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
};
|
| 159 |
+
|
| 160 |
+
template <unsigned int TyId>
|
| 161 |
+
struct thread_group_base : public thread_group {
|
| 162 |
+
_CG_QUALIFIER thread_group_base() : thread_group(TyId) {}
|
| 163 |
+
_CG_STATIC_CONST_DECL unsigned int id = TyId;
|
| 164 |
+
};
|
| 165 |
+
|
| 166 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP)
|
| 167 |
+
|
| 168 |
+
/**
|
| 169 |
+
* class multi_grid_group;
|
| 170 |
+
*
|
| 171 |
+
* Threads within this group are guaranteed to be co-resident on the
|
| 172 |
+
* same system, on multiple devices within the same launched kernels.
|
| 173 |
+
* To use this group, the kernel must have been launched with
|
| 174 |
+
* cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
|
| 175 |
+
* and the device must support it (queryable device attribute).
|
| 176 |
+
*
|
| 177 |
+
* Constructed via this_multi_grid();
|
| 178 |
+
*/
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 182 |
+
class multi_grid_group;
|
| 183 |
+
|
| 184 |
+
// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
|
| 185 |
+
template <typename = void>
|
| 186 |
+
__device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
|
| 187 |
+
|
| 188 |
+
class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
|
| 189 |
+
{
|
| 190 |
+
private:
|
| 191 |
+
template <typename = void>
|
| 192 |
+
_CG_QUALIFIER multi_grid_group() {
|
| 193 |
+
_data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
|
| 194 |
+
_data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
friend multi_grid_group this_multi_grid<void>();
|
| 198 |
+
|
| 199 |
+
public:
|
| 200 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
|
| 201 |
+
|
| 202 |
+
_CG_QUALIFIER bool is_valid() const {
|
| 203 |
+
return (_data.multi_grid.handle != 0);
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
_CG_QUALIFIER void sync() const {
|
| 207 |
+
if (!is_valid()) {
|
| 208 |
+
_CG_ABORT();
|
| 209 |
+
}
|
| 210 |
+
_data.multi_grid.functions->sync(_data.multi_grid.handle);
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
_CG_QUALIFIER unsigned long long num_threads() const {
|
| 214 |
+
_CG_ASSERT(is_valid());
|
| 215 |
+
return _data.multi_grid.functions->size(_data.multi_grid.handle);
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
_CG_QUALIFIER unsigned long long size() const {
|
| 219 |
+
return num_threads();
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
_CG_QUALIFIER unsigned long long thread_rank() const {
|
| 223 |
+
_CG_ASSERT(is_valid());
|
| 224 |
+
return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
_CG_QUALIFIER unsigned int grid_rank() const {
|
| 228 |
+
_CG_ASSERT(is_valid());
|
| 229 |
+
return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
_CG_QUALIFIER unsigned int num_grids() const {
|
| 233 |
+
_CG_ASSERT(is_valid());
|
| 234 |
+
return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
|
| 235 |
+
}
|
| 236 |
+
};
|
| 237 |
+
# else
|
| 238 |
+
class multi_grid_group
|
| 239 |
+
{
|
| 240 |
+
private:
|
| 241 |
+
unsigned long long _handle;
|
| 242 |
+
unsigned int _size;
|
| 243 |
+
unsigned int _rank;
|
| 244 |
+
|
| 245 |
+
friend _CG_QUALIFIER multi_grid_group this_multi_grid();
|
| 246 |
+
|
| 247 |
+
_CG_QUALIFIER multi_grid_group() {
|
| 248 |
+
_handle = details::multi_grid::get_intrinsic_handle();
|
| 249 |
+
_size = details::multi_grid::size(_handle);
|
| 250 |
+
_rank = details::multi_grid::thread_rank(_handle);
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
public:
|
| 254 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
|
| 255 |
+
|
| 256 |
+
_CG_QUALIFIER _CG_DEPRECATED bool is_valid() const {
|
| 257 |
+
return (_handle != 0);
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
_CG_QUALIFIER _CG_DEPRECATED void sync() const {
|
| 261 |
+
if (!is_valid()) {
|
| 262 |
+
_CG_ABORT();
|
| 263 |
+
}
|
| 264 |
+
details::multi_grid::sync(_handle);
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const {
|
| 268 |
+
_CG_ASSERT(is_valid());
|
| 269 |
+
return _size;
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const {
|
| 273 |
+
return num_threads();
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const {
|
| 277 |
+
_CG_ASSERT(is_valid());
|
| 278 |
+
return _rank;
|
| 279 |
+
}
|
| 280 |
+
|
| 281 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const {
|
| 282 |
+
_CG_ASSERT(is_valid());
|
| 283 |
+
return (details::multi_grid::grid_rank(_handle));
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const {
|
| 287 |
+
_CG_ASSERT(is_valid());
|
| 288 |
+
return (details::multi_grid::num_grids(_handle));
|
| 289 |
+
}
|
| 290 |
+
};
|
| 291 |
+
# endif
|
| 292 |
+
|
| 293 |
+
/**
|
| 294 |
+
* multi_grid_group this_multi_grid()
|
| 295 |
+
*
|
| 296 |
+
* Constructs a multi_grid_group
|
| 297 |
+
*/
|
| 298 |
+
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 299 |
+
template <typename>
|
| 300 |
+
__device__
|
| 301 |
+
#else
|
| 302 |
+
_CG_QUALIFIER
|
| 303 |
+
# endif
|
| 304 |
+
_CG_DEPRECATED
|
| 305 |
+
multi_grid_group this_multi_grid()
|
| 306 |
+
{
|
| 307 |
+
return multi_grid_group();
|
| 308 |
+
}
|
| 309 |
+
#endif
|
| 310 |
+
|
| 311 |
+
/**
|
| 312 |
+
* class grid_group;
|
| 313 |
+
*
|
| 314 |
+
* Threads within this group are guaranteed to be co-resident on the
|
| 315 |
+
* same device within the same launched kernel. To use this group, the kernel
|
| 316 |
+
* must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
|
| 317 |
+
* and the device must support it (queryable device attribute).
|
| 318 |
+
*
|
| 319 |
+
* Constructed via this_grid();
|
| 320 |
+
*/
|
| 321 |
+
class grid_group : public thread_group_base<details::grid_group_id>
|
| 322 |
+
{
|
| 323 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
|
| 324 |
+
friend _CG_QUALIFIER grid_group this_grid();
|
| 325 |
+
|
| 326 |
+
private:
|
| 327 |
+
_CG_QUALIFIER grid_group(details::grid_workspace *gridWs) {
|
| 328 |
+
_data.grid.gridWs = gridWs;
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
public:
|
| 332 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
|
| 333 |
+
|
| 334 |
+
_CG_QUALIFIER bool is_valid() const {
|
| 335 |
+
return (_data.grid.gridWs != NULL);
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
_CG_QUALIFIER void sync() const {
|
| 339 |
+
if (!is_valid()) {
|
| 340 |
+
_CG_ABORT();
|
| 341 |
+
}
|
| 342 |
+
details::grid::sync(&_data.grid.gridWs->barrier);
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 346 |
+
using arrival_token = unsigned int;
|
| 347 |
+
|
| 348 |
+
_CG_QUALIFIER arrival_token barrier_arrive() const {
|
| 349 |
+
if (!is_valid()) {
|
| 350 |
+
_CG_ABORT();
|
| 351 |
+
}
|
| 352 |
+
return details::grid::barrier_arrive(&_data.grid.gridWs->barrier);
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
_CG_QUALIFIER void barrier_wait(arrival_token&& token) const {
|
| 356 |
+
details::grid::barrier_wait(token, &_data.grid.gridWs->barrier);
|
| 357 |
+
}
|
| 358 |
+
#endif
|
| 359 |
+
|
| 360 |
+
_CG_STATIC_QUALIFIER unsigned long long size() {
|
| 361 |
+
return details::grid::size();
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
_CG_STATIC_QUALIFIER dim3 group_dim() {
|
| 365 |
+
return details::grid::grid_dim();
|
| 366 |
+
}
|
| 367 |
+
|
| 368 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads() {
|
| 369 |
+
return details::grid::dim_threads();
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
_CG_STATIC_QUALIFIER unsigned long long num_threads() {
|
| 373 |
+
return details::grid::num_threads();
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
_CG_STATIC_QUALIFIER dim3 thread_index() {
|
| 377 |
+
return details::grid::thread_index();
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
_CG_STATIC_QUALIFIER unsigned long long thread_rank() {
|
| 381 |
+
return details::grid::thread_rank();
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks() {
|
| 385 |
+
return details::grid::dim_blocks();
|
| 386 |
+
}
|
| 387 |
+
|
| 388 |
+
_CG_STATIC_QUALIFIER unsigned long long num_blocks() {
|
| 389 |
+
return details::grid::num_blocks();
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
_CG_STATIC_QUALIFIER dim3 block_index() {
|
| 393 |
+
return details::grid::block_index();
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
_CG_STATIC_QUALIFIER unsigned long long block_rank() {
|
| 397 |
+
return details::grid::block_rank();
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
# if defined(_CG_HAS_CLUSTER_GROUP)
|
| 401 |
+
_CG_STATIC_QUALIFIER dim3 dim_clusters() {
|
| 402 |
+
return details::grid::dim_clusters();
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
_CG_STATIC_QUALIFIER unsigned long long num_clusters() {
|
| 406 |
+
return details::grid::num_clusters();
|
| 407 |
+
}
|
| 408 |
+
|
| 409 |
+
_CG_STATIC_QUALIFIER dim3 cluster_index() {
|
| 410 |
+
return details::grid::cluster_index();
|
| 411 |
+
}
|
| 412 |
+
|
| 413 |
+
_CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
|
| 414 |
+
return details::grid::cluster_rank();
|
| 415 |
+
}
|
| 416 |
+
# endif
|
| 417 |
+
};
|
| 418 |
+
|
| 419 |
+
_CG_QUALIFIER grid_group this_grid() {
|
| 420 |
+
// Load a workspace from the driver
|
| 421 |
+
grid_group gg(details::get_grid_workspace());
|
| 422 |
+
#ifdef _CG_DEBUG
|
| 423 |
+
// *all* threads must be available to synchronize
|
| 424 |
+
gg.sync();
|
| 425 |
+
#endif // _CG_DEBUG
|
| 426 |
+
return gg;
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 430 |
+
/**
|
| 431 |
+
* class cluster_group
|
| 432 |
+
*
|
| 433 |
+
* Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
|
| 434 |
+
* divided along all dimensions to form groups of blocks, each group of which is
|
| 435 |
+
* a block cluster. Clustered grids are subject to various restrictions and
|
| 436 |
+
* limitations. Primarily, a cluster consists of at most 8 blocks by default
|
| 437 |
+
* (although the user is allowed to opt in to non-standard sizes), and clustered
|
| 438 |
+
* grids are subject to additional occupancy limitations due to per-cluster
|
| 439 |
+
* hardware resource consumption. In exchange, a block cluster is guaranteed to
|
| 440 |
+
* be a cooperative group, with access to all cooperative group capabilities, as
|
| 441 |
+
* well as cluster specific capabilities and accelerations. A cluster_group
|
| 442 |
+
* represents a block cluster.
|
| 443 |
+
*
|
| 444 |
+
* Constructed via this_cluster();
|
| 445 |
+
*/
|
| 446 |
+
class cluster_group : public thread_group_base<details::cluster_group_id>
|
| 447 |
+
{
|
| 448 |
+
// Friends
|
| 449 |
+
friend _CG_QUALIFIER cluster_group this_cluster();
|
| 450 |
+
|
| 451 |
+
// Private constructor: instances are obtained via the friend function this_cluster()
|
| 452 |
+
_CG_QUALIFIER cluster_group()
|
| 453 |
+
{
|
| 454 |
+
}
|
| 455 |
+
|
| 456 |
+
public:
|
| 457 |
+
//_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
|
| 458 |
+
|
| 459 |
+
using arrival_token = struct {};
|
| 460 |
+
|
| 461 |
+
// Functionality exposed by the group
|
| 462 |
+
_CG_STATIC_QUALIFIER void sync()
|
| 463 |
+
{
|
| 464 |
+
return details::cluster::sync();
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
_CG_STATIC_QUALIFIER arrival_token barrier_arrive()
|
| 468 |
+
{
|
| 469 |
+
details::cluster::barrier_arrive();
|
| 470 |
+
return arrival_token();
|
| 471 |
+
}
|
| 472 |
+
|
| 473 |
+
_CG_STATIC_QUALIFIER void barrier_wait()
|
| 474 |
+
{
|
| 475 |
+
return details::cluster::barrier_wait();
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
_CG_STATIC_QUALIFIER void barrier_wait(arrival_token&&)
|
| 479 |
+
{
|
| 480 |
+
return details::cluster::barrier_wait();
|
| 481 |
+
}
|
| 482 |
+
|
| 483 |
+
_CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
|
| 484 |
+
{
|
| 485 |
+
return details::cluster::query_shared_rank(addr);
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
template <typename T>
|
| 489 |
+
_CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
|
| 490 |
+
{
|
| 491 |
+
return details::cluster::map_shared_rank(addr, rank);
|
| 492 |
+
}
|
| 493 |
+
|
| 494 |
+
_CG_STATIC_QUALIFIER dim3 block_index()
|
| 495 |
+
{
|
| 496 |
+
return details::cluster::block_index();
|
| 497 |
+
}
|
| 498 |
+
|
| 499 |
+
_CG_STATIC_QUALIFIER unsigned int block_rank()
|
| 500 |
+
{
|
| 501 |
+
return details::cluster::block_rank();
|
| 502 |
+
}
|
| 503 |
+
|
| 504 |
+
_CG_STATIC_QUALIFIER dim3 thread_index()
|
| 505 |
+
{
|
| 506 |
+
return details::cluster::thread_index();
|
| 507 |
+
}
|
| 508 |
+
|
| 509 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank()
|
| 510 |
+
{
|
| 511 |
+
return details::cluster::thread_rank();
|
| 512 |
+
}
|
| 513 |
+
|
| 514 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks()
|
| 515 |
+
{
|
| 516 |
+
return details::cluster::dim_blocks();
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
_CG_STATIC_QUALIFIER unsigned int num_blocks()
|
| 520 |
+
{
|
| 521 |
+
return details::cluster::num_blocks();
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads()
|
| 525 |
+
{
|
| 526 |
+
return details::cluster::dim_threads();
|
| 527 |
+
}
|
| 528 |
+
|
| 529 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads()
|
| 530 |
+
{
|
| 531 |
+
return details::cluster::num_threads();
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
+
// Legacy aliases
|
| 535 |
+
_CG_STATIC_QUALIFIER unsigned int size()
|
| 536 |
+
{
|
| 537 |
+
return num_threads();
|
| 538 |
+
}
|
| 539 |
+
};
|
| 540 |
+
|
| 541 |
+
/*
|
| 542 |
+
* cluster_group this_cluster()
|
| 543 |
+
*
|
| 544 |
+
* Constructs a cluster_group
|
| 545 |
+
*/
|
| 546 |
+
_CG_QUALIFIER cluster_group this_cluster()
|
| 547 |
+
{
|
| 548 |
+
cluster_group cg;
|
| 549 |
+
#ifdef _CG_DEBUG
|
| 550 |
+
cg.sync();
|
| 551 |
+
#endif
|
| 552 |
+
return cg;
|
| 553 |
+
}
|
| 554 |
+
#endif
|
| 555 |
+
|
| 556 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 557 |
+
class thread_block;
|
| 558 |
+
template <unsigned int MaxBlockSize>
|
| 559 |
+
_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
|
| 560 |
+
#endif
|
| 561 |
+
|
| 562 |
+
/**
|
| 563 |
+
* class thread_block
|
| 564 |
+
*
|
| 565 |
+
* Every GPU kernel is executed by a grid of thread blocks, and threads within
|
| 566 |
+
* each block are guaranteed to reside on the same streaming multiprocessor.
|
| 567 |
+
* A thread_block represents a thread block whose dimensions are not known until runtime.
|
| 568 |
+
*
|
| 569 |
+
* Constructed via this_thread_block();
|
| 570 |
+
*/
|
| 571 |
+
class thread_block : public thread_group_base<details::thread_block_id>
|
| 572 |
+
{
|
| 573 |
+
// Friends
|
| 574 |
+
friend _CG_QUALIFIER thread_block this_thread_block();
|
| 575 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
|
| 576 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
|
| 577 |
+
|
| 578 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 579 |
+
template <unsigned int MaxBlockSize>
|
| 580 |
+
friend _CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
|
| 581 |
+
template <unsigned int Size>
|
| 582 |
+
friend class __static_size_multi_warp_tile_base;
|
| 583 |
+
|
| 584 |
+
details::multi_warp_scratch* const tile_memory;
|
| 585 |
+
|
| 586 |
+
template <unsigned int MaxBlockSize>
|
| 587 |
+
_CG_QUALIFIER thread_block(block_tile_memory<MaxBlockSize>& scratch) :
|
| 588 |
+
tile_memory(details::get_scratch_ptr(&scratch)) {
|
| 589 |
+
#ifdef _CG_DEBUG
|
| 590 |
+
if (num_threads() > MaxBlockSize) {
|
| 591 |
+
details::abort();
|
| 592 |
+
}
|
| 593 |
+
#endif
|
| 594 |
+
#if !defined(_CG_HAS_RESERVED_SHARED)
|
| 595 |
+
tile_memory->init_barriers(thread_rank());
|
| 596 |
+
sync();
|
| 597 |
+
#endif
|
| 598 |
+
}
|
| 599 |
+
#endif
|
| 600 |
+
|
| 601 |
+
// Private default constructor: instances are obtained via the friend function this_thread_block()
|
| 602 |
+
_CG_QUALIFIER thread_block()
|
| 603 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 604 |
+
: tile_memory(details::get_scratch_ptr(NULL))
|
| 605 |
+
#endif
|
| 606 |
+
{ }
|
| 607 |
+
|
| 608 |
+
// Internal Use
|
| 609 |
+
_CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
|
| 610 |
+
const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
|
| 611 |
+
|
| 612 |
+
// Invalid, immediately fail
|
| 613 |
+
if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
|
| 614 |
+
details::abort();
|
| 615 |
+
return (thread_block());
|
| 616 |
+
}
|
| 617 |
+
|
| 618 |
+
unsigned int mask;
|
| 619 |
+
unsigned int base_offset = thread_rank() & (~(tilesz - 1));
|
| 620 |
+
unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
|
| 621 |
+
|
| 622 |
+
mask = (unsigned int)(-1) >> (32 - masklength);
|
| 623 |
+
mask <<= (details::laneid() & ~(tilesz - 1));
|
| 624 |
+
thread_group tile = thread_group(details::coalesced_group_id);
|
| 625 |
+
tile._data.coalesced.mask = mask;
|
| 626 |
+
tile._data.coalesced.size = __popc(mask);
|
| 627 |
+
tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz;
|
| 628 |
+
tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
|
| 629 |
+
tile._data.coalesced.is_tiled = true;
|
| 630 |
+
return (tile);
|
| 631 |
+
}
|
| 632 |
+
|
| 633 |
+
public:
|
| 634 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
|
| 635 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
|
| 636 |
+
|
| 637 |
+
_CG_STATIC_QUALIFIER void sync() {
|
| 638 |
+
details::cta::sync();
|
| 639 |
+
}
|
| 640 |
+
|
| 641 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 642 |
+
struct arrival_token {};
|
| 643 |
+
|
| 644 |
+
_CG_QUALIFIER arrival_token barrier_arrive() const {
|
| 645 |
+
return arrival_token();
|
| 646 |
+
}
|
| 647 |
+
|
| 648 |
+
_CG_QUALIFIER void barrier_wait(arrival_token&&) const {
|
| 649 |
+
details::cta::sync();
|
| 650 |
+
}
|
| 651 |
+
#endif
|
| 652 |
+
|
| 653 |
+
_CG_STATIC_QUALIFIER unsigned int size() {
|
| 654 |
+
return details::cta::size();
|
| 655 |
+
}
|
| 656 |
+
|
| 657 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank() {
|
| 658 |
+
return details::cta::thread_rank();
|
| 659 |
+
}
|
| 660 |
+
|
| 661 |
+
// Additional functionality exposed by the group
|
| 662 |
+
_CG_STATIC_QUALIFIER dim3 group_index() {
|
| 663 |
+
return details::cta::group_index();
|
| 664 |
+
}
|
| 665 |
+
|
| 666 |
+
_CG_STATIC_QUALIFIER dim3 thread_index() {
|
| 667 |
+
return details::cta::thread_index();
|
| 668 |
+
}
|
| 669 |
+
|
| 670 |
+
_CG_STATIC_QUALIFIER dim3 group_dim() {
|
| 671 |
+
return details::cta::block_dim();
|
| 672 |
+
}
|
| 673 |
+
|
| 674 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads() {
|
| 675 |
+
return details::cta::dim_threads();
|
| 676 |
+
}
|
| 677 |
+
|
| 678 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads() {
|
| 679 |
+
return details::cta::num_threads();
|
| 680 |
+
}
|
| 681 |
+
|
| 682 |
+
};
|
| 683 |
+
|
| 684 |
+
/**
|
| 685 |
+
* thread_block this_thread_block()
|
| 686 |
+
*
|
| 687 |
+
* Constructs a thread_block group
|
| 688 |
+
*/
|
| 689 |
+
_CG_QUALIFIER thread_block this_thread_block()
|
| 690 |
+
{
|
| 691 |
+
return (thread_block());
|
| 692 |
+
}
|
| 693 |
+
|
| 694 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 695 |
+
template <unsigned int MaxBlockSize>
|
| 696 |
+
_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch) {
|
| 697 |
+
return (thread_block(scratch));
|
| 698 |
+
}
|
| 699 |
+
#endif
|
| 700 |
+
|
| 701 |
+
/**
|
| 702 |
+
* class coalesced_group
|
| 703 |
+
*
|
| 704 |
+
* A group representing the current set of converged threads in a warp.
|
| 705 |
+
* The size of the group is not guaranteed and it may return a group of
|
| 706 |
+
* only one thread (itself).
|
| 707 |
+
*
|
| 708 |
+
* This group exposes warp-synchronous builtins.
|
| 709 |
+
* Constructed via coalesced_threads();
|
| 710 |
+
*/
|
| 711 |
+
class coalesced_group : public thread_group_base<details::coalesced_group_id>
|
| 712 |
+
{
|
| 713 |
+
private:
|
| 714 |
+
friend _CG_QUALIFIER coalesced_group coalesced_threads();
|
| 715 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
|
| 716 |
+
friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
|
| 717 |
+
friend class details::_coalesced_group_data_access;
|
| 718 |
+
|
| 719 |
+
_CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
|
| 720 |
+
unsigned int member_pack = 0;
|
| 721 |
+
unsigned int member_rank = 0;
|
| 722 |
+
for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
|
| 723 |
+
unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
|
| 724 |
+
if (lane_bit) {
|
| 725 |
+
if (laneMask & lane_bit)
|
| 726 |
+
member_pack |= 1 << member_rank;
|
| 727 |
+
member_rank++;
|
| 728 |
+
}
|
| 729 |
+
}
|
| 730 |
+
return (member_pack);
|
| 731 |
+
}
|
| 732 |
+
|
| 733 |
+
// Internal Use
|
| 734 |
+
_CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
|
| 735 |
+
const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
|
| 736 |
+
|
| 737 |
+
// Invalid, immediately fail
|
| 738 |
+
if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
|
| 739 |
+
details::abort();
|
| 740 |
+
return (coalesced_group(0));
|
| 741 |
+
}
|
| 742 |
+
if (size() <= tilesz) {
|
| 743 |
+
return (*this);
|
| 744 |
+
}
|
| 745 |
+
|
| 746 |
+
if ((_data.coalesced.is_tiled == true) && pow2_tilesz) {
|
| 747 |
+
unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
|
| 748 |
+
unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
|
| 749 |
+
unsigned int mask = (unsigned int)(-1) >> (32 - masklength);
|
| 750 |
+
|
| 751 |
+
mask <<= (details::laneid() & ~(tilesz - 1));
|
| 752 |
+
coalesced_group coalesced_tile = coalesced_group(mask);
|
| 753 |
+
coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz;
|
| 754 |
+
coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
|
| 755 |
+
coalesced_tile._data.coalesced.is_tiled = true;
|
| 756 |
+
return (coalesced_tile);
|
| 757 |
+
}
|
| 758 |
+
else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) {
|
| 759 |
+
unsigned int mask = 0;
|
| 760 |
+
unsigned int member_rank = 0;
|
| 761 |
+
int seen_lanes = (thread_rank() / tilesz) * tilesz;
|
| 762 |
+
for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
|
| 763 |
+
unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
|
| 764 |
+
if (lane_bit) {
|
| 765 |
+
if (seen_lanes <= 0 && member_rank < tilesz) {
|
| 766 |
+
mask |= lane_bit;
|
| 767 |
+
member_rank++;
|
| 768 |
+
}
|
| 769 |
+
seen_lanes--;
|
| 770 |
+
}
|
| 771 |
+
}
|
| 772 |
+
coalesced_group coalesced_tile = coalesced_group(mask);
|
| 773 |
+
// Override parent with the size of this group
|
| 774 |
+
coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz;
|
| 775 |
+
coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
|
| 776 |
+
return coalesced_tile;
|
| 777 |
+
}
|
| 778 |
+
else {
|
| 779 |
+
// None in _CG_VERSION 1000
|
| 780 |
+
details::abort();
|
| 781 |
+
}
|
| 782 |
+
|
| 783 |
+
return (coalesced_group(0));
|
| 784 |
+
}
|
| 785 |
+
|
| 786 |
+
protected:
|
| 787 |
+
_CG_QUALIFIER coalesced_group(unsigned int mask) {
|
| 788 |
+
_data.coalesced.mask = mask;
|
| 789 |
+
_data.coalesced.size = __popc(mask);
|
| 790 |
+
_data.coalesced.metaGroupRank = 0;
|
| 791 |
+
_data.coalesced.metaGroupSize = 1;
|
| 792 |
+
_data.coalesced.is_tiled = false;
|
| 793 |
+
}
|
| 794 |
+
|
| 795 |
+
_CG_QUALIFIER unsigned int get_mask() const {
|
| 796 |
+
return (_data.coalesced.mask);
|
| 797 |
+
}
|
| 798 |
+
|
| 799 |
+
public:
|
| 800 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
|
| 801 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
|
| 802 |
+
|
| 803 |
+
_CG_QUALIFIER unsigned int num_threads() const {
|
| 804 |
+
return _data.coalesced.size;
|
| 805 |
+
}
|
| 806 |
+
|
| 807 |
+
_CG_QUALIFIER unsigned int size() const {
|
| 808 |
+
return num_threads();
|
| 809 |
+
}
|
| 810 |
+
|
| 811 |
+
_CG_QUALIFIER unsigned int thread_rank() const {
|
| 812 |
+
return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
|
| 813 |
+
}
|
| 814 |
+
|
| 815 |
+
// Rank of this group in the upper level of the hierarchy
|
| 816 |
+
_CG_QUALIFIER unsigned int meta_group_rank() const {
|
| 817 |
+
return _data.coalesced.metaGroupRank;
|
| 818 |
+
}
|
| 819 |
+
|
| 820 |
+
// Number of groups created when the parent group was partitioned
|
| 821 |
+
_CG_QUALIFIER unsigned int meta_group_size() const {
|
| 822 |
+
return _data.coalesced.metaGroupSize;
|
| 823 |
+
}
|
| 824 |
+
|
| 825 |
+
_CG_QUALIFIER void sync() const {
|
| 826 |
+
__syncwarp(_data.coalesced.mask);
|
| 827 |
+
}
|
| 828 |
+
|
| 829 |
+
#ifdef _CG_CPP11_FEATURES
|
| 830 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 831 |
+
_CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
|
| 832 |
+
unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 :
|
| 833 |
+
(size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1));
|
| 834 |
+
|
| 835 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 836 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
|
| 837 |
+
}
|
| 838 |
+
|
| 839 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 840 |
+
_CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
|
| 841 |
+
if (size() == 32) {
|
| 842 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_down(
|
| 843 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
|
| 844 |
+
}
|
| 845 |
+
|
| 846 |
+
unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
|
| 847 |
+
|
| 848 |
+
if (lane >= 32)
|
| 849 |
+
lane = details::laneid();
|
| 850 |
+
|
| 851 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 852 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
|
| 853 |
+
}
|
| 854 |
+
|
| 855 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 856 |
+
_CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
|
| 857 |
+
if (size() == 32) {
|
| 858 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_up(
|
| 859 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
|
| 860 |
+
}
|
| 861 |
+
|
| 862 |
+
unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
|
| 863 |
+
if (lane >= 32)
|
| 864 |
+
lane = details::laneid();
|
| 865 |
+
|
| 866 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 867 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
|
| 868 |
+
}
|
| 869 |
+
#else
|
| 870 |
+
template <typename TyIntegral>
|
| 871 |
+
_CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {
|
| 872 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 873 |
+
unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
|
| 874 |
+
(size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
|
| 875 |
+
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
|
| 876 |
+
}
|
| 877 |
+
|
| 878 |
+
template <typename TyIntegral>
|
| 879 |
+
_CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
|
| 880 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 881 |
+
if (size() == 32) {
|
| 882 |
+
return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
|
| 883 |
+
}
|
| 884 |
+
unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
|
| 885 |
+
if (lane >= 32) lane = details::laneid();
|
| 886 |
+
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
|
| 887 |
+
}
|
| 888 |
+
|
| 889 |
+
template <typename TyIntegral>
|
| 890 |
+
_CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
|
| 891 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 892 |
+
if (size() == 32) {
|
| 893 |
+
return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
|
| 894 |
+
}
|
| 895 |
+
unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
|
| 896 |
+
if (lane >= 32) lane = details::laneid();
|
| 897 |
+
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
|
| 898 |
+
}
|
| 899 |
+
#endif
|
| 900 |
+
|
| 901 |
+
_CG_QUALIFIER int any(int predicate) const {
|
| 902 |
+
return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
|
| 903 |
+
}
|
| 904 |
+
_CG_QUALIFIER int all(int predicate) const {
|
| 905 |
+
return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
|
| 906 |
+
}
|
| 907 |
+
_CG_QUALIFIER unsigned int ballot(int predicate) const {
|
| 908 |
+
if (size() == 32) {
|
| 909 |
+
return (__ballot_sync(0xFFFFFFFF, predicate));
|
| 910 |
+
}
|
| 911 |
+
unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
|
| 912 |
+
return (_packLanes(lane_ballot));
|
| 913 |
+
}
|
| 914 |
+
|
| 915 |
+
#ifdef _CG_HAS_MATCH_COLLECTIVE
|
| 916 |
+
|
| 917 |
+
template <typename TyIntegral>
|
| 918 |
+
_CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
|
| 919 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 920 |
+
if (size() == 32) {
|
| 921 |
+
return (__match_any_sync(0xFFFFFFFF, val));
|
| 922 |
+
}
|
| 923 |
+
unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
|
| 924 |
+
return (_packLanes(lane_match));
|
| 925 |
+
}
|
| 926 |
+
|
| 927 |
+
template <typename TyIntegral>
|
| 928 |
+
_CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
|
| 929 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 930 |
+
if (size() == 32) {
|
| 931 |
+
return (__match_all_sync(0xFFFFFFFF, val, &pred));
|
| 932 |
+
}
|
| 933 |
+
unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
|
| 934 |
+
return (_packLanes(lane_match));
|
| 935 |
+
}
|
| 936 |
+
|
| 937 |
+
#endif /* _CG_HAS_MATCH_COLLECTIVE */
|
| 938 |
+
|
| 939 |
+
};
|
| 940 |
+
|
| 941 |
+
_CG_QUALIFIER coalesced_group coalesced_threads()
|
| 942 |
+
{
|
| 943 |
+
return (coalesced_group(__activemask()));
|
| 944 |
+
}
|
| 945 |
+
|
| 946 |
+
namespace details {
|
| 947 |
+
template <unsigned int Size> struct verify_thread_block_tile_size;
|
| 948 |
+
template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
|
| 949 |
+
template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
|
| 950 |
+
template <> struct verify_thread_block_tile_size<8> { typedef void OK; };
|
| 951 |
+
template <> struct verify_thread_block_tile_size<4> { typedef void OK; };
|
| 952 |
+
template <> struct verify_thread_block_tile_size<2> { typedef void OK; };
|
| 953 |
+
template <> struct verify_thread_block_tile_size<1> { typedef void OK; };
|
| 954 |
+
|
| 955 |
+
#ifdef _CG_CPP11_FEATURES
|
| 956 |
+
template <unsigned int Size>
|
| 957 |
+
using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>;
|
| 958 |
+
|
| 959 |
+
template <unsigned int Size>
|
| 960 |
+
using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>;
|
| 961 |
+
template <unsigned int Size>
|
| 962 |
+
using _is_multi_warp =
|
| 963 |
+
_CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;
|
| 964 |
+
|
| 965 |
+
template <unsigned int Size>
|
| 966 |
+
using _is_valid_single_warp_tile =
|
| 967 |
+
_CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;
|
| 968 |
+
template <unsigned int Size>
|
| 969 |
+
using _is_valid_multi_warp_tile =
|
| 970 |
+
_CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
|
| 971 |
+
#else
|
| 972 |
+
template <unsigned int Size>
|
| 973 |
+
struct _is_multi_warp {
|
| 974 |
+
static const bool value = false;
|
| 975 |
+
};
|
| 976 |
+
#endif
|
| 977 |
+
}
|
| 978 |
+
|
| 979 |
+
template <unsigned int Size>
|
| 980 |
+
class __static_size_tile_base
|
| 981 |
+
{
|
| 982 |
+
protected:
|
| 983 |
+
_CG_STATIC_CONST_DECL unsigned int numThreads = Size;
|
| 984 |
+
|
| 985 |
+
public:
|
| 986 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
|
| 987 |
+
|
| 988 |
+
// Rank of thread within tile
|
| 989 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank() {
|
| 990 |
+
return (details::cta::thread_rank() & (numThreads - 1));
|
| 991 |
+
}
|
| 992 |
+
|
| 993 |
+
// Number of threads within tile
|
| 994 |
+
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
|
| 995 |
+
return numThreads;
|
| 996 |
+
}
|
| 997 |
+
|
| 998 |
+
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {
|
| 999 |
+
return num_threads();
|
| 1000 |
+
}
|
| 1001 |
+
};
|
| 1002 |
+
|
| 1003 |
+
template <unsigned int Size>
|
| 1004 |
+
class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
|
| 1005 |
+
{
|
| 1006 |
+
friend class details::_coalesced_group_data_access;
|
| 1007 |
+
typedef details::tile::tile_helpers<Size> th;
|
| 1008 |
+
|
| 1009 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1010 |
+
static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
|
| 1011 |
+
#else
|
| 1012 |
+
typedef typename details::verify_thread_block_tile_size<Size>::OK valid;
|
| 1013 |
+
#endif
|
| 1014 |
+
using __static_size_tile_base<Size>::numThreads;
|
| 1015 |
+
_CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;
|
| 1016 |
+
|
| 1017 |
+
protected:
|
| 1018 |
+
_CG_STATIC_QUALIFIER unsigned int build_mask() {
|
| 1019 |
+
unsigned int mask = fullMask;
|
| 1020 |
+
if (numThreads != 32) {
|
| 1021 |
+
// [0,31] representing the current active thread in the warp
|
| 1022 |
+
unsigned int laneId = details::laneid();
|
| 1023 |
+
// shift mask according to the partition it belongs to
|
| 1024 |
+
mask = th::tileMask << (laneId & ~(th::laneMask));
|
| 1025 |
+
}
|
| 1026 |
+
return (mask);
|
| 1027 |
+
}
|
| 1028 |
+
|
| 1029 |
+
public:
|
| 1030 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
|
| 1031 |
+
|
| 1032 |
+
_CG_STATIC_QUALIFIER void sync() {
|
| 1033 |
+
__syncwarp(build_mask());
|
| 1034 |
+
}
|
| 1035 |
+
|
| 1036 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1037 |
+
// PTX supported collectives
|
| 1038 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1039 |
+
_CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
|
| 1040 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 1041 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
|
| 1042 |
+
}
|
| 1043 |
+
|
| 1044 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1045 |
+
_CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
|
| 1046 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_down(
|
| 1047 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
|
| 1048 |
+
}
|
| 1049 |
+
|
| 1050 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1051 |
+
_CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
|
| 1052 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_up(
|
| 1053 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
|
| 1054 |
+
}
|
| 1055 |
+
|
| 1056 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1057 |
+
_CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
|
| 1058 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
|
| 1059 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
|
| 1060 |
+
}
|
| 1061 |
+
#else
|
| 1062 |
+
template <typename TyIntegral>
|
| 1063 |
+
_CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {
|
| 1064 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1065 |
+
return (__shfl_sync(build_mask(), var, srcRank, numThreads));
|
| 1066 |
+
}
|
| 1067 |
+
|
| 1068 |
+
template <typename TyIntegral>
|
| 1069 |
+
_CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
|
| 1070 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1071 |
+
return (__shfl_down_sync(build_mask(), var, delta, numThreads));
|
| 1072 |
+
}
|
| 1073 |
+
|
| 1074 |
+
template <typename TyIntegral>
|
| 1075 |
+
_CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
|
| 1076 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1077 |
+
return (__shfl_up_sync(build_mask(), var, delta, numThreads));
|
| 1078 |
+
}
|
| 1079 |
+
|
| 1080 |
+
template <typename TyIntegral>
|
| 1081 |
+
_CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
|
| 1082 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1083 |
+
return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
|
| 1084 |
+
}
|
| 1085 |
+
#endif //_CG_CPP11_FEATURES
|
| 1086 |
+
|
| 1087 |
+
_CG_QUALIFIER int any(int predicate) const {
|
| 1088 |
+
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
|
| 1089 |
+
return (lane_ballot != 0);
|
| 1090 |
+
}
|
| 1091 |
+
_CG_QUALIFIER int all(int predicate) const {
|
| 1092 |
+
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
|
| 1093 |
+
return (lane_ballot == build_mask());
|
| 1094 |
+
}
|
| 1095 |
+
_CG_QUALIFIER unsigned int ballot(int predicate) const {
|
| 1096 |
+
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
|
| 1097 |
+
return (lane_ballot >> (details::laneid() & (~(th::laneMask))));
|
| 1098 |
+
}
|
| 1099 |
+
|
| 1100 |
+
#ifdef _CG_HAS_MATCH_COLLECTIVE
|
| 1101 |
+
template <typename TyIntegral>
|
| 1102 |
+
_CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
|
| 1103 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1104 |
+
unsigned int lane_match = __match_any_sync(build_mask(), val);
|
| 1105 |
+
return (lane_match >> (details::laneid() & (~(th::laneMask))));
|
| 1106 |
+
}
|
| 1107 |
+
|
| 1108 |
+
template <typename TyIntegral>
|
| 1109 |
+
_CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
|
| 1110 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1111 |
+
unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
|
| 1112 |
+
return (lane_match >> (details::laneid() & (~(th::laneMask))));
|
| 1113 |
+
}
|
| 1114 |
+
#endif
|
| 1115 |
+
|
| 1116 |
+
};
|
| 1117 |
+
|
| 1118 |
+
template <unsigned int Size, typename ParentT>
|
| 1119 |
+
class __static_parent_thread_block_tile_base
|
| 1120 |
+
{
|
| 1121 |
+
public:
|
| 1122 |
+
// Rank of this group in the upper level of the hierarchy
|
| 1123 |
+
_CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
|
| 1124 |
+
return ParentT::thread_rank() / Size;
|
| 1125 |
+
}
|
| 1126 |
+
|
| 1127 |
+
// Total num partitions created out of all CTAs when the group was created
|
| 1128 |
+
_CG_STATIC_QUALIFIER unsigned int meta_group_size() {
|
| 1129 |
+
return (ParentT::size() + Size - 1) / Size;
|
| 1130 |
+
}
|
| 1131 |
+
};
|
| 1132 |
+
|
| 1133 |
+
/**
|
| 1134 |
+
* class thread_block_tile<unsigned int Size, ParentT = void>
|
| 1135 |
+
*
|
| 1136 |
+
* Statically-sized group type, representing one tile of a thread block.
|
| 1137 |
+
* The only specializations currently supported are those with native
|
| 1138 |
+
* hardware support (1/2/4/8/16/32)
|
| 1139 |
+
*
|
| 1140 |
+
* This group exposes warp-synchronous builtins.
|
| 1141 |
+
* Can only be constructed via tiled_partition<Size>(ParentT&)
|
| 1142 |
+
*/
|
| 1143 |
+
|
| 1144 |
+
template <unsigned int Size, typename ParentT = void>
|
| 1145 |
+
class __single_warp_thread_block_tile :
|
| 1146 |
+
public __static_size_thread_block_tile_base<Size>,
|
| 1147 |
+
public __static_parent_thread_block_tile_base<Size, ParentT>
|
| 1148 |
+
{
|
| 1149 |
+
typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
|
| 1150 |
+
friend class details::_coalesced_group_data_access;
|
| 1151 |
+
|
| 1152 |
+
protected:
|
| 1153 |
+
_CG_QUALIFIER __single_warp_thread_block_tile() { };
|
| 1154 |
+
_CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { };
|
| 1155 |
+
|
| 1156 |
+
_CG_STATIC_QUALIFIER unsigned int get_mask() {
|
| 1157 |
+
return __static_size_thread_block_tile_base<Size>::build_mask();
|
| 1158 |
+
}
|
| 1159 |
+
};
|
| 1160 |
+
|
| 1161 |
+
template <unsigned int Size>
|
| 1162 |
+
class __single_warp_thread_block_tile<Size, void> :
|
| 1163 |
+
public __static_size_thread_block_tile_base<Size>,
|
| 1164 |
+
public thread_group_base<details::coalesced_group_id>
|
| 1165 |
+
{
|
| 1166 |
+
_CG_STATIC_CONST_DECL unsigned int numThreads = Size;
|
| 1167 |
+
|
| 1168 |
+
template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
|
| 1169 |
+
friend class details::_coalesced_group_data_access;
|
| 1170 |
+
|
| 1171 |
+
typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;
|
| 1172 |
+
|
| 1173 |
+
protected:
|
| 1174 |
+
_CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank = 0, unsigned int meta_group_size = 1) {
|
| 1175 |
+
_data.coalesced.mask = staticSizeBaseT::build_mask();
|
| 1176 |
+
_data.coalesced.size = numThreads;
|
| 1177 |
+
_data.coalesced.metaGroupRank = meta_group_rank;
|
| 1178 |
+
_data.coalesced.metaGroupSize = meta_group_size;
|
| 1179 |
+
_data.coalesced.is_tiled = true;
|
| 1180 |
+
}
|
| 1181 |
+
|
| 1182 |
+
_CG_QUALIFIER unsigned int get_mask() const {
|
| 1183 |
+
return (_data.coalesced.mask);
|
| 1184 |
+
}
|
| 1185 |
+
|
| 1186 |
+
public:
|
| 1187 |
+
using staticSizeBaseT::sync;
|
| 1188 |
+
using staticSizeBaseT::size;
|
| 1189 |
+
using staticSizeBaseT::num_threads;
|
| 1190 |
+
using staticSizeBaseT::thread_rank;
|
| 1191 |
+
|
| 1192 |
+
_CG_QUALIFIER unsigned int meta_group_rank() const {
|
| 1193 |
+
return _data.coalesced.metaGroupRank;
|
| 1194 |
+
}
|
| 1195 |
+
|
| 1196 |
+
_CG_QUALIFIER unsigned int meta_group_size() const {
|
| 1197 |
+
return _data.coalesced.metaGroupSize;
|
| 1198 |
+
}
|
| 1199 |
+
};
|
| 1200 |
+
|
| 1201 |
+
/**
|
| 1202 |
+
* Outer level API calls
|
| 1203 |
+
* void sync(GroupT) - see <group_type>.sync()
|
| 1204 |
+
* <rank_type> thread_rank(GroupT) - see <group_type>.thread_rank()
|
| 1205 |
+
* <size_type> group_size(GroupT) - see <group_type>.num_threads()
|
| 1206 |
+
*/
|
| 1207 |
+
template <class GroupT>
|
| 1208 |
+
_CG_QUALIFIER void sync(GroupT const &g)
|
| 1209 |
+
{
|
| 1210 |
+
g.sync();
|
| 1211 |
+
}
|
| 1212 |
+
|
| 1213 |
+
// TODO: Use a static dispatch to determine appropriate return type
|
| 1214 |
+
// C++03 is stuck with unsigned long long for now
|
| 1215 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1216 |
+
template <class GroupT>
|
| 1217 |
+
_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {
|
| 1218 |
+
return g.thread_rank();
|
| 1219 |
+
}
|
| 1220 |
+
|
| 1221 |
+
|
| 1222 |
+
template <class GroupT>
|
| 1223 |
+
_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) {
|
| 1224 |
+
return g.num_threads();
|
| 1225 |
+
}
|
| 1226 |
+
#else
|
| 1227 |
+
template <class GroupT>
|
| 1228 |
+
_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {
|
| 1229 |
+
return static_cast<unsigned long long>(g.thread_rank());
|
| 1230 |
+
}
|
| 1231 |
+
|
| 1232 |
+
|
| 1233 |
+
template <class GroupT>
|
| 1234 |
+
_CG_QUALIFIER unsigned long long group_size(GroupT const &g) {
|
| 1235 |
+
return static_cast<unsigned long long>(g.num_threads());
|
| 1236 |
+
}
|
| 1237 |
+
#endif
|
| 1238 |
+
|
| 1239 |
+
|
| 1240 |
+
/**
|
| 1241 |
+
* tiled_partition
|
| 1242 |
+
*
|
| 1243 |
+
* The tiled_partition(parent, tilesz) method is a collective operation that
|
| 1244 |
+
* partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
|
| 1245 |
+
*
|
| 1246 |
+
* A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
|
| 1247 |
+
* be created where threads having identical k = (thread_rank(parent)/tilesz)
|
| 1248 |
+
* will be members of the same subgroup.
|
| 1249 |
+
*
|
| 1250 |
+
* The implementation may cause the calling thread to wait until all the members
|
| 1251 |
+
* of the parent group have invoked the operation before resuming execution.
|
| 1252 |
+
*
|
| 1253 |
+
* Functionality is limited to power-of-two sized subgroup instances of at most
|
| 1254 |
+
* 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
|
| 1255 |
+
* tiled_partition() in _CG_VERSION 1000.
|
| 1256 |
+
*/
|
| 1257 |
+
// Generic thread_group overload: inspects the runtime type tag of the parent.
// A coalesced parent is partitioned as a coalesced_group; every other parent
// is treated as a thread_block.
_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
{
    if (parent.get_type() != details::coalesced_group_id) {
        const thread_block *block_parent = static_cast<const thread_block*>(&parent);
        return block_parent->_get_tiled_threads(tilesz);
    }

    const coalesced_group *coalesced_parent = static_cast<const coalesced_group*>(&parent);
    return coalesced_parent->_get_tiled_threads(tilesz);
}
|
| 1268 |
+
|
| 1269 |
+
// Thread block type overload: returns a basic thread_group for now (may be specialized later)
|
| 1270 |
+
_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
|
| 1271 |
+
{
|
| 1272 |
+
return (parent._get_tiled_threads(tilesz));
|
| 1273 |
+
}
|
| 1274 |
+
|
| 1275 |
+
// Coalesced group type overload: retains its ability to stay coalesced
|
| 1276 |
+
_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
|
| 1277 |
+
{
|
| 1278 |
+
return (parent._get_tiled_threads(tilesz));
|
| 1279 |
+
}
|
| 1280 |
+
|
| 1281 |
+
namespace details {
|
| 1282 |
+
template <unsigned int Size, typename ParentT>
|
| 1283 |
+
class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};
|
| 1284 |
+
|
| 1285 |
+
template <unsigned int Size, typename ParentT>
|
| 1286 |
+
_CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {
|
| 1287 |
+
return internal_thread_block_tile<Size, ParentT>();
|
| 1288 |
+
}
|
| 1289 |
+
|
| 1290 |
+
template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
|
| 1291 |
+
_CG_QUALIFIER TyVal multi_warp_collectives_helper(
|
| 1292 |
+
const GroupT& group,
|
| 1293 |
+
WarpLambda warp_lambda,
|
| 1294 |
+
InterWarpLambda inter_warp_lambda) {
|
| 1295 |
+
return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
|
| 1296 |
+
}
|
| 1297 |
+
|
| 1298 |
+
template <typename T, typename GroupT>
|
| 1299 |
+
_CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {
|
| 1300 |
+
return group.template get_scratch_location<T>(warp_id);
|
| 1301 |
+
}
|
| 1302 |
+
|
| 1303 |
+
template <typename GroupT>
|
| 1304 |
+
_CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {
|
| 1305 |
+
return group.get_sync_location();
|
| 1306 |
+
}
|
| 1307 |
+
|
| 1308 |
+
}
|
| 1309 |
+
/**
|
| 1310 |
+
* tiled_partition<tilesz>
|
| 1311 |
+
*
|
| 1312 |
+
* The tiled_partition<tilesz>(parent) method is a collective operation that
|
| 1313 |
+
* partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
|
| 1314 |
+
*
|
| 1315 |
+
* A total of (size(parent)/tilesz) subgroups will be created,
|
| 1316 |
+
* therefore the parent group size must be evenly divisible by the tilesz.
|
| 1317 |
+
* The allowed parent groups are thread_block or thread_block_tile<size>.
|
| 1318 |
+
*
|
| 1319 |
+
* The implementation may cause the calling thread to wait until all the members
|
| 1320 |
+
* of the parent group have invoked the operation before resuming execution.
|
| 1321 |
+
*
|
| 1322 |
+
* Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
|
| 1323 |
+
* The size(parent) must be greater than the template Size parameter
|
| 1324 |
+
* otherwise the results are undefined.
|
| 1325 |
+
*/
|
| 1326 |
+
|
| 1327 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 1328 |
+
template <unsigned int Size>
|
| 1329 |
+
class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
|
| 1330 |
+
{
|
| 1331 |
+
static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512");
|
| 1332 |
+
|
| 1333 |
+
template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
|
| 1334 |
+
friend __device__ TyVal details::multi_warp_collectives_helper(
|
| 1335 |
+
const GroupT& group,
|
| 1336 |
+
WarpLambda warp_lambda,
|
| 1337 |
+
InterWarpLambda inter_warp_lambda);
|
| 1338 |
+
template <typename T, typename GroupT>
|
| 1339 |
+
friend __device__ T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
|
| 1340 |
+
template <typename GroupT>
|
| 1341 |
+
friend __device__ details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
|
| 1342 |
+
template <unsigned int OtherSize>
|
| 1343 |
+
friend class __static_size_multi_warp_tile_base;
|
| 1344 |
+
using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
|
| 1345 |
+
using ThisType = __static_size_multi_warp_tile_base<Size>;
|
| 1346 |
+
_CG_STATIC_CONST_DECL int numWarps = Size / 32;
|
| 1347 |
+
|
| 1348 |
+
protected:
|
| 1349 |
+
details::multi_warp_scratch* const tile_memory;
|
| 1350 |
+
|
| 1351 |
+
template <typename GroupT>
|
| 1352 |
+
_CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : tile_memory(g.tile_memory) {
|
| 1353 |
+
#if defined(_CG_HAS_RESERVED_SHARED)
|
| 1354 |
+
details::sync_warps_reset(get_sync_location(), details::cta::thread_rank());
|
| 1355 |
+
g.sync();
|
| 1356 |
+
#endif
|
| 1357 |
+
}
|
| 1358 |
+
|
| 1359 |
+
|
| 1360 |
+
private:
|
| 1361 |
+
_CG_QUALIFIER details::barrier_t* get_sync_location() const {
|
| 1362 |
+
// Different group sizes use different barriers, all groups of a given size share one barrier.
|
| 1363 |
+
unsigned int sync_id = details::log2(Size / 64);
|
| 1364 |
+
return &tile_memory->barriers[sync_id];
|
| 1365 |
+
}
|
| 1366 |
+
|
| 1367 |
+
template <typename T>
|
| 1368 |
+
_CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {
|
| 1369 |
+
unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;
|
| 1370 |
+
return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
|
| 1371 |
+
}
|
| 1372 |
+
|
| 1373 |
+
template <typename T>
|
| 1374 |
+
_CG_QUALIFIER T* get_scratch_location() const {
|
| 1375 |
+
unsigned int scratch_id = details::cta::thread_rank() / 32;
|
| 1376 |
+
return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
|
| 1377 |
+
}
|
| 1378 |
+
|
| 1379 |
+
template <typename TyVal>
|
| 1380 |
+
_CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {
|
| 1381 |
+
unsigned int src_warp = src / 32;
|
| 1382 |
+
auto warp = details::tiled_partition_internal<32, ThisType>();
|
| 1383 |
+
details::barrier_t* sync_location = get_sync_location();
|
| 1384 |
+
|
| 1385 |
+
// Get warp slot of the source threads warp.
|
| 1386 |
+
TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
|
| 1387 |
+
|
| 1388 |
+
if (warp.meta_group_rank() == src_warp) {
|
| 1389 |
+
warp.sync();
|
| 1390 |
+
// Put shuffled value into my warp slot and let my warp arrive at the barrier.
|
| 1391 |
+
if (thread_rank() == src) {
|
| 1392 |
+
*warp_scratch_location = val;
|
| 1393 |
+
}
|
| 1394 |
+
details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
|
| 1395 |
+
TyVal result = *warp_scratch_location;
|
| 1396 |
+
details::sync_warps_wait(sync_location, details::cta::thread_rank());
|
| 1397 |
+
return result;
|
| 1398 |
+
}
|
| 1399 |
+
else {
|
| 1400 |
+
// Wait for the source warp to arrive on the barrier.
|
| 1401 |
+
details::sync_warps_wait_for_specific_warp(sync_location,
|
| 1402 |
+
(details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp));
|
| 1403 |
+
TyVal result = *warp_scratch_location;
|
| 1404 |
+
details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
|
| 1405 |
+
return result;
|
| 1406 |
+
}
|
| 1407 |
+
}
|
| 1408 |
+
|
| 1409 |
+
template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
|
| 1410 |
+
_CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
|
| 1411 |
+
static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
|
| 1412 |
+
"Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
|
| 1413 |
+
auto warp = details::tiled_partition_internal<32, ThisType>();
|
| 1414 |
+
details::barrier_t* sync_location = get_sync_location();
|
| 1415 |
+
TyVal* warp_scratch_location = get_scratch_location<TyVal>();
|
| 1416 |
+
|
| 1417 |
+
warp_lambda(warp, warp_scratch_location);
|
| 1418 |
+
|
| 1419 |
+
if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {
|
| 1420 |
+
auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
|
| 1421 |
+
if (subwarp.meta_group_rank() == 0) {
|
| 1422 |
+
TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());
|
| 1423 |
+
inter_warp_lambda(subwarp, thread_scratch_location);
|
| 1424 |
+
}
|
| 1425 |
+
warp.sync();
|
| 1426 |
+
details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
|
| 1427 |
+
}
|
| 1428 |
+
TyVal result = *warp_scratch_location;
|
| 1429 |
+
return result;
|
| 1430 |
+
}
|
| 1431 |
+
|
| 1432 |
+
public:
|
| 1433 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
|
| 1434 |
+
|
| 1435 |
+
using __static_size_tile_base<Size>::thread_rank;
|
| 1436 |
+
|
| 1437 |
+
template <typename TyVal>
|
| 1438 |
+
_CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {
|
| 1439 |
+
static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
|
| 1440 |
+
"Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
|
| 1441 |
+
return shfl_impl(val, src);
|
| 1442 |
+
}
|
| 1443 |
+
|
| 1444 |
+
_CG_QUALIFIER void sync() const {
|
| 1445 |
+
details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
|
| 1446 |
+
}
|
| 1447 |
+
|
| 1448 |
+
_CG_QUALIFIER int any(int predicate) const {
|
| 1449 |
+
auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
|
| 1450 |
+
*warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);
|
| 1451 |
+
};
|
| 1452 |
+
auto inter_warp_lambda =
|
| 1453 |
+
[] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
|
| 1454 |
+
*thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
|
| 1455 |
+
};
|
| 1456 |
+
return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
|
| 1457 |
+
}
|
| 1458 |
+
|
| 1459 |
+
_CG_QUALIFIER int all(int predicate) const {
|
| 1460 |
+
auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
|
| 1461 |
+
*warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);
|
| 1462 |
+
};
|
| 1463 |
+
auto inter_warp_lambda =
|
| 1464 |
+
[] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
|
| 1465 |
+
*thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
|
| 1466 |
+
};
|
| 1467 |
+
return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
|
| 1468 |
+
}
|
| 1469 |
+
};
|
| 1470 |
+
|
| 1471 |
+
|
| 1472 |
+
template <unsigned int Size, typename ParentT = void>
|
| 1473 |
+
class __multi_warp_thread_block_tile :
|
| 1474 |
+
public __static_size_multi_warp_tile_base<Size>,
|
| 1475 |
+
public __static_parent_thread_block_tile_base<Size, ParentT>
|
| 1476 |
+
{
|
| 1477 |
+
typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
|
| 1478 |
+
typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
|
| 1479 |
+
protected:
|
| 1480 |
+
_CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) :
|
| 1481 |
+
__static_size_multi_warp_tile_base<Size>(g) {}
|
| 1482 |
+
};
|
| 1483 |
+
|
| 1484 |
+
template <unsigned int Size>
|
| 1485 |
+
class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
|
| 1486 |
+
{
|
| 1487 |
+
const unsigned int metaGroupRank;
|
| 1488 |
+
const unsigned int metaGroupSize;
|
| 1489 |
+
|
| 1490 |
+
protected:
|
| 1491 |
+
template <unsigned int OtherSize, typename ParentT>
|
| 1492 |
+
_CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :
|
| 1493 |
+
__static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
|
| 1494 |
+
|
| 1495 |
+
public:
|
| 1496 |
+
_CG_QUALIFIER unsigned int meta_group_rank() const {
|
| 1497 |
+
return metaGroupRank;
|
| 1498 |
+
}
|
| 1499 |
+
|
| 1500 |
+
_CG_QUALIFIER unsigned int meta_group_size() const {
|
| 1501 |
+
return metaGroupSize;
|
| 1502 |
+
}
|
| 1503 |
+
};
|
| 1504 |
+
#endif
|
| 1505 |
+
|
| 1506 |
+
template <unsigned int Size, typename ParentT = void>
|
| 1507 |
+
class thread_block_tile;
|
| 1508 |
+
|
| 1509 |
+
namespace details {
|
| 1510 |
+
template <unsigned int Size, typename ParentT, bool IsMultiWarp>
|
| 1511 |
+
class thread_block_tile_impl;
|
| 1512 |
+
|
| 1513 |
+
template <unsigned int Size, typename ParentT>
|
| 1514 |
+
class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
|
| 1515 |
+
{
|
| 1516 |
+
protected:
|
| 1517 |
+
template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
|
| 1518 |
+
_CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :
|
| 1519 |
+
__single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}
|
| 1520 |
+
|
| 1521 |
+
_CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :
|
| 1522 |
+
__single_warp_thread_block_tile<Size, ParentT>() {}
|
| 1523 |
+
};
|
| 1524 |
+
|
| 1525 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 1526 |
+
template <unsigned int Size, typename ParentT>
|
| 1527 |
+
class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
|
| 1528 |
+
{
|
| 1529 |
+
protected:
|
| 1530 |
+
template <typename GroupT>
|
| 1531 |
+
_CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
|
| 1532 |
+
__multi_warp_thread_block_tile<Size, ParentT>(g) {}
|
| 1533 |
+
};
|
| 1534 |
+
#else
|
| 1535 |
+
template <unsigned int Size, typename ParentT>
|
| 1536 |
+
class thread_block_tile_impl<Size, ParentT, true>
|
| 1537 |
+
{
|
| 1538 |
+
protected:
|
| 1539 |
+
template <typename GroupT>
|
| 1540 |
+
_CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
|
| 1541 |
+
};
|
| 1542 |
+
#endif
|
| 1543 |
+
}
|
| 1544 |
+
|
| 1545 |
+
template <unsigned int Size, typename ParentT>
|
| 1546 |
+
class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
|
| 1547 |
+
{
|
| 1548 |
+
friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();
|
| 1549 |
+
|
| 1550 |
+
protected:
|
| 1551 |
+
_CG_QUALIFIER thread_block_tile(const ParentT& g) :
|
| 1552 |
+
details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
|
| 1553 |
+
|
| 1554 |
+
public:
|
| 1555 |
+
_CG_QUALIFIER operator thread_block_tile<Size, void>() const {
|
| 1556 |
+
return thread_block_tile<Size, void>(*this);
|
| 1557 |
+
}
|
| 1558 |
+
};
|
| 1559 |
+
|
| 1560 |
+
template <unsigned int Size>
|
| 1561 |
+
class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
|
| 1562 |
+
{
|
| 1563 |
+
template <unsigned int, typename ParentT>
|
| 1564 |
+
friend class thread_block_tile;
|
| 1565 |
+
|
| 1566 |
+
protected:
|
| 1567 |
+
template <unsigned int OtherSize, typename OtherParentT>
|
| 1568 |
+
_CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :
|
| 1569 |
+
details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
|
| 1570 |
+
|
| 1571 |
+
public:
|
| 1572 |
+
template <typename ParentT>
|
| 1573 |
+
_CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :
|
| 1574 |
+
details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
|
| 1575 |
+
};
|
| 1576 |
+
|
| 1577 |
+
namespace details {
|
| 1578 |
+
template <unsigned int Size, typename ParentT>
|
| 1579 |
+
struct tiled_partition_impl;
|
| 1580 |
+
|
| 1581 |
+
template <unsigned int Size>
|
| 1582 |
+
struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
|
| 1583 |
+
_CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
|
| 1584 |
+
thread_block_tile<Size, thread_block>(g) {}
|
| 1585 |
+
};
|
| 1586 |
+
|
| 1587 |
+
// ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
|
| 1588 |
+
template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
|
| 1589 |
+
struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
|
| 1590 |
+
public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
|
| 1591 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1592 |
+
static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size");
|
| 1593 |
+
#endif
|
| 1594 |
+
_CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
|
| 1595 |
+
thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
|
| 1596 |
+
};
|
| 1597 |
+
|
| 1598 |
+
}
|
| 1599 |
+
|
| 1600 |
+
template <unsigned int Size, typename ParentT>
|
| 1601 |
+
_CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
|
| 1602 |
+
{
|
| 1603 |
+
return details::tiled_partition_impl<Size, ParentT>(g);
|
| 1604 |
+
}
|
| 1605 |
+
|
| 1606 |
+
/**
|
| 1607 |
+
* thread_group this_thread()
|
| 1608 |
+
*
|
| 1609 |
+
* Constructs a generic thread_group containing only the calling thread
|
| 1610 |
+
*/
|
| 1611 |
+
_CG_QUALIFIER thread_block_tile<1, void> this_thread()
{
    // The tile is created with thread_block_tile<1, thread_block> as its parent
    // so that the returned group has meta group rank 0 and meta group size 1.
    typedef thread_block_tile<1, thread_block_tile<1, thread_block> > single_thread_tile;
    return single_thread_tile(this_thread_block());
}
|
| 1617 |
+
|
| 1618 |
+
/**
|
| 1619 |
+
* <group_type>.sync()
|
| 1620 |
+
*
|
| 1621 |
+
* Executes a barrier across the group
|
| 1622 |
+
*
|
| 1623 |
+
* Implements both a compiler fence and an architectural fence to prevent
|
| 1624 |
+
* memory reordering around the barrier.
|
| 1625 |
+
*/
|
| 1626 |
+
_CG_QUALIFIER void thread_group::sync() const
{
    // Dispatch on the stored type tag and forward to the free sync() with
    // `this` cast to the matching concrete group type. An unrecognized tag
    // does nothing.
    switch (_data.group.type) {
    case details::coalesced_group_id:
        cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
        return;
    case details::thread_block_id:
        cooperative_groups::sync(*static_cast<const thread_block*>(this));
        return;
    case details::grid_group_id:
        cooperative_groups::sync(*static_cast<const grid_group*>(this));
        return;
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
    case details::multi_grid_group_id:
        cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
        return;
#endif
#if defined(_CG_HAS_CLUSTER_GROUP)
    case details::cluster_group_id:
        cooperative_groups::sync(*static_cast<const cluster_group*>(this));
        return;
#endif
    default:
        return;
    }
}
|
| 1652 |
+
|
| 1653 |
+
/**
 * <group_type>.size()
 *
 * Returns the total number of threads in the group.
 * A group whose type tag is not recognized reports 0.
 */
_CG_QUALIFIER unsigned long long thread_group::size() const
{
    // Downcast to the concrete group type named by the tag and ask it for its size.
    switch (_data.group.type) {
    case details::coalesced_group_id:
        return cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
    case details::thread_block_id:
        return cooperative_groups::group_size(*static_cast<const thread_block*>(this));
    case details::grid_group_id:
        return cooperative_groups::group_size(*static_cast<const grid_group*>(this));
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
    case details::multi_grid_group_id:
        return cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
#endif
#if defined(_CG_HAS_CLUSTER_GROUP)
    case details::cluster_group_id:
        return cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
#endif
    default:
        break;
    }
    return 0;
}
|
| 1686 |
+
|
| 1687 |
+
/**
 * <group_type>.thread_rank()
 *
 * Returns the linearized rank of the calling thread along the interval [0, size()).
 * A group whose type tag is not recognized reports rank 0.
 */
_CG_QUALIFIER unsigned long long thread_group::thread_rank() const
{
    // Downcast to the concrete group type named by the tag and ask it for the rank.
    switch (_data.group.type) {
    case details::coalesced_group_id:
        return cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
    case details::thread_block_id:
        return cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
    case details::grid_group_id:
        return cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
    case details::multi_grid_group_id:
        return cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
#endif
#if defined(_CG_HAS_CLUSTER_GROUP)
    case details::cluster_group_id:
        return cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
#endif
    default:
        break;
    }
    return 0;
}
|
| 1720 |
+
|
| 1721 |
+
_CG_END_NAMESPACE
|
| 1722 |
+
|
| 1723 |
+
#include <cooperative_groups/details/partitioning.h>
|
| 1724 |
+
#if (!defined(_MSC_VER) || defined(_WIN64))
|
| 1725 |
+
# include <cooperative_groups/details/invoke.h>
|
| 1726 |
+
#endif
|
| 1727 |
+
|
| 1728 |
+
# endif /* ! (__cplusplus, __CUDACC__) */
|
| 1729 |
+
|
| 1730 |
+
#endif /* !_COOPERATIVE_GROUPS_H_ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_COALESCED_REDUCE_H_
|
| 50 |
+
#define _CG_COALESCED_REDUCE_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
#include "cooperative_groups.h"
|
| 55 |
+
#include "partitioning.h"
|
| 56 |
+
#include "coalesced_scan.h"
|
| 57 |
+
|
| 58 |
+
_CG_BEGIN_NAMESPACE
|
| 59 |
+
|
| 60 |
+
namespace details {
|
| 61 |
+
|
| 62 |
+
// Reduction over a single-warp tile of TySize threads.
// Each round combines the running value with the one obtained through
// group.shfl_xor() at a distance that starts at TySize / 2 and halves until it
// reaches zero. The combined value is returned to the calling thread.
template <typename TyVal, typename TyOp, unsigned int TySize, typename ParentT>
_CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group,
                                    TyVal&& val,
                                    TyOp&& op) -> decltype(op(val, val)) {
    auto acc = val;
    int distance = TySize >> 1;
    while (distance > 0) {
        acc = op(acc, group.shfl_xor(acc, distance));
        distance >>= 1;
    }

    return acc;
}
|
| 73 |
+
|
| 74 |
+
// Reduction over an arbitrary coalesced group.
//  - A group of exactly 32 threads is handled as a full tile of size 32.
//  - Any other group runs an inclusive scan and then shuffles the scan result
//    held by the highest lane set in the group mask to the caller.
template <typename TyVal, typename TyOp>
_CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
    if (group.size() != 32) {
        auto scanned =
            inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
        unsigned int member_mask = _coalesced_group_data_access::get_mask(group);
        // 31 - clz(mask) is the index of the most significant set bit of the mask.
        unsigned int top_lane = 31 - __clz(member_mask);
        return details::tile::shuffle_dispatch<TyVal>::shfl(
            _CG_STL_NAMESPACE::forward<TyVal>(scanned), member_mask, top_lane, 32);
    }

    // Full coalesced group can go through faster path by being treated as a tile of size 32
    auto full_tile = details::tiled_partition_internal<32, void>();
    return coalesced_reduce(full_tile, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
}
|
| 90 |
+
|
| 91 |
+
} // details
|
| 92 |
+
|
| 93 |
+
_CG_END_NAMESPACE
|
| 94 |
+
|
| 95 |
+
#endif // _CG_COALESCED_REDUCE_H_
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_FUNCTIONAL_H
|
| 50 |
+
#define _CG_FUNCTIONAL_H
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
#ifdef _CG_USE_CUDA_STL
|
| 57 |
+
# include <cuda/std/functional>
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
_CG_BEGIN_NAMESPACE
|
| 61 |
+
|
| 62 |
+
namespace details {
#ifdef _CG_USE_CUDA_STL
    // The functors from <cuda/std/functional> are reused when it is available.
    using cuda::std::plus;
    using cuda::std::bit_and;
    using cuda::std::bit_xor;
    using cuda::std::bit_or;
#else
    // Otherwise provide minimal device-only binary functors with the same names.
    template <typename Ty> struct plus {
        __device__ __forceinline__ Ty operator()(Ty lhs, Ty rhs) const { return lhs + rhs; }
    };
    template <typename Ty> struct bit_and {
        __device__ __forceinline__ Ty operator()(Ty lhs, Ty rhs) const { return lhs & rhs; }
    };
    template <typename Ty> struct bit_xor {
        __device__ __forceinline__ Ty operator()(Ty lhs, Ty rhs) const { return lhs ^ rhs; }
    };
    template <typename Ty> struct bit_or {
        __device__ __forceinline__ Ty operator()(Ty lhs, Ty rhs) const { return lhs | rhs; }
    };
#endif // _CG_USE_CUDA_STL
} // details
|
| 75 |
+
|
| 76 |
+
// Public addition functor; operator() is inherited unchanged from details::plus.
template <class Ty>
struct plus : details::plus<Ty> {};
|
| 78 |
+
|
| 79 |
+
// Minimum selector: returns the smaller of the two arguments.
// arg2 is chosen only when it compares strictly less than arg1, so arg1 wins ties.
template <class Ty>
struct less {
    __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
        if (arg2 < arg1) {
            return arg2;
        }
        return arg1;
    }
};
|
| 85 |
+
|
| 86 |
+
// Maximum selector: returns the larger of the two arguments.
// arg2 is chosen only when arg1 compares strictly less than it, so arg1 wins ties.
template <class Ty>
struct greater {
    __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
        if (arg1 < arg2) {
            return arg2;
        }
        return arg1;
    }
};
|
| 92 |
+
|
| 93 |
+
// Public bitwise-AND functor; operator() is inherited unchanged from details::bit_and.
template <class Ty>
struct bit_and : details::bit_and<Ty> {};
|
| 95 |
+
|
| 96 |
+
// Public bitwise-XOR functor; operator() is inherited unchanged from details::bit_xor.
template <class Ty>
struct bit_xor : details::bit_xor<Ty> {};
|
| 98 |
+
|
| 99 |
+
// Public bitwise-OR functor; operator() is inherited unchanged from details::bit_or.
template <class Ty>
struct bit_or : details::bit_or<Ty> {};
|
| 101 |
+
|
| 102 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 103 |
+
namespace details {
|
| 104 |
+
// Compile-time flag: true only for integral types that are 4 or 8 bytes wide.
template <typename Ty>
using _atomic_is_type_supported =
    _CG_STL_NAMESPACE::integral_constant<bool,
                                         _CG_STL_NAMESPACE::is_integral<Ty>::value &&
                                             (sizeof(Ty) == 4 || sizeof(Ty) == 8)>;
|
| 107 |
+
|
| 108 |
+
// Trait telling whether a reduction functor can be dispatched to a dedicated atomic
// operation. Unknown functors are unsupported; the cooperative_groups functors below
// are supported exactly when their value type passes _atomic_is_type_supported.
template <class TyOp> struct _atomic_op_supported : _CG_STL_NAMESPACE::false_type {};
template <class Ty> struct _atomic_op_supported<cooperative_groups::plus<Ty>>    : _atomic_is_type_supported<Ty> {};
template <class Ty> struct _atomic_op_supported<cooperative_groups::less<Ty>>    : _atomic_is_type_supported<Ty> {};
template <class Ty> struct _atomic_op_supported<cooperative_groups::greater<Ty>> : _atomic_is_type_supported<Ty> {};
template <class Ty> struct _atomic_op_supported<cooperative_groups::bit_and<Ty>> : _atomic_is_type_supported<Ty> {};
template <class Ty> struct _atomic_op_supported<cooperative_groups::bit_or<Ty>>  : _atomic_is_type_supported<Ty> {};
template <class Ty> struct _atomic_op_supported<cooperative_groups::bit_xor<Ty>> : _atomic_is_type_supported<Ty> {};
|
| 115 |
+
|
| 116 |
+
// Generic atomic read-modify-write built on a compare-exchange loop.
// Loads the current value with relaxed ordering, then keeps trying to replace it with
// op(current, val) until the exchange succeeds. Returns the value that was stored in
// the atomic immediately before the successful update.
template<typename TyAtomic, typename TyVal, typename TyOp>
_CG_QUALIFIER remove_qual<TyVal> atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) {
    auto observed = atomic.load(cuda::std::memory_order_relaxed);
    bool exchanged = false;
    do {
        // On failure compare_exchange_weak refreshes `observed` with the current value,
        // so the next attempt recomputes op() from up-to-date data.
        exchanged = atomic.compare_exchange_weak(observed, op(observed, val), cuda::std::memory_order_relaxed);
    } while (!exchanged);
    return observed;
}
|
| 122 |
+
|
| 123 |
+
// Maps a cooperative_groups functor type to the fetch_* member of the atomic object
// that performs the same operation. Only the specializations below are defined; every
// one of them uses relaxed memory ordering and returns what the fetch_* call returns.
template<class TyOp>
struct op_picker;

// plus -> fetch_add
template<class TyVal>
struct op_picker<cooperative_groups::plus<TyVal>> {
    template<class TyAtomic>
    _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
        return atomic.fetch_add(val, cuda::std::memory_order_relaxed);
    }
};

// less -> fetch_min
template<class TyVal>
struct op_picker<cooperative_groups::less<TyVal>> {
    template<class TyAtomic>
    _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
        return atomic.fetch_min(val, cuda::std::memory_order_relaxed);
    }
};

// greater -> fetch_max
template<class TyVal>
struct op_picker<cooperative_groups::greater<TyVal>> {
    template<class TyAtomic>
    _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
        return atomic.fetch_max(val, cuda::std::memory_order_relaxed);
    }
};

// bit_and -> fetch_and
template<class TyVal>
struct op_picker<cooperative_groups::bit_and<TyVal>> {
    template<class TyAtomic>
    _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
        return atomic.fetch_and(val, cuda::std::memory_order_relaxed);
    }
};

// bit_xor -> fetch_xor
template<class TyVal>
struct op_picker<cooperative_groups::bit_xor<TyVal>> {
    template<class TyAtomic>
    _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
        return atomic.fetch_xor(val, cuda::std::memory_order_relaxed);
    }
};

// bit_or -> fetch_or
template<class TyVal>
struct op_picker<cooperative_groups::bit_or<TyVal>> {
    template<class TyAtomic>
    _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
        return atomic.fetch_or(val, cuda::std::memory_order_relaxed);
    }
};
|
| 173 |
+
|
| 174 |
+
// Compile-time switch between the two atomic update strategies.
// The primary template is intentionally empty; only <false> and <true> are usable.
template<bool atomic_supported>
struct atomic_update_dispatch {};

// No dedicated atomic operation exists for the functor: use the compare-exchange loop.
template<>
struct atomic_update_dispatch<false> {
    template<class TyAtomic, class TyVal, class TyOp>
    _CG_STATIC_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
        return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
    }
};

// A dedicated atomic operation exists: op_picker selects it from the functor type.
// The functor object itself is not invoked on this path.
template<>
struct atomic_update_dispatch<true> {
    template<class TyAtomic, class TyVal, class TyOp>
    _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) {
        return op_picker<details::remove_qual<TyOp>>::atomic_update(atomic, val);
    }
};
|
| 194 |
+
|
| 195 |
+
// Applies op to the atomic using val and returns what the chosen strategy returns.
// The strategy is picked at compile time from _atomic_op_supported for the functor type
// (with references and cv-qualifiers stripped).
template<typename TyAtomic, typename TyVal, typename TyOp>
_CG_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
    constexpr bool has_dedicated_op = _atomic_op_supported<details::remove_qual<TyOp>>::value;

    return atomic_update_dispatch<has_dedicated_op>::atomic_update(
        atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
}
|
| 201 |
+
|
| 202 |
+
// Writes val into the atomic with relaxed memory ordering.
template<class TyAtomic, class TyVal>
_CG_QUALIFIER void atomic_store(TyAtomic& atomic, TyVal&& val) {
    atomic.store(val, cuda::std::memory_order_relaxed);
}
|
| 206 |
+
}
|
| 207 |
+
#endif
|
| 208 |
+
|
| 209 |
+
_CG_END_NAMESPACE
|
| 210 |
+
|
| 211 |
+
#endif
|
| 212 |
+
#endif //_CG_FUNCTIONAL_H
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h
ADDED
|
@@ -0,0 +1,693 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_HELPERS_H_
|
| 50 |
+
# define _COOPERATIVE_GROUPS_HELPERS_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "sync.h"
|
| 54 |
+
|
| 55 |
+
_CG_BEGIN_NAMESPACE
|
| 56 |
+
|
| 57 |
+
namespace details {
|
| 58 |
+
#ifdef _CG_CPP11_FEATURES
|
| 59 |
+
// Trait: true for the standard floating point types and, when
// _CG_HAS_FP16_COLLECTIVE is defined, also for __half and __half2.
template <class Ty> struct _is_float_or_half : _CG_STL_NAMESPACE::is_floating_point<Ty> {};
# ifdef _CG_HAS_FP16_COLLECTIVE
template <> struct _is_float_or_half<__half>  : _CG_STL_NAMESPACE::true_type {};
template <> struct _is_float_or_half<__half2> : _CG_STL_NAMESPACE::true_type {};
# endif
// Same trait, evaluated after const/volatile qualifiers are removed from Ty.
template <class Ty>
using is_float_or_half = _is_float_or_half<typename _CG_STL_NAMESPACE::remove_cv<Ty>::type>;
|
| 66 |
+
|
| 67 |
+
// Non-STL utility templates

// Ty with any reference removed first and const/volatile qualifiers removed afterwards.
template <class Ty>
using remove_qual =
    typename _CG_STL_NAMESPACE::remove_cv<
        typename _CG_STL_NAMESPACE::remove_reference<Ty>::type
    >::type;
|
| 70 |
+
|
| 71 |
+
// True when both types are identical once references and cv-qualifiers are stripped.
template <class TyLhs, class TyRhs>
using is_op_type_same = _CG_STL_NAMESPACE::is_same<remove_qual<TyLhs>, remove_qual<TyRhs>>;
|
| 74 |
+
#endif
|
| 75 |
+
|
| 76 |
+
// Flattens a 3D index into a single linear offset, with x varying fastest:
//   z * (ny * nx) + y * nx + x
// Each index component is converted to TyTrunc before multiplying, so TyTrunc
// determines the width of the arithmetic. nIndex.z is not used.
template <typename TyTrunc>
_CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) {
    return (static_cast<TyTrunc>(index.z) * nIndex.y * nIndex.x) +
           (static_cast<TyTrunc>(index.y) * nIndex.x) +
           static_cast<TyTrunc>(index.x);
}
|
| 82 |
+
|
| 83 |
+
// Helpers that describe the calling thread's block in terms of the CUDA
// built-in variables (threadIdx, blockIdx, blockDim).
namespace cta {

    // Block barrier: issues __barrier_sync on barrier 0.
    // NOTE(review): __barrier_sync is declared outside this view (presumably sync.h) - confirm its exact semantics there.
    _CG_STATIC_QUALIFIER void sync()
    {
        __barrier_sync(0);
    }

    // Number of threads in the block: product of the three blockDim components.
    _CG_STATIC_QUALIFIER unsigned int num_threads()
    {
        return static_cast<unsigned int>(blockDim.x * blockDim.y * blockDim.z);
    }

    // Linear rank of the calling thread inside the block (x varies fastest).
    _CG_STATIC_QUALIFIER unsigned int thread_rank()
    {
        return vec3_to_linear<unsigned int>(threadIdx, blockDim);
    }

    // 3D index of the block, copied from blockIdx.
    _CG_STATIC_QUALIFIER dim3 group_index()
    {
        return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
    }

    // 3D index of the calling thread inside the block, copied from threadIdx.
    _CG_STATIC_QUALIFIER dim3 thread_index()
    {
        return dim3(threadIdx.x, threadIdx.y, threadIdx.z);
    }

    // 3D extent of the block in threads, copied from blockDim.
    _CG_STATIC_QUALIFIER dim3 dim_threads()
    {
        return dim3(blockDim.x, blockDim.y, blockDim.z);
    }

    // Legacy aliases
    // size() forwards to num_threads().
    _CG_STATIC_QUALIFIER unsigned int size()
    {
        return num_threads();
    }

    // block_dim() forwards to dim_threads().
    _CG_STATIC_QUALIFIER dim3 block_dim()
    {
        return dim_threads();
    }

} // cta
|
| 127 |
+
|
| 128 |
+
// Accessor giving the details:: helpers a way to reach the internals of
// coalesced groups and tiles through static member functions.
class _coalesced_group_data_access {
public:
    // Retrieve mask of coalesced groups and tiles
    template <class TyGroup>
    _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) {
        return group.get_mask();
    }

    // Build a group of type TyGroup directly from a mask.
    template <class TyGroup>
    _CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) {
        return TyGroup(mask);
    }

    // Overwrite the meta group rank and meta group size stored in the group's
    // coalesced data.
    template <class TyGroup>
    _CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) {
        group._data.coalesced.metaGroupRank = mgRank;
        group._data.coalesced.metaGroupSize = mgSize;
    }
};
|
| 147 |
+
|
| 148 |
+
namespace tile {
|
| 149 |
+
// Compile-time constants describing one tile size.  The four template
// arguments are simply re-exported as static members.
template <unsigned int TileCount,
          unsigned int TileMask,
          unsigned int LaneMask,
          unsigned int ShiftCount>
struct _tile_helpers {
    _CG_STATIC_CONST_DECL unsigned int tileCount  = TileCount;
    _CG_STATIC_CONST_DECL unsigned int tileMask   = TileMask;
    _CG_STATIC_CONST_DECL unsigned int laneMask   = LaneMask;
    _CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount;
};

// Per-size table, keyed by tile size N.  For every entry below:
//   tileCount  == 32 / N
//   tileMask   == the low N bits set
//   laneMask   == N - 1
//   shiftCount == log2(N)
// Only the listed power-of-two sizes are specialized.
template <unsigned int> struct tile_helpers;
template <> struct tile_helpers<32> : public _tile_helpers< 1, 0xFFFFFFFF, 0x1F, 5> {};
template <> struct tile_helpers<16> : public _tile_helpers< 2, 0x0000FFFF, 0x0F, 4> {};
template <> struct tile_helpers<8>  : public _tile_helpers< 4, 0x000000FF, 0x07, 3> {};
template <> struct tile_helpers<4>  : public _tile_helpers< 8, 0x0000000F, 0x03, 2> {};
template <> struct tile_helpers<2>  : public _tile_helpers<16, 0x00000003, 0x01, 1> {};
template <> struct tile_helpers<1>  : public _tile_helpers<32, 0x00000001, 0x00, 0> {};
|
| 164 |
+
|
| 165 |
+
#ifdef _CG_CPP11_FEATURES
|
| 166 |
+
namespace shfl {
    /***********************************************************************************
     * Recursively Sliced Shuffle
     *  Purpose:
     *      Represents `count` bytes as a chain of int members so that a shuffle
     *      function taking and returning int can be applied to every piece
     *  Expectations:
     *      This object *should not* be used from a reinterpret_cast pointer unless
     *      some alignment guarantees can be met. Use a memcpy to guarantee that loads
     *      from the integral types stored within are aligned and correct.
     **********************************************************************************/
    template <unsigned int count, bool intSized = (count <= sizeof(int))>
    struct recursive_sliced_shuffle_helper;

    // Terminal case: the remaining bytes fit in a single int.
    template <unsigned int count>
    struct recursive_sliced_shuffle_helper<count, true> {
        int val;

        // Replace the stored slice with the result of the shuffle functor.
        template <typename TyFn>
        _CG_QUALIFIER void invoke_shuffle(const TyFn &shuffleFn)
        {
            val = shuffleFn(val);
        }
    };

    // Recursive case: one int slice followed by a helper for the remaining
    // (count - sizeof(int)) bytes.
    template <unsigned int count>
    struct recursive_sliced_shuffle_helper<count, false> {
        int val;
        recursive_sliced_shuffle_helper<count - sizeof(int)> next;

        // Shuffle this slice first, then hand the functor to the rest of the chain.
        template <typename TyFn>
        _CG_QUALIFIER void invoke_shuffle(const TyFn &shuffleFn)
        {
            val = shuffleFn(val);
            next.invoke_shuffle(shuffleFn);
        }
    };
}
|
| 202 |
+
|
| 203 |
+
// Shuffle back-end whose entry points all funnel into _shfl_internal, which
// carries a static_assert on sizeof(TyElem) <= 32.  The functors built here
// are placeholders that return 0 and _shfl_internal returns a
// value-initialized element; no data is exchanged between threads.
struct _memory_shuffle {
    // Emits the size diagnostic when instantiated with a type larger than 32
    // bytes; otherwise yields a value-initialized TyElem.
    template <typename TyElem, typename TyShflFn>
    _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn)
    {
        static_assert(sizeof(TyElem) <= 32, "Cooperative groups collectives are limited to types smaller than 32B");
        return TyElem{};
    }

    // Same signature as the other back-ends' shfl(); forwards to _shfl_internal.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads)
    {
        auto placeholder = [=](int val) -> int {
            return 0;
        };
        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), placeholder);
    }

    // Same signature as the other back-ends' shfl_down(); forwards to _shfl_internal.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads)
    {
        auto placeholder = [=](int val) -> int {
            return 0;
        };
        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), placeholder);
    }

    // Same signature as the other back-ends' shfl_up(); forwards to _shfl_internal.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads)
    {
        auto placeholder = [=](int val) -> int {
            return 0;
        };
        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), placeholder);
    }

    // Same signature as the other back-ends' shfl_xor(); forwards to _shfl_internal.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads)
    {
        auto placeholder = [=](int val) -> int {
            return 0;
        };
        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), placeholder);
    }
};
|
| 246 |
+
|
| 247 |
+
/***********************************************************************************
|
| 248 |
+
* Intrinsic Device Function Shuffle
|
| 249 |
+
* Purpose:
|
| 250 |
+
* Uses a shuffle helper that has characteristics best suited for moving
|
| 251 |
+
* elements between threads
|
| 252 |
+
* Expectations:
|
| 253 |
+
* Object given will be forced into an l-value type so that it can be used
|
| 254 |
+
* with a helper structure that reinterprets the data into intrinsic compatible
|
| 255 |
+
* types
|
| 256 |
+
* Notes:
|
| 257 |
+
* !! TyRet is required so that objects are returned by value and not as
|
| 258 |
+
* dangling references depending on the value category of the passed object
|
| 259 |
+
**********************************************************************************/
|
| 260 |
+
// Shuffle back-end for trivially copyable types that the *_sync intrinsics do
// not accept directly: the object is copied into a chain of int slices, each
// slice is shuffled with the int intrinsic, and the bytes are copied back.
struct _intrinsic_compat_shuffle {
    template <unsigned int count>
    using shfl_helper = shfl::recursive_sliced_shuffle_helper<count>;

    // Copy `elem` into the slice helper, apply `fn` to every int slice, then
    // copy the shuffled bytes back and return the element by value.
    template <typename TyElem, typename TyShflFn>
    _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn)
    {
        static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle");

        shfl_helper<sizeof(TyElem)> slices;
        memcpy(&slices, &elem, sizeof(TyElem));   // memcpy instead of a pointer cast, see helper notes
        slices.invoke_shuffle(fn);
        memcpy(&elem, &slices, sizeof(TyElem));

        return elem;
    }

    // Slice-wise __shfl_sync(gMask, slice, srcRank, threads).
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads)
    {
        auto exchange = [=](int val) -> int {
            return __shfl_sync(gMask, val, srcRank, threads);
        };
        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), exchange);
    }

    // Slice-wise __shfl_down_sync(gMask, slice, delta, threads).
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads)
    {
        auto exchange = [=](int val) -> int {
            return __shfl_down_sync(gMask, val, delta, threads);
        };
        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), exchange);
    }

    // Slice-wise __shfl_up_sync(gMask, slice, delta, threads).
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads)
    {
        auto exchange = [=](int val) -> int {
            return __shfl_up_sync(gMask, val, delta, threads);
        };
        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), exchange);
    }

    // Slice-wise __shfl_xor_sync(gMask, slice, lMask, threads).
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads)
    {
        auto exchange = [=](int val) -> int {
            return __shfl_xor_sync(gMask, val, lMask, threads);
        };
        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), exchange);
    }
};
|
| 310 |
+
|
| 311 |
+
// Shuffle back-end that hands the element straight to the *_sync warp
// intrinsics and casts the result back to TyElem.
struct _native_shuffle {
    // __shfl_sync with the given mask, source rank and width.
    template <typename TyElem>
    _CG_STATIC_QUALIFIER TyElem shfl(
            TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads)
    {
        const TyElem received = static_cast<TyElem>(__shfl_sync(gMask, elem, srcRank, threads));
        return received;
    }

    // __shfl_down_sync with the given mask, delta and width.
    template <typename TyElem>
    _CG_STATIC_QUALIFIER TyElem shfl_down(
            TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads)
    {
        const TyElem received = static_cast<TyElem>(__shfl_down_sync(gMask, elem, delta, threads));
        return received;
    }

    // __shfl_up_sync with the given mask, delta and width.
    template <typename TyElem>
    _CG_STATIC_QUALIFIER TyElem shfl_up(
            TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads)
    {
        const TyElem received = static_cast<TyElem>(__shfl_up_sync(gMask, elem, delta, threads));
        return received;
    }

    // __shfl_xor_sync with the given mask, lane mask and width.
    template <typename TyElem>
    _CG_STATIC_QUALIFIER TyElem shfl_xor(
            TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads)
    {
        const TyElem received = static_cast<TyElem>(__shfl_xor_sync(gMask, elem, lMask, threads));
        return received;
    }
};
|
| 336 |
+
|
| 337 |
+
// Almost all arithmetic types are supported by native shuffle
// Vector types are the exception
//
// True when the qualifier-stripped element type is integral or is accepted by
// details::is_float_or_half.
template <typename TyElem>
using use_native_shuffle = _CG_STL_NAMESPACE::integral_constant<
    bool,
    _CG_STL_NAMESPACE::is_integral<remove_qual<TyElem>>::value ||
    details::is_float_or_half<remove_qual<TyElem>>::value
>;

// Elements strictly larger than this many bytes select the _memory_shuffle back-end.
constexpr unsigned long long _MemoryShuffleCutoff = 32;

// Back-end selector.  Primary template is declared only; the three
// specializations below cover:
//   native type, within cutoff      -> _native_shuffle
//   non-native type, within cutoff  -> _intrinsic_compat_shuffle
//   non-native type, above cutoff   -> _memory_shuffle
template <typename TyElem,
          bool IsNative = use_native_shuffle<TyElem>::value,
          bool InMem    = (sizeof(TyElem) > _MemoryShuffleCutoff)>
struct shuffle_dispatch;

template <typename TyElem>
struct shuffle_dispatch<TyElem, true, false> : public _native_shuffle {};

template <typename TyElem>
struct shuffle_dispatch<TyElem, false, false> : public _intrinsic_compat_shuffle {};

template <typename TyElem>
struct shuffle_dispatch<TyElem, false, true> : public _memory_shuffle {};
|
| 363 |
+
|
| 364 |
+
#endif //_CG_CPP11_FEATURES
|
| 365 |
+
};
|
| 366 |
+
|
| 367 |
+
namespace multi_grid {
    // Forward declaration only; no members are defined at this point.
    struct multi_grid_functions;
};
|
| 370 |
+
|
| 371 |
+
// Grid scope queries and barrier helpers, expressed in terms of the built-in
// gridDim / blockIdx / blockDim / threadIdx variables and the cta helpers.
namespace grid {
    // Arrive at the grid barrier stored at `bar`; returns the token that must
    // later be passed to barrier_wait().
    _CG_STATIC_QUALIFIER unsigned int barrier_arrive(unsigned int *bar) {
        return details::sync_grids_arrive(bar);
    }

    // Wait on the grid barrier at `bar` using a token from barrier_arrive().
    _CG_STATIC_QUALIFIER void barrier_wait(unsigned int token, unsigned int *bar) {
        details::sync_grids_wait(token, bar);
    }

    // Full grid barrier: arrive, then wait with the returned token.
    _CG_STATIC_QUALIFIER void sync(unsigned int *bar) {
        unsigned int token = details::sync_grids_arrive(bar);
        details::sync_grids_wait(token, bar);
    }

    // Total number of blocks in the grid.
    _CG_STATIC_QUALIFIER unsigned long long num_blocks()
    {
        // grid.y * grid.z -> [max(65535) * max(65535)] fits within 4b, promote after multiplication
        // grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)] exceeds 4b, promote before multiplication
        return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z);
    }

    // Total number of threads in the grid (blocks * threads per block).
    _CG_STATIC_QUALIFIER unsigned long long num_threads()
    {
        return num_blocks() * cta::num_threads();
    }

    // Linear rank of the calling block within the grid.
    _CG_STATIC_QUALIFIER unsigned long long block_rank()
    {
        return vec3_to_linear<unsigned long long>(blockIdx, gridDim);
    }

    // Linear rank of the calling thread within the grid.
    _CG_STATIC_QUALIFIER unsigned long long thread_rank()
    {
        return block_rank() * cta::num_threads() + cta::thread_rank();
    }

    // 3D extent of the grid in blocks (copied from gridDim).
    _CG_STATIC_QUALIFIER dim3 dim_blocks()
    {
        return dim3(gridDim.x, gridDim.y, gridDim.z);
    }

    // 3D index of the calling block (copied from blockIdx).
    _CG_STATIC_QUALIFIER dim3 block_index()
    {
        return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
    }

    // 3D extent of the grid in threads (per-axis gridDim * blockDim).
    _CG_STATIC_QUALIFIER dim3 dim_threads()
    {
        return dim3(gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z);
    }

    // 3D index of the calling thread within the grid
    // (per-axis blockIdx * blockDim + threadIdx).
    _CG_STATIC_QUALIFIER dim3 thread_index()
    {
        return dim3(blockIdx.x * blockDim.x + threadIdx.x,
                    blockIdx.y * blockDim.y + threadIdx.y,
                    blockIdx.z * blockDim.z + threadIdx.z);
    }

#if defined(_CG_HAS_CLUSTER_GROUP)
    // 3D extent of the grid measured in clusters.
    _CG_STATIC_QUALIFIER dim3 dim_clusters() {
        return __clusterGridDimInClusters();
    }

    // Total number of clusters in the grid.
    _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
        const dim3 dimClusters = dim_clusters();
        // Widen before multiplying by x, mirroring num_blocks(): the product of
        // three unsigned int extents would otherwise be computed (and could
        // wrap) in 32 bits before being converted to the 64-bit return type.
        // NOTE(review): like num_blocks(), this assumes y * z fits in 32 bits.
        return (unsigned long long)dimClusters.x * (dimClusters.y * dimClusters.z);
    }

    // 3D index of the calling thread's cluster.
    _CG_STATIC_QUALIFIER dim3 cluster_index() {
        return __clusterIdx();
    }

    // Linear rank of the calling thread's cluster within the grid.
    _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
        return vec3_to_linear<unsigned long long>(cluster_index(), dim_clusters());
    }
#endif

    // Legacy aliases

    // Same value as num_threads().
    _CG_STATIC_QUALIFIER unsigned long long size()
    {
        return num_threads();
    }

    // Same value as dim_blocks().
    _CG_STATIC_QUALIFIER dim3 grid_dim()
    {
        return dim_blocks();
    }
};
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP)
|
| 462 |
+
|
| 463 |
+
// Multi-grid scope helpers.  Each function forwards to a cudaCG* entry point
// when compiled with __CUDACC_RDC__ or __CUDACC_EWP__; otherwise it compiles
// to a stub that returns 0 (or does nothing).
namespace multi_grid {
    // Handle identifying the multi-grid group, or 0 in the stub build.
    _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
    {
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
        //this function is defined in device runtime library
        //which requires separate compilation mode (__CUDACC_RDC__)
        //or extended whole program mode (__CUDACC_EWP__)
        return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
#else /* !(__CUDACC_RDC__ || __CUDACC_EWP__) */
        return 0;
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
    }

    // Synchronize the group identified by `handle`.  The status returned by
    // cudaCGSynchronize is intentionally discarded (there is no way to report
    // it through the void return type).
    _CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
    {
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
        //this function is defined in device runtime library
        //which requires separate compilation mode (__CUDACC_RDC__)
        //or extended whole program mode (__CUDACC_EWP__)
        (void)cudaCGSynchronize(handle, 0);
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
    }

    // Thread count reported by cudaCGGetSize for `handle`; 0 in the stub build.
    _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
    {
        unsigned int numThreads = 0;
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
        //this function is defined in device runtime library
        //which requires separate compilation mode (__CUDACC_RDC__)
        //or extended whole program mode (__CUDACC_EWP__)
        cudaCGGetSize(&numThreads, NULL, handle);
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
        return numThreads;
    }

    // Thread rank reported by cudaCGGetRank for `handle`; 0 in the stub build.
    _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
    {
        unsigned int threadRank = 0;
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
        //this function is defined in device runtime library
        //which requires separate compilation mode (__CUDACC_RDC__)
        //or extended whole program mode (__CUDACC_EWP__)
        cudaCGGetRank(&threadRank, NULL, handle);
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
        return threadRank;
    }

    // Grid rank reported by cudaCGGetRank for `handle`; 0 in the stub build.
    _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle)
    {
        unsigned int gridRank = 0;
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
        //this function is defined in device runtime library
        //which requires separate compilation mode (__CUDACC_RDC__)
        //or extended whole program mode (__CUDACC_EWP__)
        cudaCGGetRank(NULL, &gridRank, handle);
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
        return gridRank;
    }

    // Grid count reported by cudaCGGetSize for `handle`; 0 in the stub build.
    _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle)
    {
        unsigned int numGrids = 0;
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
        //this function is defined in device runtime library
        //which requires separate compilation mode (__CUDACC_RDC__)
        //or extended whole program mode (__CUDACC_EWP__)
        cudaCGGetSize(NULL, &numGrids, handle);
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
        return numGrids;
    }

# ifdef _CG_CPP11_FEATURES
    // Table of function pointers, one per multi_grid helper above.
    struct multi_grid_functions {
        decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle;
        decltype(multi_grid::sync) *sync;
        decltype(multi_grid::size) *size;
        decltype(multi_grid::thread_rank) *thread_rank;
        decltype(multi_grid::grid_rank) *grid_rank;
        decltype(multi_grid::num_grids) *num_grids;
    };

    // Returns the address of a function-local __constant__ table populated
    // with the addresses of the helpers above.  Declared as a template
    // (defaulted parameter) rather than a plain function.
    template <typename = void>
    _CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() {
        __constant__ static const multi_grid_functions mgf {
            &multi_grid::get_intrinsic_handle,
            &multi_grid::sync,
            &multi_grid::size,
            &multi_grid::thread_rank,
            &multi_grid::grid_rank,
            &multi_grid::num_grids
        };

        return &mgf;
    }
# endif
};
|
| 559 |
+
#endif
|
| 560 |
+
|
| 561 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 562 |
+
// Thread-block-cluster scope helpers.  Most are thin wrappers over the
// __cluster* intrinsics; thread-level quantities are derived by combining the
// cluster-relative block values with the cta helpers and blockDim/threadIdx.
namespace cluster {

    // Forwards __clusterDimIsSpecified().
    _CG_STATIC_QUALIFIER bool isReal()
    {
        return __clusterDimIsSpecified();
    }

    // Arrive half of the cluster barrier.
    _CG_STATIC_QUALIFIER void barrier_arrive()
    {
        __cluster_barrier_arrive();
    }

    // Wait half of the cluster barrier.
    _CG_STATIC_QUALIFIER void barrier_wait()
    {
        __cluster_barrier_wait();
    }

    // Full cluster barrier: arrive immediately followed by wait.
    _CG_STATIC_QUALIFIER void sync()
    {
        __cluster_barrier_arrive();
        __cluster_barrier_wait();
    }

    // Forwards __cluster_query_shared_rank(addr).
    _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
    {
        return __cluster_query_shared_rank(addr);
    }

    // Forwards __cluster_map_shared_rank(addr, rank) and casts the result back to T*.
    template <typename T>
    _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
    {
        return static_cast<T*>(__cluster_map_shared_rank(addr, rank));
    }

    // 3D index of the calling block relative to its cluster.
    _CG_STATIC_QUALIFIER dim3 block_index()
    {
        return __clusterRelativeBlockIdx();
    }

    // Linear rank of the calling block relative to its cluster.
    _CG_STATIC_QUALIFIER unsigned int block_rank()
    {
        return __clusterRelativeBlockRank();
    }

    // 3D index of the calling thread within the cluster
    // (per-axis cluster-relative block index * blockDim + threadIdx).
    _CG_STATIC_QUALIFIER dim3 thread_index()
    {
        const dim3 blk = block_index();
        const unsigned int tx = blk.x * blockDim.x + threadIdx.x;
        const unsigned int ty = blk.y * blockDim.y + threadIdx.y;
        const unsigned int tz = blk.z * blockDim.z + threadIdx.z;
        return dim3(tx, ty, tz);
    }

    // Linear rank of the calling thread within the cluster.
    _CG_STATIC_QUALIFIER unsigned int thread_rank()
    {
        const unsigned int precedingThreads = block_rank() * cta::num_threads();
        return precedingThreads + cta::thread_rank();
    }

    // 3D extent of the cluster in blocks.
    _CG_STATIC_QUALIFIER dim3 dim_blocks()
    {
        return __clusterDim();
    }

    // Number of blocks in the cluster.
    _CG_STATIC_QUALIFIER unsigned int num_blocks()
    {
        return __clusterSizeInBlocks();
    }

    // 3D extent of the cluster in threads (per-axis cluster extent * blockDim).
    _CG_STATIC_QUALIFIER dim3 dim_threads()
    {
        const dim3 extentInBlocks = dim_blocks();
        return dim3(extentInBlocks.x * blockDim.x,
                    extentInBlocks.y * blockDim.y,
                    extentInBlocks.z * blockDim.z);
    }

    // Number of threads in the cluster (blocks * threads per block).
    _CG_STATIC_QUALIFIER unsigned int num_threads()
    {
        return num_blocks() * cta::num_threads();
    }

};
|
| 644 |
+
#endif
|
| 645 |
+
|
| 646 |
+
// Reads the %laneid special register via inline PTX and returns its value.
_CG_STATIC_QUALIFIER unsigned int laneid()
{
    unsigned int lane;
    asm ("mov.u32 %0, %%laneid;" : "=r"(lane));
    return lane;
}
|
| 652 |
+
|
| 653 |
+
// Reads the %lanemask_eq special register via inline PTX and returns its value.
_CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
{
    unsigned int maskEq;
    asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(maskEq));
    return maskEq;
}
|
| 659 |
+
|
| 660 |
+
// Reads the %lanemask_lt special register via inline PTX and returns its value.
_CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
{
    unsigned int maskLt;
    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(maskLt));
    return maskLt;
}
|
| 666 |
+
|
| 667 |
+
// Function wrapper around the _CG_ABORT() macro so it can be called like any
// other helper in this namespace.
_CG_STATIC_QUALIFIER void abort() {
    _CG_ABORT();
}
|
| 671 |
+
|
| 672 |
+
// Compile-time guard: instantiating this with a type that is neither integral
// nor accepted by details::is_float_or_half triggers a static_assert.
// Without _CG_CPP11_FEATURES the function body is empty.
template <typename Ty>
_CG_QUALIFIER void assert_if_not_arithmetic()
{
#ifdef _CG_CPP11_FEATURES
    static_assert(
        _CG_STL_NAMESPACE::is_integral<Ty>::value || details::is_float_or_half<Ty>::value,
        "Error: Ty is neither integer or float"
    );
#endif //_CG_CPP11_FEATURES
}
|
| 682 |
+
|
| 683 |
+
#ifdef _CG_CPP11_FEATURES
|
| 684 |
+
// Integer base-2 logarithm, rounded down, usable in constant expressions.
// Written as a single return statement so it stays a valid C++11 constexpr
// function.
//
// The base case is `x <= 1` rather than `x == 1`: with the equality test an
// argument of 0 never reaches the base case (0 / 2 == 0), so the recursion
// would not terminate.  Results for every x >= 1 are unchanged; 0 now yields 0.
_CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) {
    return x <= 1 ? 0 : 1 + log2(x / 2);
}
|
| 687 |
+
#endif //_CG_CPP11_FEATURES
|
| 688 |
+
|
| 689 |
+
}; // !Namespace internal
|
| 690 |
+
|
| 691 |
+
_CG_END_NAMESPACE
|
| 692 |
+
|
| 693 |
+
#endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/memcpy_async.h
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_MEMCPY_ASYNC
|
| 50 |
+
#define _COOPERATIVE_GROUPS_MEMCPY_ASYNC
|
| 51 |
+
|
| 52 |
+
#include "../cooperative_groups.h"
|
| 53 |
+
#include "details/info.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
# include "details/async.h"
|
| 57 |
+
#else
|
| 58 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 59 |
+
-std=c++11 compiler option.
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
#endif // _COOPERATIVE_GROUPS_MEMCPY_ASYNC
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/reduce.h
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_REDUCE_H
|
| 50 |
+
#define _COOPERATIVE_GROUPS_REDUCE_H
|
| 51 |
+
|
| 52 |
+
#include "../cooperative_groups.h"
|
| 53 |
+
#include "details/info.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
# include "details/reduce.h"
|
| 57 |
+
#else
|
| 58 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 59 |
+
-std=c++11 compiler option.
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
#endif //_COOPERATIVE_GROUPS_REDUCE_H
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/scan.h
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_SCAN_H
|
| 50 |
+
#define _COOPERATIVE_GROUPS_SCAN_H
|
| 51 |
+
|
| 52 |
+
#include "../cooperative_groups.h"
|
| 53 |
+
#include "details/info.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
# include "details/scan.h"
|
| 57 |
+
#else
|
| 58 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 59 |
+
-std=c++11 compiler option.
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
#endif //_COOPERATIVE_GROUPS_SCAN_H
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(CU_COMPLEX_H_)
|
| 51 |
+
#define CU_COMPLEX_H_
|
| 52 |
+
|
| 53 |
+
#if !defined(__CUDACC_RTC__)
|
| 54 |
+
#if defined(__GNUC__)
|
| 55 |
+
#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
|
| 56 |
+
#pragma GCC diagnostic ignored "-Wunused-function"
|
| 57 |
+
#endif
|
| 58 |
+
#endif
|
| 59 |
+
#endif
|
| 60 |
+
|
| 61 |
+
/* When trying to include C header file in C++ Code extern "C" is required
|
| 62 |
+
* But the Standard QNX headers already have ifdef extern in them when compiling C++ Code
|
| 63 |
+
* extern "C" cannot be nested
|
| 64 |
+
* Hence keep the header out of extern "C" block
|
| 65 |
+
*/
|
| 66 |
+
|
| 67 |
+
#if !defined(__CUDACC__)
|
| 68 |
+
#include <math.h> /* import fabsf, sqrt */
|
| 69 |
+
#endif /* !defined(__CUDACC__) */
|
| 70 |
+
|
| 71 |
+
#if defined(__cplusplus)
|
| 72 |
+
extern "C" {
|
| 73 |
+
#endif /* __cplusplus */
|
| 74 |
+
|
| 75 |
+
#include "vector_types.h"
|
| 76 |
+
|
| 77 |
+
typedef float2 cuFloatComplex;
|
| 78 |
+
|
| 79 |
+
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
|
| 80 |
+
{
|
| 81 |
+
return x.x;
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
|
| 85 |
+
{
|
| 86 |
+
return x.y;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
|
| 90 |
+
(float r, float i)
|
| 91 |
+
{
|
| 92 |
+
cuFloatComplex res;
|
| 93 |
+
res.x = r;
|
| 94 |
+
res.y = i;
|
| 95 |
+
return res;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
|
| 99 |
+
{
|
| 100 |
+
return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
|
| 101 |
+
}
|
| 102 |
+
__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
|
| 103 |
+
cuFloatComplex y)
|
| 104 |
+
{
|
| 105 |
+
return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),
|
| 106 |
+
cuCimagf(x) + cuCimagf(y));
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
|
| 110 |
+
cuFloatComplex y)
|
| 111 |
+
{
|
| 112 |
+
return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),
|
| 113 |
+
cuCimagf(x) - cuCimagf(y));
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
/* This implementation could suffer from intermediate overflow even though
|
| 117 |
+
* the final result would be in range. However, various implementations do
|
| 118 |
+
* not guard against this (presumably to avoid losing performance), so we
|
| 119 |
+
* don't do it either to stay competitive.
|
| 120 |
+
*/
|
| 121 |
+
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
|
| 122 |
+
cuFloatComplex y)
|
| 123 |
+
{
|
| 124 |
+
cuFloatComplex prod;
|
| 125 |
+
prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) -
|
| 126 |
+
(cuCimagf(x) * cuCimagf(y)),
|
| 127 |
+
(cuCrealf(x) * cuCimagf(y)) +
|
| 128 |
+
(cuCimagf(x) * cuCrealf(y)));
|
| 129 |
+
return prod;
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
/* This implementation guards against intermediate underflow and overflow
|
| 133 |
+
* by scaling. Such guarded implementations are usually the default for
|
| 134 |
+
* complex library implementations, with some also offering an unguarded,
|
| 135 |
+
* faster version.
|
| 136 |
+
*/
|
| 137 |
+
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
|
| 138 |
+
cuFloatComplex y)
|
| 139 |
+
{
|
| 140 |
+
cuFloatComplex quot;
|
| 141 |
+
float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
|
| 142 |
+
float oos = 1.0f / s;
|
| 143 |
+
float ars = cuCrealf(x) * oos;
|
| 144 |
+
float ais = cuCimagf(x) * oos;
|
| 145 |
+
float brs = cuCrealf(y) * oos;
|
| 146 |
+
float bis = cuCimagf(y) * oos;
|
| 147 |
+
s = (brs * brs) + (bis * bis);
|
| 148 |
+
oos = 1.0f / s;
|
| 149 |
+
quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
|
| 150 |
+
((ais * brs) - (ars * bis)) * oos);
|
| 151 |
+
return quot;
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
/*
|
| 155 |
+
* We would like to call hypotf(), but it's not available on all platforms.
|
| 156 |
+
* This discrete implementation guards against intermediate underflow and
|
| 157 |
+
* overflow by scaling. Otherwise we would lose half the exponent range.
|
| 158 |
+
* There are various ways of doing guarded computation. For now chose the
|
| 159 |
+
* simplest and fastest solution, however this may suffer from inaccuracies
|
| 160 |
+
* if sqrt and division are not IEEE compliant.
|
| 161 |
+
*/
|
| 162 |
+
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
|
| 163 |
+
{
|
| 164 |
+
float a = cuCrealf(x);
|
| 165 |
+
float b = cuCimagf(x);
|
| 166 |
+
float v, w, t;
|
| 167 |
+
a = fabsf(a);
|
| 168 |
+
b = fabsf(b);
|
| 169 |
+
if (a > b) {
|
| 170 |
+
v = a;
|
| 171 |
+
w = b;
|
| 172 |
+
} else {
|
| 173 |
+
v = b;
|
| 174 |
+
w = a;
|
| 175 |
+
}
|
| 176 |
+
t = w / v;
|
| 177 |
+
t = 1.0f + t * t;
|
| 178 |
+
t = v * sqrtf(t);
|
| 179 |
+
if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
|
| 180 |
+
t = v + w;
|
| 181 |
+
}
|
| 182 |
+
return t;
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
/* Double precision */
|
| 186 |
+
typedef double2 cuDoubleComplex;
|
| 187 |
+
|
| 188 |
+
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
|
| 189 |
+
{
|
| 190 |
+
return x.x;
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
|
| 194 |
+
{
|
| 195 |
+
return x.y;
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
|
| 199 |
+
(double r, double i)
|
| 200 |
+
{
|
| 201 |
+
cuDoubleComplex res;
|
| 202 |
+
res.x = r;
|
| 203 |
+
res.y = i;
|
| 204 |
+
return res;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
|
| 208 |
+
{
|
| 209 |
+
return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
|
| 213 |
+
cuDoubleComplex y)
|
| 214 |
+
{
|
| 215 |
+
return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),
|
| 216 |
+
cuCimag(x) + cuCimag(y));
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
|
| 220 |
+
cuDoubleComplex y)
|
| 221 |
+
{
|
| 222 |
+
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),
|
| 223 |
+
cuCimag(x) - cuCimag(y));
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
/* This implementation could suffer from intermediate overflow even though
|
| 227 |
+
* the final result would be in range. However, various implementations do
|
| 228 |
+
* not guard against this (presumably to avoid losing performance), so we
|
| 229 |
+
* don't do it either to stay competitive.
|
| 230 |
+
*/
|
| 231 |
+
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
|
| 232 |
+
cuDoubleComplex y)
|
| 233 |
+
{
|
| 234 |
+
cuDoubleComplex prod;
|
| 235 |
+
prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -
|
| 236 |
+
(cuCimag(x) * cuCimag(y)),
|
| 237 |
+
(cuCreal(x) * cuCimag(y)) +
|
| 238 |
+
(cuCimag(x) * cuCreal(y)));
|
| 239 |
+
return prod;
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
/* This implementation guards against intermediate underflow and overflow
|
| 243 |
+
* by scaling. Such guarded implementations are usually the default for
|
| 244 |
+
* complex library implementations, with some also offering an unguarded,
|
| 245 |
+
* faster version.
|
| 246 |
+
*/
|
| 247 |
+
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
|
| 248 |
+
cuDoubleComplex y)
|
| 249 |
+
{
|
| 250 |
+
cuDoubleComplex quot;
|
| 251 |
+
double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
|
| 252 |
+
double oos = 1.0 / s;
|
| 253 |
+
double ars = cuCreal(x) * oos;
|
| 254 |
+
double ais = cuCimag(x) * oos;
|
| 255 |
+
double brs = cuCreal(y) * oos;
|
| 256 |
+
double bis = cuCimag(y) * oos;
|
| 257 |
+
s = (brs * brs) + (bis * bis);
|
| 258 |
+
oos = 1.0 / s;
|
| 259 |
+
quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
|
| 260 |
+
((ais * brs) - (ars * bis)) * oos);
|
| 261 |
+
return quot;
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
/* This implementation guards against intermediate underflow and overflow
|
| 265 |
+
* by scaling. Otherwise we would lose half the exponent range. There are
|
| 266 |
+
* various ways of doing guarded computation. For now chose the simplest
|
| 267 |
+
* and fastest solution, however this may suffer from inaccuracies if sqrt
|
| 268 |
+
* and division are not IEEE compliant.
|
| 269 |
+
*/
|
| 270 |
+
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
|
| 271 |
+
{
|
| 272 |
+
double a = cuCreal(x);
|
| 273 |
+
double b = cuCimag(x);
|
| 274 |
+
double v, w, t;
|
| 275 |
+
a = fabs(a);
|
| 276 |
+
b = fabs(b);
|
| 277 |
+
if (a > b) {
|
| 278 |
+
v = a;
|
| 279 |
+
w = b;
|
| 280 |
+
} else {
|
| 281 |
+
v = b;
|
| 282 |
+
w = a;
|
| 283 |
+
}
|
| 284 |
+
t = w / v;
|
| 285 |
+
t = 1.0 + t * t;
|
| 286 |
+
t = v * sqrt(t);
|
| 287 |
+
if ((v == 0.0) ||
|
| 288 |
+
(v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
|
| 289 |
+
t = v + w;
|
| 290 |
+
}
|
| 291 |
+
return t;
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
#if defined(__cplusplus)
|
| 295 |
+
}
|
| 296 |
+
#endif /* __cplusplus */
|
| 297 |
+
|
| 298 |
+
/* aliases */
|
| 299 |
+
typedef cuFloatComplex cuComplex;
|
| 300 |
+
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
|
| 301 |
+
float y)
|
| 302 |
+
{
|
| 303 |
+
return make_cuFloatComplex (x, y);
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
/* float-to-double promotion */
|
| 307 |
+
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
|
| 308 |
+
(cuFloatComplex c)
|
| 309 |
+
{
|
| 310 |
+
return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
|
| 314 |
+
(cuDoubleComplex c)
|
| 315 |
+
{
|
| 316 |
+
return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
|
| 317 |
+
}
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
|
| 321 |
+
{
|
| 322 |
+
float real_res;
|
| 323 |
+
float imag_res;
|
| 324 |
+
|
| 325 |
+
real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
|
| 326 |
+
imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
|
| 327 |
+
|
| 328 |
+
real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
|
| 329 |
+
imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
|
| 330 |
+
|
| 331 |
+
return make_cuComplex(real_res, imag_res);
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
|
| 335 |
+
{
|
| 336 |
+
double real_res;
|
| 337 |
+
double imag_res;
|
| 338 |
+
|
| 339 |
+
real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
|
| 340 |
+
imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
|
| 341 |
+
|
| 342 |
+
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
|
| 343 |
+
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
|
| 344 |
+
|
| 345 |
+
return make_cuDoubleComplex(real_res, imag_res);
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
#endif /* !defined(CU_COMPLEX_H_) */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGL.h
ADDED
|
@@ -0,0 +1,659 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAEGL_H
|
| 51 |
+
#define CUDAEGL_H
|
| 52 |
+
|
| 53 |
+
#include "cuda.h"
|
| 54 |
+
#include "EGL/egl.h"
|
| 55 |
+
#include "EGL/eglext.h"
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
#ifdef CUDA_FORCE_API_VERSION
|
| 59 |
+
#error "CUDA_FORCE_API_VERSION is no longer supported."
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
#ifdef __cplusplus
|
| 63 |
+
extern "C" {
|
| 64 |
+
#endif
|
| 65 |
+
|
| 66 |
+
/**
|
| 67 |
+
* \addtogroup CUDA_TYPES
|
| 68 |
+
* @{
|
| 69 |
+
*/
|
| 70 |
+
|
| 71 |
+
/**
|
| 72 |
+
* Maximum number of planes per frame
|
| 73 |
+
*/
|
| 74 |
+
#define MAX_PLANES 3
|
| 75 |
+
|
| 76 |
+
/**
|
| 77 |
+
* CUDA EglFrame type - array or pointer
|
| 78 |
+
*/
|
| 79 |
+
typedef enum CUeglFrameType_enum {
|
| 80 |
+
CU_EGL_FRAME_TYPE_ARRAY = 0, /**< Frame type CUDA array */
|
| 81 |
+
CU_EGL_FRAME_TYPE_PITCH = 1, /**< Frame type pointer */
|
| 82 |
+
} CUeglFrameType;
|
| 83 |
+
|
| 84 |
+
/**
|
| 85 |
+
* Indicates that timeout for ::cuEGLStreamConsumerAcquireFrame is infinite.
|
| 86 |
+
*/
|
| 87 |
+
#define CUDA_EGL_INFINITE_TIMEOUT 0xFFFFFFFF
|
| 88 |
+
|
| 89 |
+
/**
|
| 90 |
+
* Resource location flags - sysmem or vidmem
|
| 91 |
+
*
|
| 92 |
+
* For CUDA context on iGPU, since video and system memory are equivalent -
|
| 93 |
+
* these flags will not have an effect on the execution.
|
| 94 |
+
*
|
| 95 |
+
* For CUDA context on dGPU, applications can use the flag ::CUeglResourceLocationFlags
|
| 96 |
+
* to give a hint about the desired location.
|
| 97 |
+
*
|
| 98 |
+
* ::CU_EGL_RESOURCE_LOCATION_SYSMEM - the frame data is made resident on the system memory
|
| 99 |
+
* to be accessed by CUDA.
|
| 100 |
+
*
|
| 101 |
+
* ::CU_EGL_RESOURCE_LOCATION_VIDMEM - the frame data is made resident on the dedicated
|
| 102 |
+
* video memory to be accessed by CUDA.
|
| 103 |
+
*
|
| 104 |
+
* There may be an additional latency due to new allocation and data migration,
|
| 105 |
+
* if the frame is produced on a different memory.
|
| 106 |
+
|
| 107 |
+
*/
|
| 108 |
+
typedef enum CUeglResourceLocationFlags_enum {
|
| 109 |
+
CU_EGL_RESOURCE_LOCATION_SYSMEM = 0x00, /**< Resource location sysmem */
|
| 110 |
+
CU_EGL_RESOURCE_LOCATION_VIDMEM = 0x01 /**< Resource location vidmem */
|
| 111 |
+
} CUeglResourceLocationFlags;
|
| 112 |
+
|
| 113 |
+
/**
|
| 114 |
+
* CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
|
| 115 |
+
* Three channel formats are currently not supported for ::CU_EGL_FRAME_TYPE_ARRAY
|
| 116 |
+
*/
|
| 117 |
+
typedef enum CUeglColorFormat_enum {
|
| 118 |
+
CU_EGL_COLOR_FORMAT_YUV420_PLANAR = 0x00, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 119 |
+
CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR = 0x01, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
|
| 120 |
+
CU_EGL_COLOR_FORMAT_YUV422_PLANAR = 0x02, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 121 |
+
CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR = 0x03, /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
|
| 122 |
+
CU_EGL_COLOR_FORMAT_RGB = 0x04, /**< R/G/B three channels in one surface with BGR byte ordering. Only pitch linear format supported. */
|
| 123 |
+
CU_EGL_COLOR_FORMAT_BGR = 0x05, /**< R/G/B three channels in one surface with RGB byte ordering. Only pitch linear format supported. */
|
| 124 |
+
CU_EGL_COLOR_FORMAT_ARGB = 0x06, /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
|
| 125 |
+
CU_EGL_COLOR_FORMAT_RGBA = 0x07, /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
|
| 126 |
+
CU_EGL_COLOR_FORMAT_L = 0x08, /**< single luminance channel in one surface. */
|
| 127 |
+
CU_EGL_COLOR_FORMAT_R = 0x09, /**< single color channel in one surface. */
|
| 128 |
+
CU_EGL_COLOR_FORMAT_YUV444_PLANAR = 0x0A, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
|
| 129 |
+
CU_EGL_COLOR_FORMAT_YUV444_SEMIPLANAR = 0x0B, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
|
| 130 |
+
CU_EGL_COLOR_FORMAT_YUYV_422 = 0x0C, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
|
| 131 |
+
CU_EGL_COLOR_FORMAT_UYVY_422 = 0x0D, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
|
| 132 |
+
CU_EGL_COLOR_FORMAT_ABGR = 0x0E, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
|
| 133 |
+
CU_EGL_COLOR_FORMAT_BGRA = 0x0F, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
|
| 134 |
+
CU_EGL_COLOR_FORMAT_A = 0x10, /**< Alpha color format - one channel in one surface. */
|
| 135 |
+
CU_EGL_COLOR_FORMAT_RG = 0x11, /**< R/G color format - two channels in one surface with GR byte ordering */
|
| 136 |
+
CU_EGL_COLOR_FORMAT_AYUV = 0x12, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
|
| 137 |
+
CU_EGL_COLOR_FORMAT_YVU444_SEMIPLANAR = 0x13, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 138 |
+
CU_EGL_COLOR_FORMAT_YVU422_SEMIPLANAR = 0x14, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 139 |
+
CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR = 0x15, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 140 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR = 0x16, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 141 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR = 0x17, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 142 |
+
CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR = 0x18, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 143 |
+
CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR = 0x19, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 144 |
+
CU_EGL_COLOR_FORMAT_VYUY_ER = 0x1A, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
|
| 145 |
+
CU_EGL_COLOR_FORMAT_UYVY_ER = 0x1B, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
|
| 146 |
+
CU_EGL_COLOR_FORMAT_YUYV_ER = 0x1C, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
|
| 147 |
+
CU_EGL_COLOR_FORMAT_YVYU_ER = 0x1D, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
|
| 148 |
+
CU_EGL_COLOR_FORMAT_YUV_ER = 0x1E, /**< Extended Range Y, U, V three channels in one surface, interleaved as VUY. Only pitch linear format supported. */
|
| 149 |
+
CU_EGL_COLOR_FORMAT_YUVA_ER = 0x1F, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
|
| 150 |
+
CU_EGL_COLOR_FORMAT_AYUV_ER = 0x20, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
|
| 151 |
+
CU_EGL_COLOR_FORMAT_YUV444_PLANAR_ER = 0x21, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
|
| 152 |
+
CU_EGL_COLOR_FORMAT_YUV422_PLANAR_ER = 0x22, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 153 |
+
CU_EGL_COLOR_FORMAT_YUV420_PLANAR_ER = 0x23, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 154 |
+
CU_EGL_COLOR_FORMAT_YUV444_SEMIPLANAR_ER = 0x24, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 155 |
+
CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR_ER = 0x25, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 156 |
+
CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_ER = 0x26, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 157 |
+
CU_EGL_COLOR_FORMAT_YVU444_PLANAR_ER = 0x27, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
|
| 158 |
+
CU_EGL_COLOR_FORMAT_YVU422_PLANAR_ER = 0x28, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 159 |
+
CU_EGL_COLOR_FORMAT_YVU420_PLANAR_ER = 0x29, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 160 |
+
CU_EGL_COLOR_FORMAT_YVU444_SEMIPLANAR_ER = 0x2A, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 161 |
+
CU_EGL_COLOR_FORMAT_YVU422_SEMIPLANAR_ER = 0x2B, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 162 |
+
CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_ER = 0x2C, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 163 |
+
CU_EGL_COLOR_FORMAT_BAYER_RGGB = 0x2D, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
|
| 164 |
+
CU_EGL_COLOR_FORMAT_BAYER_BGGR = 0x2E, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
|
| 165 |
+
CU_EGL_COLOR_FORMAT_BAYER_GRBG = 0x2F, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
|
| 166 |
+
CU_EGL_COLOR_FORMAT_BAYER_GBRG = 0x30, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
|
| 167 |
+
CU_EGL_COLOR_FORMAT_BAYER10_RGGB = 0x31, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 168 |
+
CU_EGL_COLOR_FORMAT_BAYER10_BGGR = 0x32, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 169 |
+
CU_EGL_COLOR_FORMAT_BAYER10_GRBG = 0x33, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 170 |
+
CU_EGL_COLOR_FORMAT_BAYER10_GBRG = 0x34, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 171 |
+
CU_EGL_COLOR_FORMAT_BAYER12_RGGB = 0x35, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 172 |
+
CU_EGL_COLOR_FORMAT_BAYER12_BGGR = 0x36, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 173 |
+
CU_EGL_COLOR_FORMAT_BAYER12_GRBG = 0x37, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 174 |
+
CU_EGL_COLOR_FORMAT_BAYER12_GBRG = 0x38, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 175 |
+
CU_EGL_COLOR_FORMAT_BAYER14_RGGB = 0x39, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 176 |
+
CU_EGL_COLOR_FORMAT_BAYER14_BGGR = 0x3A, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 177 |
+
CU_EGL_COLOR_FORMAT_BAYER14_GRBG = 0x3B, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 178 |
+
CU_EGL_COLOR_FORMAT_BAYER14_GBRG = 0x3C, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 179 |
+
CU_EGL_COLOR_FORMAT_BAYER20_RGGB = 0x3D, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 180 |
+
CU_EGL_COLOR_FORMAT_BAYER20_BGGR = 0x3E, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 181 |
+
CU_EGL_COLOR_FORMAT_BAYER20_GRBG = 0x3F, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 182 |
+
CU_EGL_COLOR_FORMAT_BAYER20_GBRG = 0x40, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 183 |
+
CU_EGL_COLOR_FORMAT_YVU444_PLANAR = 0x41, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
|
| 184 |
+
CU_EGL_COLOR_FORMAT_YVU422_PLANAR = 0x42, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 185 |
+
CU_EGL_COLOR_FORMAT_YVU420_PLANAR = 0x43, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 186 |
+
CU_EGL_COLOR_FORMAT_BAYER_ISP_RGGB = 0x44, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
|
| 187 |
+
CU_EGL_COLOR_FORMAT_BAYER_ISP_BGGR = 0x45, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
|
| 188 |
+
CU_EGL_COLOR_FORMAT_BAYER_ISP_GRBG = 0x46, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
|
| 189 |
+
CU_EGL_COLOR_FORMAT_BAYER_ISP_GBRG = 0x47, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
|
| 190 |
+
CU_EGL_COLOR_FORMAT_BAYER_BCCR = 0x48, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
|
| 191 |
+
CU_EGL_COLOR_FORMAT_BAYER_RCCB = 0x49, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
|
| 192 |
+
CU_EGL_COLOR_FORMAT_BAYER_CRBC = 0x4A, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
|
| 193 |
+
CU_EGL_COLOR_FORMAT_BAYER_CBRC = 0x4B, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
|
| 194 |
+
CU_EGL_COLOR_FORMAT_BAYER10_CCCC = 0x4C, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 195 |
+
CU_EGL_COLOR_FORMAT_BAYER12_BCCR = 0x4D, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 196 |
+
CU_EGL_COLOR_FORMAT_BAYER12_RCCB = 0x4E, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 197 |
+
CU_EGL_COLOR_FORMAT_BAYER12_CRBC = 0x4F, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 198 |
+
CU_EGL_COLOR_FORMAT_BAYER12_CBRC = 0x50, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 199 |
+
CU_EGL_COLOR_FORMAT_BAYER12_CCCC = 0x51, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 200 |
+
CU_EGL_COLOR_FORMAT_Y = 0x52, /**< Color format for single Y plane. */
|
| 201 |
+
CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_2020 = 0x53, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 202 |
+
CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_2020 = 0x54, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 203 |
+
CU_EGL_COLOR_FORMAT_YUV420_PLANAR_2020 = 0x55, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height= 1/2 Y height. */
|
| 204 |
+
CU_EGL_COLOR_FORMAT_YVU420_PLANAR_2020 = 0x56, /**< Y, V, U each in a separate surface, U/V width = 1/2 Y width, U/V height
|
| 205 |
+
= 1/2 Y height. */
|
| 206 |
+
CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_709 = 0x57, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 207 |
+
CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_709 = 0x58, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 208 |
+
CU_EGL_COLOR_FORMAT_YUV420_PLANAR_709 = 0x59, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height
|
| 209 |
+
= 1/2 Y height. */
|
| 210 |
+
CU_EGL_COLOR_FORMAT_YVU420_PLANAR_709 = 0x5A, /**< Y, V, U each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 211 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709 = 0x5B, /**< Y10, V10U10 in two surfaces (VU as one surface), U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 212 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_2020 = 0x5C, /**< Y10, V10U10 in two surfaces (VU as one surface), U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 213 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR_2020 = 0x5D, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
|
| 214 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR = 0x5E, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
|
| 215 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR_709 = 0x5F, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
|
| 216 |
+
CU_EGL_COLOR_FORMAT_Y_ER = 0x60, /**< Extended Range Color format for single Y plane. */
|
| 217 |
+
CU_EGL_COLOR_FORMAT_Y_709_ER = 0x61, /**< Extended Range Color format for single Y plane. */
|
| 218 |
+
CU_EGL_COLOR_FORMAT_Y10_ER = 0x62, /**< Extended Range Color format for single Y10 plane. */
|
| 219 |
+
CU_EGL_COLOR_FORMAT_Y10_709_ER = 0x63, /**< Extended Range Color format for single Y10 plane. */
|
| 220 |
+
CU_EGL_COLOR_FORMAT_Y12_ER = 0x64, /**< Extended Range Color format for single Y12 plane. */
|
| 221 |
+
CU_EGL_COLOR_FORMAT_Y12_709_ER = 0x65, /**< Extended Range Color format for single Y12 plane. */
|
| 222 |
+
CU_EGL_COLOR_FORMAT_YUVA = 0x66, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
|
| 223 |
+
CU_EGL_COLOR_FORMAT_YUV = 0x67, /**< Y, U, V three channels in one surface, interleaved as VUY. Only pitch linear format supported. */
|
| 224 |
+
CU_EGL_COLOR_FORMAT_YVYU = 0x68, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
|
| 225 |
+
CU_EGL_COLOR_FORMAT_VYUY = 0x69, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
|
| 226 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_ER = 0x6A, /**< Extended Range Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 227 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709_ER = 0x6B, /**< Extended Range Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 228 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR_ER = 0x6C, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 229 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR_709_ER = 0x6D, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 230 |
+
CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR_ER = 0x6E, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 231 |
+
CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR_709_ER = 0x6F, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 232 |
+
CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR_ER = 0x70, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 233 |
+
CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR_709_ER = 0x71, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 234 |
+
CU_EGL_COLOR_FORMAT_MAX
|
| 235 |
+
} CUeglColorFormat;
|
| 236 |
+
|
| 237 |
+
/**
|
| 238 |
+
* CUDA EGLFrame structure Descriptor - structure defining one frame of EGL.
|
| 239 |
+
*
|
| 240 |
+
* Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
|
| 241 |
+
*/
|
| 242 |
+
typedef struct CUeglFrame_st {
|
| 243 |
+
union {
|
| 244 |
+
CUarray pArray[MAX_PLANES]; /**< Array of CUarray corresponding to each plane*/
|
| 245 |
+
void* pPitch[MAX_PLANES]; /**< Array of Pointers corresponding to each plane*/
|
| 246 |
+
} frame;
|
| 247 |
+
unsigned int width; /**< Width of first plane */
|
| 248 |
+
unsigned int height; /**< Height of first plane */
|
| 249 |
+
unsigned int depth; /**< Depth of first plane */
|
| 250 |
+
unsigned int pitch; /**< Pitch of first plane */
|
| 251 |
+
unsigned int planeCount; /**< Number of planes */
|
| 252 |
+
unsigned int numChannels; /**< Number of channels for the plane */
|
| 253 |
+
CUeglFrameType frameType; /**< Array or Pitch */
|
| 254 |
+
CUeglColorFormat eglColorFormat; /**< CUDA EGL Color Format*/
|
| 255 |
+
CUarray_format cuFormat; /**< CUDA Array Format*/
|
| 256 |
+
} CUeglFrame_v1;
|
| 257 |
+
typedef CUeglFrame_v1 CUeglFrame;
|
| 258 |
+
|
| 259 |
+
/**
|
| 260 |
+
* CUDA EGLStream Connection
|
| 261 |
+
*/
|
| 262 |
+
typedef struct CUeglStreamConnection_st* CUeglStreamConnection;
|
| 263 |
+
|
| 264 |
+
/** @} */ /* END CUDA_TYPES */
|
| 265 |
+
|
| 266 |
+
/**
|
| 267 |
+
* \file cudaEGL.h
|
| 268 |
+
* \brief Header file for the EGL interoperability functions of the
|
| 269 |
+
* low-level CUDA driver application programming interface.
|
| 270 |
+
*/
|
| 271 |
+
|
| 272 |
+
/**
|
| 273 |
+
* \defgroup CUDA_EGL EGL Interoperability
|
| 274 |
+
* \ingroup CUDA_DRIVER
|
| 275 |
+
*
|
| 276 |
+
* ___MANBRIEF___ EGL interoperability functions of the low-level CUDA
|
| 277 |
+
* driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
|
| 278 |
+
*
|
| 279 |
+
* This section describes the EGL interoperability functions of the
|
| 280 |
+
* low-level CUDA driver application programming interface.
|
| 281 |
+
*
|
| 282 |
+
* @{
|
| 283 |
+
*/
|
| 284 |
+
|
| 285 |
+
/**
|
| 286 |
+
* \brief Registers an EGL image
|
| 287 |
+
*
|
| 288 |
+
* Registers the EGLImageKHR specified by \p image for access by
|
| 289 |
+
* CUDA. A handle to the registered object is returned as \p pCudaResource.
|
| 290 |
+
* Additional Mapping/Unmapping is not required for the registered resource and
|
| 291 |
+
* ::cuGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
|
| 292 |
+
*
|
| 293 |
+
* The application will be responsible for synchronizing access to shared objects.
|
| 294 |
+
* The application must ensure that any pending operations which access the objects have completed
|
| 295 |
+
* before passing control to CUDA. This may be accomplished by issuing and waiting for
|
| 296 |
+
* glFinish command on all GLcontexts (for OpenGL and likewise for other APIs).
|
| 297 |
+
* The application will also be responsible for ensuring that any pending operation on the
|
| 298 |
+
* registered CUDA resource has completed prior to executing subsequent commands in other APIs
|
| 299 |
+
* accessing the same memory objects.
|
| 300 |
+
* This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
|
| 301 |
+
*
|
| 302 |
+
* The surface's intended usage is specified using \p flags, as follows:
|
| 303 |
+
*
|
| 304 |
+
* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
|
| 305 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 306 |
+
* read from and written to by CUDA. This is the default value.
|
| 307 |
+
* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
|
| 308 |
+
* will not write to this resource.
|
| 309 |
+
* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
|
| 310 |
+
* CUDA will not read from this resource and will write over the
|
| 311 |
+
* entire contents of the resource, so none of the data previously
|
| 312 |
+
* stored in the resource will be preserved.
|
| 313 |
+
*
|
| 314 |
+
* The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
|
| 315 |
+
* typedef void* EGLImageKHR
|
| 316 |
+
*
|
| 317 |
+
* \param pCudaResource - Pointer to the returned object handle
|
| 318 |
+
* \param image - An EGLImageKHR image which can be used to create target resource.
|
| 319 |
+
* \param flags - Map flags
|
| 320 |
+
*
|
| 321 |
+
* \return
|
| 322 |
+
* ::CUDA_SUCCESS,
|
| 323 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 324 |
+
* ::CUDA_ERROR_ALREADY_MAPPED,
|
| 325 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 326 |
+
*
|
| 327 |
+
* \sa ::cuGraphicsEGLRegisterImage, ::cuGraphicsUnregisterResource,
|
| 328 |
+
* ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
|
| 329 |
+
* ::cuGraphicsUnmapResources,
|
| 330 |
+
* ::cudaGraphicsEGLRegisterImage
|
| 331 |
+
*/
|
| 332 |
+
CUresult CUDAAPI cuGraphicsEGLRegisterImage(CUgraphicsResource *pCudaResource, EGLImageKHR image, unsigned int flags);
|
| 333 |
+
|
| 334 |
+
/**
|
| 335 |
+
* \brief Connect CUDA to EGLStream as a consumer.
|
| 336 |
+
*
|
| 337 |
+
* Connect CUDA as a consumer to EGLStreamKHR specified by \p stream.
|
| 338 |
+
*
|
| 339 |
+
* The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
|
| 340 |
+
* API to another.
|
| 341 |
+
*
|
| 342 |
+
* \param conn - Pointer to the returned connection handle
|
| 343 |
+
* \param stream - EGLStreamKHR handle
|
| 344 |
+
*
|
| 345 |
+
* \return
|
| 346 |
+
* ::CUDA_SUCCESS,
|
| 347 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 348 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 349 |
+
*
|
| 350 |
+
* \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
|
| 351 |
+
* ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
|
| 352 |
+
* ::cudaEGLStreamConsumerConnect
|
| 353 |
+
*/
|
| 354 |
+
CUresult CUDAAPI cuEGLStreamConsumerConnect(CUeglStreamConnection *conn, EGLStreamKHR stream);
|
| 355 |
+
|
| 356 |
+
/**
|
| 357 |
+
* \brief Connect CUDA to EGLStream as a consumer with given flags.
|
| 358 |
+
*
|
| 359 |
+
* Connect CUDA as a consumer to EGLStreamKHR specified by \p stream with specified \p flags defined by CUeglResourceLocationFlags.
|
| 360 |
+
*
|
| 361 |
+
* The flags specify whether the consumer wants to access frames from system memory or video memory.
|
| 362 |
+
* Default is ::CU_EGL_RESOURCE_LOCATION_VIDMEM.
|
| 363 |
+
*
|
| 364 |
+
* \param conn - Pointer to the returned connection handle
|
| 365 |
+
* \param stream - EGLStreamKHR handle
|
| 366 |
+
* \param flags - Flags denote intended location - system or video.
|
| 367 |
+
*
|
| 368 |
+
* \return
|
| 369 |
+
* ::CUDA_SUCCESS,
|
| 370 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 371 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 372 |
+
*
|
| 373 |
+
* \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
|
| 374 |
+
* ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
|
| 375 |
+
* ::cudaEGLStreamConsumerConnectWithFlags
|
| 376 |
+
*/
|
| 377 |
+
|
| 378 |
+
CUresult CUDAAPI cuEGLStreamConsumerConnectWithFlags(CUeglStreamConnection *conn, EGLStreamKHR stream, unsigned int flags);
|
| 379 |
+
|
| 380 |
+
/**
|
| 381 |
+
* \brief Disconnect CUDA as a consumer to EGLStream.
|
| 382 |
+
*
|
| 383 |
+
* Disconnect CUDA as a consumer to EGLStreamKHR.
|
| 384 |
+
*
|
| 385 |
+
* \param conn - Connection to disconnect.
|
| 386 |
+
*
|
| 387 |
+
* \return
|
| 388 |
+
* ::CUDA_SUCCESS,
|
| 389 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 390 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 391 |
+
*
|
| 392 |
+
* \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
|
| 393 |
+
* ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
|
| 394 |
+
* ::cudaEGLStreamConsumerDisconnect
|
| 395 |
+
*/
|
| 396 |
+
CUresult CUDAAPI cuEGLStreamConsumerDisconnect(CUeglStreamConnection *conn);
|
| 397 |
+
|
| 398 |
+
/**
|
| 399 |
+
* \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
|
| 400 |
+
*
|
| 401 |
+
* Acquire an image frame from EGLStreamKHR. This API can also acquire an old frame presented
|
| 402 |
+
* by the producer unless explicitly disabled by setting EGL_SUPPORT_REUSE_NV flag to EGL_FALSE
|
| 403 |
+
* during stream initialization. By default, EGLStream is created with this flag set to EGL_TRUE.
|
| 404 |
+
* ::cuGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
|
| 405 |
+
* ::CUeglFrame.
|
| 406 |
+
*
|
| 407 |
+
* \param conn - Connection on which to acquire
|
| 408 |
+
* \param pCudaResource - CUDA resource on which the stream frame will be mapped for use.
|
| 409 |
+
* \param pStream - CUDA stream for synchronization and any data migrations
|
| 410 |
+
* implied by ::CUeglResourceLocationFlags.
|
| 411 |
+
* \param timeout - Desired timeout in usec for a new frame to be acquired.
|
| 412 |
+
* If set as ::CUDA_EGL_INFINITE_TIMEOUT, acquire waits infinitely.
|
| 413 |
+
* After timeout occurs CUDA consumer tries to acquire an old frame
|
| 414 |
+
* if available and EGL_SUPPORT_REUSE_NV flag is set.
|
| 415 |
+
*
|
| 416 |
+
* \return
|
| 417 |
+
* ::CUDA_SUCCESS,
|
| 418 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 419 |
+
* ::CUDA_ERROR_LAUNCH_TIMEOUT,
|
| 420 |
+
*
|
| 421 |
+
* \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
|
| 422 |
+
* ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
|
| 423 |
+
* ::cudaEGLStreamConsumerAcquireFrame
|
| 424 |
+
*/
|
| 425 |
+
CUresult CUDAAPI cuEGLStreamConsumerAcquireFrame(CUeglStreamConnection *conn,
|
| 426 |
+
CUgraphicsResource *pCudaResource, CUstream *pStream, unsigned int timeout);
|
| 427 |
+
/**
|
| 428 |
+
* \brief Releases the last frame acquired from the EGLStream.
|
| 429 |
+
*
|
| 430 |
+
* Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
|
| 431 |
+
* If EGL_SUPPORT_REUSE_NV flag is set to EGL_TRUE, at the time of EGL creation
|
| 432 |
+
* this API doesn't release the last frame acquired on the EGLStream.
|
| 433 |
+
* By default, EGLStream is created with this flag set to EGL_TRUE.
|
| 434 |
+
*
|
| 435 |
+
* \param conn - Connection on which to release
|
| 436 |
+
* \param pCudaResource - CUDA resource whose corresponding frame is to be released
|
| 437 |
+
* \param pStream - CUDA stream on which release will be done.
|
| 438 |
+
*
|
| 439 |
+
* \return
|
| 440 |
+
* ::CUDA_SUCCESS,
|
| 441 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 442 |
+
*
|
| 443 |
+
* \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
|
| 444 |
+
* ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
|
| 445 |
+
* ::cudaEGLStreamConsumerReleaseFrame
|
| 446 |
+
*/
|
| 447 |
+
CUresult CUDAAPI cuEGLStreamConsumerReleaseFrame(CUeglStreamConnection *conn,
|
| 448 |
+
CUgraphicsResource pCudaResource, CUstream *pStream);
|
| 449 |
+
|
| 450 |
+
/**
|
| 451 |
+
* \brief Connect CUDA to EGLStream as a producer.
|
| 452 |
+
*
|
| 453 |
+
* Connect CUDA as a producer to EGLStreamKHR specified by \p stream.
|
| 454 |
+
*
|
| 455 |
+
* The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
|
| 456 |
+
* API to another.
|
| 457 |
+
*
|
| 458 |
+
* \param conn - Pointer to the returned connection handle
|
| 459 |
+
* \param stream - EGLStreamKHR handle
|
| 460 |
+
* \param width - width of the image to be submitted to the stream
|
| 461 |
+
* \param height - height of the image to be submitted to the stream
|
| 462 |
+
*
|
| 463 |
+
* \return
|
| 464 |
+
* ::CUDA_SUCCESS,
|
| 465 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 466 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 467 |
+
*
|
| 468 |
+
* \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
|
| 469 |
+
* ::cuEGLStreamProducerPresentFrame,
|
| 470 |
+
* ::cudaEGLStreamProducerConnect
|
| 471 |
+
*/
|
| 472 |
+
CUresult CUDAAPI cuEGLStreamProducerConnect(CUeglStreamConnection *conn, EGLStreamKHR stream,
|
| 473 |
+
EGLint width, EGLint height);
|
| 474 |
+
|
| 475 |
+
/**
|
| 476 |
+
* \brief Disconnect CUDA as a producer to EGLStream.
|
| 477 |
+
*
|
| 478 |
+
* Disconnect CUDA as a producer to EGLStreamKHR.
|
| 479 |
+
*
|
| 480 |
+
* \param conn - Connection to disconnect.
|
| 481 |
+
*
|
| 482 |
+
* \return
|
| 483 |
+
* ::CUDA_SUCCESS,
|
| 484 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 485 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 486 |
+
*
|
| 487 |
+
* \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
|
| 488 |
+
* ::cuEGLStreamProducerPresentFrame,
|
| 489 |
+
* ::cudaEGLStreamProducerDisconnect
|
| 490 |
+
*/
|
| 491 |
+
CUresult CUDAAPI cuEGLStreamProducerDisconnect(CUeglStreamConnection *conn);
|
| 492 |
+
|
| 493 |
+
/**
|
| 494 |
+
* \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
|
| 495 |
+
*
|
| 496 |
+
* When a frame is presented by the producer, it gets associated with the EGLStream
|
| 497 |
+
* and thus it is illegal to free the frame before the producer is disconnected.
|
| 498 |
+
* If a frame is freed and reused it may lead to undefined behavior.
|
| 499 |
+
*
|
| 500 |
+
* If producer and consumer are on different GPUs (iGPU and dGPU) then frametype
|
| 501 |
+
* ::CU_EGL_FRAME_TYPE_ARRAY is not supported. ::CU_EGL_FRAME_TYPE_PITCH can be used for
|
| 502 |
+
* such cross-device applications.
|
| 503 |
+
*
|
| 504 |
+
* The ::CUeglFrame is defined as:
|
| 505 |
+
* \code
|
| 506 |
+
* typedef struct CUeglFrame_st {
|
| 507 |
+
* union {
|
| 508 |
+
* CUarray pArray[MAX_PLANES];
|
| 509 |
+
* void* pPitch[MAX_PLANES];
|
| 510 |
+
* } frame;
|
| 511 |
+
* unsigned int width;
|
| 512 |
+
* unsigned int height;
|
| 513 |
+
* unsigned int depth;
|
| 514 |
+
* unsigned int pitch;
|
| 515 |
+
* unsigned int planeCount;
|
| 516 |
+
* unsigned int numChannels;
|
| 517 |
+
* CUeglFrameType frameType;
|
| 518 |
+
* CUeglColorFormat eglColorFormat;
|
| 519 |
+
* CUarray_format cuFormat;
|
| 520 |
+
* } CUeglFrame;
|
| 521 |
+
* \endcode
|
| 522 |
+
*
|
| 523 |
+
* For ::CUeglFrame of type ::CU_EGL_FRAME_TYPE_PITCH, the application may present a sub-region of a memory
|
| 524 |
+
* allocation. In that case, the pitched pointer will specify the start address of the sub-region in
|
| 525 |
+
* the allocation and corresponding ::CUeglFrame fields will specify the dimensions of the sub-region.
|
| 526 |
+
*
|
| 527 |
+
* \param conn - Connection on which to present the CUDA array
|
| 528 |
+
* \param eglframe - CUDA Eglstream Producer Frame handle to be sent to the consumer over EglStream.
|
| 529 |
+
* \param pStream - CUDA stream on which to present the frame.
|
| 530 |
+
*
|
| 531 |
+
* \return
|
| 532 |
+
* ::CUDA_SUCCESS,
|
| 533 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 534 |
+
*
|
| 535 |
+
* \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
|
| 536 |
+
* ::cuEGLStreamProducerReturnFrame,
|
| 537 |
+
* ::cudaEGLStreamProducerPresentFrame
|
| 538 |
+
*/
|
| 539 |
+
CUresult CUDAAPI cuEGLStreamProducerPresentFrame(CUeglStreamConnection *conn,
|
| 540 |
+
CUeglFrame eglframe, CUstream *pStream);
|
| 541 |
+
|
| 542 |
+
/**
|
| 543 |
+
* \brief Return the CUDA eglFrame to the EGLStream released by the consumer.
|
| 544 |
+
*
|
| 545 |
+
* This API can potentially return CUDA_ERROR_LAUNCH_TIMEOUT if the consumer has not
|
| 546 |
+
* returned a frame to EGL stream. If timeout is returned the application can retry.
|
| 547 |
+
*
|
| 548 |
+
* \param conn - Connection on which to return
|
| 549 |
+
* \param eglframe - CUDA Eglstream Producer Frame handle returned from the consumer over EglStream.
|
| 550 |
+
* \param pStream - CUDA stream on which to return the frame.
|
| 551 |
+
*
|
| 552 |
+
* \return
|
| 553 |
+
* ::CUDA_SUCCESS,
|
| 554 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 555 |
+
* ::CUDA_ERROR_LAUNCH_TIMEOUT
|
| 556 |
+
*
|
| 557 |
+
* \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
|
| 558 |
+
* ::cuEGLStreamProducerPresentFrame,
|
| 559 |
+
* ::cudaEGLStreamProducerReturnFrame
|
| 560 |
+
*/
|
| 561 |
+
CUresult CUDAAPI cuEGLStreamProducerReturnFrame(CUeglStreamConnection *conn,
|
| 562 |
+
CUeglFrame *eglframe, CUstream *pStream);
|
| 563 |
+
|
| 564 |
+
/**
|
| 565 |
+
* \brief Get an eglFrame through which to access a registered EGL graphics resource.
|
| 566 |
+
*
|
| 567 |
+
* Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
|
| 568 |
+
* \p resource may be accessed.
|
| 569 |
+
* This API can only be called for registered EGL graphics resources.
|
| 570 |
+
*
|
| 571 |
+
* The ::CUeglFrame is defined as:
|
| 572 |
+
* \code
|
| 573 |
+
* typedef struct CUeglFrame_st {
|
| 574 |
+
* union {
|
| 575 |
+
* CUarray pArray[MAX_PLANES];
|
| 576 |
+
* void* pPitch[MAX_PLANES];
|
| 577 |
+
* } frame;
|
| 578 |
+
* unsigned int width;
|
| 579 |
+
* unsigned int height;
|
| 580 |
+
* unsigned int depth;
|
| 581 |
+
* unsigned int pitch;
|
| 582 |
+
* unsigned int planeCount;
|
| 583 |
+
* unsigned int numChannels;
|
| 584 |
+
* CUeglFrameType frameType;
|
| 585 |
+
* CUeglColorFormat eglColorFormat;
|
| 586 |
+
* CUarray_format cuFormat;
|
| 587 |
+
* } CUeglFrame;
|
| 588 |
+
* \endcode
|
| 589 |
+
*
|
| 590 |
+
* If \p resource is not registered then ::CUDA_ERROR_NOT_MAPPED is returned.
|
| 591 |
+
*
|
| 592 |
+
* \param eglFrame - Returned eglFrame.
|
| 593 |
+
* \param resource - Registered resource to access.
|
| 594 |
+
* \param index - Index for cubemap surfaces.
|
| 595 |
+
* \param mipLevel - Mipmap level for the subresource to access.
|
| 596 |
+
*
|
| 597 |
+
* \return
|
| 598 |
+
* ::CUDA_SUCCESS,
|
| 599 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 600 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 601 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 602 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 603 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 604 |
+
* ::CUDA_ERROR_NOT_MAPPED
|
| 605 |
+
*
|
| 606 |
+
* \sa
|
| 607 |
+
* ::cuGraphicsMapResources,
|
| 608 |
+
* ::cuGraphicsSubResourceGetMappedArray,
|
| 609 |
+
* ::cuGraphicsResourceGetMappedPointer,
|
| 610 |
+
* ::cudaGraphicsResourceGetMappedEglFrame
|
| 611 |
+
*/
|
| 612 |
+
CUresult CUDAAPI cuGraphicsResourceGetMappedEglFrame(CUeglFrame* eglFrame, CUgraphicsResource resource, unsigned int index, unsigned int mipLevel);
|
| 613 |
+
|
| 614 |
+
/**
|
| 615 |
+
* \brief Creates an event from EGLSync object
|
| 616 |
+
*
|
| 617 |
+
* Creates an event *phEvent from an EGLSyncKHR eglSync with the flags specified
|
| 618 |
+
* via \p flags. Valid flags include:
|
| 619 |
+
* - ::CU_EVENT_DEFAULT: Default event creation flag.
|
| 620 |
+
* - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
|
| 621 |
+
* synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on
|
| 622 |
+
* an event created with this flag will block until the event has actually
|
| 623 |
+
* been completed.
|
| 624 |
+
*
|
| 625 |
+
* Once the \p eglSync gets destroyed, ::cuEventDestroy is the only API
|
| 626 |
+
* that can be invoked on the event.
|
| 627 |
+
*
|
| 628 |
+
* ::cuEventRecord and TimingData are not supported for events created from EGLSync.
|
| 629 |
+
*
|
| 630 |
+
* The EGLSyncKHR is an opaque handle to an EGL sync object.
|
| 631 |
+
* typedef void* EGLSyncKHR
|
| 632 |
+
*
|
| 633 |
+
* \param phEvent - Returns newly created event
|
| 634 |
+
* \param eglSync - Opaque handle to EGLSync object
|
| 635 |
+
* \param flags - Event creation flags
|
| 636 |
+
*
|
| 637 |
+
* \return
|
| 638 |
+
* ::CUDA_SUCCESS,
|
| 639 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 640 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 641 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 642 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 643 |
+
* ::CUDA_ERROR_OUT_OF_MEMORY
|
| 644 |
+
*
|
| 645 |
+
* \sa
|
| 646 |
+
* ::cuEventQuery,
|
| 647 |
+
* ::cuEventSynchronize,
|
| 648 |
+
* ::cuEventDestroy
|
| 649 |
+
*/
|
| 650 |
+
CUresult CUDAAPI cuEventCreateFromEGLSync(CUevent *phEvent, EGLSyncKHR eglSync, unsigned int flags);
|
| 651 |
+
|
| 652 |
+
/** @} */ /* END CUDA_EGL */
|
| 653 |
+
|
| 654 |
+
#ifdef __cplusplus
|
| 655 |
+
};
|
| 656 |
+
#endif
|
| 657 |
+
|
| 658 |
+
#endif
|
| 659 |
+
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGLTypedefs.h
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAEGLTYPEDEFS_H
|
| 51 |
+
#define CUDAEGLTYPEDEFS_H
|
| 52 |
+
|
| 53 |
+
#include <cudaEGL.h>
|
| 54 |
+
|
| 55 |
+
#ifdef __cplusplus
|
| 56 |
+
extern "C" {
|
| 57 |
+
#endif // __cplusplus
|
| 58 |
+
|
| 59 |
+
/*
|
| 60 |
+
* Macros for the latest version for each driver function in cudaEGL.h
|
| 61 |
+
*/
|
| 62 |
+
#define PFN_cuGraphicsEGLRegisterImage PFN_cuGraphicsEGLRegisterImage_v7000
|
| 63 |
+
#define PFN_cuEGLStreamConsumerConnect PFN_cuEGLStreamConsumerConnect_v7000
|
| 64 |
+
#define PFN_cuEGLStreamConsumerConnectWithFlags PFN_cuEGLStreamConsumerConnectWithFlags_v8000
|
| 65 |
+
#define PFN_cuEGLStreamConsumerDisconnect PFN_cuEGLStreamConsumerDisconnect_v7000
|
| 66 |
+
#define PFN_cuEGLStreamConsumerAcquireFrame PFN_cuEGLStreamConsumerAcquireFrame_v7000
|
| 67 |
+
#define PFN_cuEGLStreamConsumerReleaseFrame PFN_cuEGLStreamConsumerReleaseFrame_v7000
|
| 68 |
+
#define PFN_cuEGLStreamProducerConnect PFN_cuEGLStreamProducerConnect_v7000
|
| 69 |
+
#define PFN_cuEGLStreamProducerDisconnect PFN_cuEGLStreamProducerDisconnect_v7000
|
| 70 |
+
#define PFN_cuEGLStreamProducerPresentFrame PFN_cuEGLStreamProducerPresentFrame_v7000
|
| 71 |
+
#define PFN_cuEGLStreamProducerReturnFrame PFN_cuEGLStreamProducerReturnFrame_v7000
|
| 72 |
+
#define PFN_cuGraphicsResourceGetMappedEglFrame PFN_cuGraphicsResourceGetMappedEglFrame_v7000
|
| 73 |
+
#define PFN_cuEventCreateFromEGLSync PFN_cuEventCreateFromEGLSync_v9000
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
/**
|
| 77 |
+
* Type definitions for functions defined in cudaEGL.h
|
| 78 |
+
*/
|
| 79 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsEGLRegisterImage_v7000)(CUgraphicsResource CUDAAPI *pCudaResource, EGLImageKHR image, unsigned int flags);
|
| 80 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerConnect_v7000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream);
|
| 81 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerConnectWithFlags_v8000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream, unsigned int flags);
|
| 82 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerDisconnect_v7000)(CUeglStreamConnection CUDAAPI *conn);
|
| 83 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerAcquireFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUgraphicsResource CUDAAPI *pCudaResource, CUstream CUDAAPI *pStream, unsigned int timeout);
|
| 84 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerReleaseFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUgraphicsResource pCudaResource, CUstream CUDAAPI *pStream);
|
| 85 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerConnect_v7000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream, EGLint width, EGLint height);
|
| 86 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerDisconnect_v7000)(CUeglStreamConnection CUDAAPI *conn);
|
| 87 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerPresentFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUeglFrame_v1 eglframe, CUstream CUDAAPI *pStream);
|
| 88 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerReturnFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUeglFrame_v1 CUDAAPI *eglframe, CUstream CUDAAPI *pStream);
|
| 89 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedEglFrame_v7000)(CUeglFrame_v1 CUDAAPI *eglFrame, CUgraphicsResource resource, unsigned int index, unsigned int mipLevel);
|
| 90 |
+
typedef CUresult (CUDAAPI *PFN_cuEventCreateFromEGLSync_v9000)(CUevent CUDAAPI *phEvent, EGLSyncKHR eglSync, unsigned int flags);
|
| 91 |
+
|
| 92 |
+
#ifdef __cplusplus
|
| 93 |
+
}
|
| 94 |
+
#endif // __cplusplus
|
| 95 |
+
|
| 96 |
+
#endif // file guard
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaGL.h
ADDED
|
@@ -0,0 +1,608 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAGL_H
|
| 51 |
+
#define CUDAGL_H
|
| 52 |
+
|
| 53 |
+
#include <cuda.h>
|
| 54 |
+
#include <GL/gl.h>
|
| 55 |
+
|
| 56 |
+
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
|
| 57 |
+
#define __CUDA_DEPRECATED
|
| 58 |
+
#elif defined(_MSC_VER)
|
| 59 |
+
#define __CUDA_DEPRECATED __declspec(deprecated)
|
| 60 |
+
#elif defined(__GNUC__)
|
| 61 |
+
#define __CUDA_DEPRECATED __attribute__((deprecated))
|
| 62 |
+
#else
|
| 63 |
+
#define __CUDA_DEPRECATED
|
| 64 |
+
#endif
|
| 65 |
+
|
| 66 |
+
#ifdef CUDA_FORCE_API_VERSION
|
| 67 |
+
#error "CUDA_FORCE_API_VERSION is no longer supported."
|
| 68 |
+
#endif
|
| 69 |
+
|
| 70 |
+
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
|
| 71 |
+
#define __CUDA_API_PER_THREAD_DEFAULT_STREAM
|
| 72 |
+
#define __CUDA_API_PTDS(api) api ## _ptds
|
| 73 |
+
#define __CUDA_API_PTSZ(api) api ## _ptsz
|
| 74 |
+
#else
|
| 75 |
+
#define __CUDA_API_PTDS(api) api
|
| 76 |
+
#define __CUDA_API_PTSZ(api) api
|
| 77 |
+
#endif
|
| 78 |
+
|
| 79 |
+
#define cuGLCtxCreate cuGLCtxCreate_v2
|
| 80 |
+
#define cuGLMapBufferObject __CUDA_API_PTDS(cuGLMapBufferObject_v2)
|
| 81 |
+
#define cuGLMapBufferObjectAsync __CUDA_API_PTSZ(cuGLMapBufferObjectAsync_v2)
|
| 82 |
+
#define cuGLGetDevices cuGLGetDevices_v2
|
| 83 |
+
|
| 84 |
+
#ifdef __cplusplus
|
| 85 |
+
extern "C" {
|
| 86 |
+
#endif
|
| 87 |
+
|
| 88 |
+
/**
|
| 89 |
+
* \file cudaGL.h
|
| 90 |
+
* \brief Header file for the OpenGL interoperability functions of the
|
| 91 |
+
* low-level CUDA driver application programming interface.
|
| 92 |
+
*/
|
| 93 |
+
|
| 94 |
+
/**
|
| 95 |
+
* \defgroup CUDA_GL OpenGL Interoperability
|
| 96 |
+
* \ingroup CUDA_DRIVER
|
| 97 |
+
*
|
| 98 |
+
* ___MANBRIEF___ OpenGL interoperability functions of the low-level CUDA
|
| 99 |
+
* driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
|
| 100 |
+
*
|
| 101 |
+
* This section describes the OpenGL interoperability functions of the
|
| 102 |
+
* low-level CUDA driver application programming interface. Note that mapping
|
| 103 |
+
* of OpenGL resources is performed with the graphics API agnostic, resource
|
| 104 |
+
* mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
|
| 105 |
+
*
|
| 106 |
+
* @{
|
| 107 |
+
*/
|
| 108 |
+
|
| 109 |
+
#if defined(_WIN32)
|
| 110 |
+
#if !defined(WGL_NV_gpu_affinity)
|
| 111 |
+
typedef void* HGPUNV;
|
| 112 |
+
#endif
|
| 113 |
+
#endif /* _WIN32 */
|
| 114 |
+
|
| 115 |
+
/**
|
| 116 |
+
* \brief Registers an OpenGL buffer object
|
| 117 |
+
*
|
| 118 |
+
* Registers the buffer object specified by \p buffer for access by
|
| 119 |
+
* CUDA. A handle to the registered object is returned as \p
|
| 120 |
+
* pCudaResource. The register flags \p Flags specify the intended usage,
|
| 121 |
+
* as follows:
|
| 122 |
+
*
|
| 123 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
|
| 124 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 125 |
+
* read from and written to by CUDA. This is the default value.
|
| 126 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
|
| 127 |
+
* will not write to this resource.
|
| 128 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
|
| 129 |
+
* CUDA will not read from this resource and will write over the
|
| 130 |
+
* entire contents of the resource, so none of the data previously
|
| 131 |
+
* stored in the resource will be preserved.
|
| 132 |
+
*
|
| 133 |
+
* \param pCudaResource - Pointer to the returned object handle
|
| 134 |
+
* \param buffer - name of buffer object to be registered
|
| 135 |
+
* \param Flags - Register flags
|
| 136 |
+
*
|
| 137 |
+
* \return
|
| 138 |
+
* ::CUDA_SUCCESS,
|
| 139 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 140 |
+
* ::CUDA_ERROR_ALREADY_MAPPED,
|
| 141 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 142 |
+
* ::CUDA_ERROR_OPERATING_SYSTEM
|
| 143 |
+
* \notefnerr
|
| 144 |
+
*
|
| 145 |
+
* \sa
|
| 146 |
+
* ::cuGraphicsUnregisterResource,
|
| 147 |
+
* ::cuGraphicsMapResources,
|
| 148 |
+
* ::cuGraphicsResourceGetMappedPointer,
|
| 149 |
+
* ::cudaGraphicsGLRegisterBuffer
|
| 150 |
+
*/
|
| 151 |
+
CUresult CUDAAPI cuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
|
| 152 |
+
|
| 153 |
+
/**
|
| 154 |
+
* \brief Register an OpenGL texture or renderbuffer object
|
| 155 |
+
*
|
| 156 |
+
* Registers the texture or renderbuffer object specified by \p image for access by CUDA.
|
| 157 |
+
* A handle to the registered object is returned as \p pCudaResource.
|
| 158 |
+
*
|
| 159 |
+
* \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
|
| 160 |
+
* ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
|
| 161 |
+
* or ::GL_RENDERBUFFER.
|
| 162 |
+
*
|
| 163 |
+
* The register flags \p Flags specify the intended usage, as follows:
|
| 164 |
+
*
|
| 165 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
|
| 166 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 167 |
+
* read from and written to by CUDA. This is the default value.
|
| 168 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
|
| 169 |
+
* will not write to this resource.
|
| 170 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
|
| 171 |
+
* CUDA will not read from this resource and will write over the
|
| 172 |
+
* entire contents of the resource, so none of the data previously
|
| 173 |
+
* stored in the resource will be preserved.
|
| 174 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
|
| 175 |
+
* bind this resource to a surface reference.
|
| 176 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
|
| 177 |
+
* texture gather operations on this resource.
|
| 178 |
+
*
|
| 179 |
+
* The following image formats are supported. For brevity's sake, the list is abbreviated.
|
| 180 |
+
* For example, {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
|
| 181 |
+
* {GL_R8, GL_R16, GL_RG8, GL_RG16} :
|
| 182 |
+
* - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
|
| 183 |
+
* - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
|
| 184 |
+
* - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
|
| 185 |
+
* {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
|
| 186 |
+
*
|
| 187 |
+
* The following image classes are currently disallowed:
|
| 188 |
+
* - Textures with borders
|
| 189 |
+
* - Multisampled renderbuffers
|
| 190 |
+
*
|
| 191 |
+
* \param pCudaResource - Pointer to the returned object handle
|
| 192 |
+
* \param image - name of texture or renderbuffer object to be registered
|
| 193 |
+
* \param target - Identifies the type of object specified by \p image
|
| 194 |
+
* \param Flags - Register flags
|
| 195 |
+
*
|
| 196 |
+
* \return
|
| 197 |
+
* ::CUDA_SUCCESS,
|
| 198 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 199 |
+
* ::CUDA_ERROR_ALREADY_MAPPED,
|
| 200 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 201 |
+
* ::CUDA_ERROR_OPERATING_SYSTEM
|
| 202 |
+
* \notefnerr
|
| 203 |
+
*
|
| 204 |
+
* \sa
|
| 205 |
+
* ::cuGraphicsUnregisterResource,
|
| 206 |
+
* ::cuGraphicsMapResources,
|
| 207 |
+
* ::cuGraphicsSubResourceGetMappedArray,
|
| 208 |
+
* ::cudaGraphicsGLRegisterImage
|
| 209 |
+
*/
|
| 210 |
+
CUresult CUDAAPI cuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
|
| 211 |
+
|
| 212 |
+
#ifdef _WIN32
|
| 213 |
+
/**
|
| 214 |
+
* \brief Gets the CUDA device associated with hGpu
|
| 215 |
+
*
|
| 216 |
+
* Returns in \p *pDevice the CUDA device associated with a \p hGpu, if
|
| 217 |
+
* applicable.
|
| 218 |
+
*
|
| 219 |
+
* \param pDevice - Device associated with hGpu
|
| 220 |
+
* \param hGpu - Handle to a GPU, as queried via ::WGL_NV_gpu_affinity()
|
| 221 |
+
*
|
| 222 |
+
* \return
|
| 223 |
+
* ::CUDA_SUCCESS,
|
| 224 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 225 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 226 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 227 |
+
* ::CUDA_ERROR_INVALID_VALUE
|
| 228 |
+
* \notefnerr
|
| 229 |
+
*
|
| 230 |
+
* \sa ::cuGLMapBufferObject,
|
| 231 |
+
* ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
|
| 232 |
+
* ::cuGLUnregisterBufferObject, ::cuGLUnmapBufferObjectAsync,
|
| 233 |
+
* ::cuGLSetBufferObjectMapFlags,
|
| 234 |
+
* ::cudaWGLGetDevice
|
| 235 |
+
*/
|
| 236 |
+
CUresult CUDAAPI cuWGLGetDevice(CUdevice *pDevice, HGPUNV hGpu);
|
| 237 |
+
#endif /* _WIN32 */
|
| 238 |
+
|
| 239 |
+
/**
|
| 240 |
+
* CUDA devices corresponding to an OpenGL device
|
| 241 |
+
*/
|
| 242 |
+
typedef enum CUGLDeviceList_enum { /* Selector passed as the deviceList argument of cuGLGetDevices */
|
| 243 |
+
CU_GL_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by the current OpenGL context */
|
| 244 |
+
CU_GL_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
|
| 245 |
+
CU_GL_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
|
| 246 |
+
} CUGLDeviceList;
|
| 247 |
+
|
| 248 |
+
/**
|
| 249 |
+
* \brief Gets the CUDA devices associated with the current OpenGL context
|
| 250 |
+
*
|
| 251 |
+
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
|
| 252 |
+
* corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
|
| 253 |
+
* at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
|
| 254 |
+
* the current OpenGL context. If any of the GPUs being used by the current OpenGL
|
| 255 |
+
* context are not CUDA capable then the call will return CUDA_ERROR_NO_DEVICE.
|
| 256 |
+
*
|
| 257 |
+
* The \p deviceList argument may be any of the following:
|
| 258 |
+
* - ::CU_GL_DEVICE_LIST_ALL: Query all devices used by the current OpenGL context.
|
| 259 |
+
* - ::CU_GL_DEVICE_LIST_CURRENT_FRAME: Query the devices used by the current OpenGL context to
|
| 260 |
+
* render the current frame (in SLI).
|
| 261 |
+
* - ::CU_GL_DEVICE_LIST_NEXT_FRAME: Query the devices used by the current OpenGL context to
|
| 262 |
+
* render the next frame (in SLI). Note that this is a prediction; it cannot be guaranteed that
|
| 263 |
+
* this is correct in all cases.
|
| 264 |
+
*
|
| 265 |
+
* \param pCudaDeviceCount - Returned number of CUDA devices.
|
| 266 |
+
* \param pCudaDevices - Returned CUDA devices.
|
| 267 |
+
* \param cudaDeviceCount - The size of the output device array pCudaDevices.
|
| 268 |
+
* \param deviceList - The set of devices to return.
|
| 269 |
+
*
|
| 270 |
+
* \return
|
| 271 |
+
* ::CUDA_SUCCESS,
|
| 272 |
+
* ::CUDA_ERROR_NO_DEVICE,
|
| 273 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 274 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 275 |
+
* ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,
|
| 276 |
+
* ::CUDA_ERROR_OPERATING_SYSTEM
|
| 277 |
+
*
|
| 278 |
+
* \notefnerr
|
| 279 |
+
*
|
| 280 |
+
* \sa
|
| 281 |
+
* ::cuWGLGetDevice,
|
| 282 |
+
* ::cudaGLGetDevices
|
| 283 |
+
*/
|
| 284 |
+
CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
|
| 285 |
+
|
| 286 |
+
/**
|
| 287 |
+
* \defgroup CUDA_GL_DEPRECATED OpenGL Interoperability [DEPRECATED]
|
| 288 |
+
*
|
| 289 |
+
* ___MANBRIEF___ deprecated OpenGL interoperability functions of the low-level
|
| 290 |
+
* CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
|
| 291 |
+
*
|
| 292 |
+
* This section describes deprecated OpenGL interoperability functionality.
|
| 293 |
+
*
|
| 294 |
+
* @{
|
| 295 |
+
*/
|
| 296 |
+
|
| 297 |
+
/** Flags to map or unmap a resource */
|
| 298 |
+
typedef enum CUGLmap_flags_enum {
|
| 299 |
+
CU_GL_MAP_RESOURCE_FLAGS_NONE = 0x00, /**< No hints about how the resource will be used; it is assumed to be read from and written to by CUDA kernels (default) */
|
| 300 |
+
CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, /**< CUDA kernels which access the resource will not write to it */
|
| 301 |
+
CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02, /**< CUDA kernels will not read from the resource and will write over its entire contents; previously stored data is not preserved */
|
| 302 |
+
} CUGLmap_flags;
|
| 303 |
+
|
| 304 |
+
/**
|
| 305 |
+
* \brief Create a CUDA context for interoperability with OpenGL
|
| 306 |
+
*
|
| 307 |
+
* \deprecated This function is deprecated as of Cuda 5.0.
|
| 308 |
+
*
|
| 309 |
+
* This function is deprecated and should no longer be used. It is
|
| 310 |
+
* no longer necessary to associate a CUDA context with an OpenGL
|
| 311 |
+
* context in order to achieve maximum interoperability performance.
|
| 312 |
+
*
|
| 313 |
+
* \param pCtx - Returned CUDA context
|
| 314 |
+
* \param Flags - Options for CUDA context creation
|
| 315 |
+
* \param device - Device on which to create the context
|
| 316 |
+
*
|
| 317 |
+
* \return
|
| 318 |
+
* ::CUDA_SUCCESS,
|
| 319 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 320 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 321 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 322 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 323 |
+
* ::CUDA_ERROR_OUT_OF_MEMORY
|
| 324 |
+
* \notefnerr
|
| 325 |
+
*
|
| 326 |
+
* \sa ::cuCtxCreate, ::cuGLInit, ::cuGLMapBufferObject,
|
| 327 |
+
* ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
|
| 328 |
+
* ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
|
| 329 |
+
* ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
|
| 330 |
+
* ::cuWGLGetDevice
|
| 331 |
+
*/
|
| 332 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
|
| 333 |
+
|
| 334 |
+
/**
|
| 335 |
+
* \brief Initializes OpenGL interoperability
|
| 336 |
+
*
|
| 337 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 338 |
+
*
|
| 339 |
+
* Initializes OpenGL interoperability. This function is deprecated
|
| 340 |
+
* and calling it is no longer required. It may fail if the needed
|
| 341 |
+
* OpenGL driver facilities are not available.
|
| 342 |
+
*
|
| 343 |
+
* \return
|
| 344 |
+
* ::CUDA_SUCCESS,
|
| 345 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 346 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 347 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 348 |
+
* ::CUDA_ERROR_UNKNOWN
|
| 349 |
+
* \notefnerr
|
| 350 |
+
*
|
| 351 |
+
* \sa ::cuGLMapBufferObject,
|
| 352 |
+
* ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
|
| 353 |
+
* ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
|
| 354 |
+
* ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
|
| 355 |
+
* ::cuWGLGetDevice
|
| 356 |
+
*/
|
| 357 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLInit(void);
|
| 358 |
+
|
| 359 |
+
/**
|
| 360 |
+
* \brief Registers an OpenGL buffer object
|
| 361 |
+
*
|
| 362 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 363 |
+
*
|
| 364 |
+
* Registers the buffer object specified by \p buffer for access by
|
| 365 |
+
* CUDA. This function must be called before CUDA can map the buffer
|
| 366 |
+
* object. There must be a valid OpenGL context bound to the current
|
| 367 |
+
* thread when this function is called, and the buffer name is
|
| 368 |
+
* resolved by that context.
|
| 369 |
+
*
|
| 370 |
+
* \param buffer - The name of the buffer object to register.
|
| 371 |
+
*
|
| 372 |
+
* \return
|
| 373 |
+
* ::CUDA_SUCCESS,
|
| 374 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 375 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 376 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 377 |
+
* ::CUDA_ERROR_ALREADY_MAPPED
|
| 378 |
+
* \notefnerr
|
| 379 |
+
*
|
| 380 |
+
* \sa ::cuGraphicsGLRegisterBuffer
|
| 381 |
+
*/
|
| 382 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLRegisterBufferObject(GLuint buffer);
|
| 383 |
+
|
| 384 |
+
/**
|
| 385 |
+
* \brief Maps an OpenGL buffer object
|
| 386 |
+
*
|
| 387 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 388 |
+
*
|
| 389 |
+
* Maps the buffer object specified by \p buffer into the address space of the
|
| 390 |
+
* current CUDA context and returns in \p *dptr and \p *size the base pointer
|
| 391 |
+
* and size of the resulting mapping.
|
| 392 |
+
*
|
| 393 |
+
* There must be a valid OpenGL context bound to the current thread
|
| 394 |
+
* when this function is called. This must be the same context, or a
|
| 395 |
+
* member of the same shareGroup, as the context that was bound when
|
| 396 |
+
* the buffer was registered.
|
| 397 |
+
*
|
| 398 |
+
* All streams in the current CUDA context are synchronized with the
|
| 399 |
+
* current GL context.
|
| 400 |
+
*
|
| 401 |
+
* \param dptr - Returned mapped base pointer
|
| 402 |
+
* \param size - Returned size of mapping
|
| 403 |
+
* \param buffer - The name of the buffer object to map
|
| 404 |
+
*
|
| 405 |
+
* \return
|
| 406 |
+
* ::CUDA_SUCCESS,
|
| 407 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 408 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 409 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 410 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 411 |
+
* ::CUDA_ERROR_MAP_FAILED
|
| 412 |
+
* \notefnerr
|
| 413 |
+
*
|
| 414 |
+
* \sa ::cuGraphicsMapResources
|
| 415 |
+
*/
|
| 416 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr *dptr, size_t *size, GLuint buffer);
|
| 417 |
+
|
| 418 |
+
/**
|
| 419 |
+
* \brief Unmaps an OpenGL buffer object
|
| 420 |
+
*
|
| 421 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 422 |
+
*
|
| 423 |
+
* Unmaps the buffer object specified by \p buffer for access by CUDA.
|
| 424 |
+
*
|
| 425 |
+
* There must be a valid OpenGL context bound to the current thread
|
| 426 |
+
* when this function is called. This must be the same context, or a
|
| 427 |
+
* member of the same shareGroup, as the context that was bound when
|
| 428 |
+
* the buffer was registered.
|
| 429 |
+
*
|
| 430 |
+
* All streams in the current CUDA context are synchronized with the
|
| 431 |
+
* current GL context.
|
| 432 |
+
*
|
| 433 |
+
* \param buffer - Buffer object to unmap
|
| 434 |
+
*
|
| 435 |
+
* \return
|
| 436 |
+
* ::CUDA_SUCCESS,
|
| 437 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 438 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 439 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 440 |
+
* ::CUDA_ERROR_INVALID_VALUE
|
| 441 |
+
* \notefnerr
|
| 442 |
+
*
|
| 443 |
+
* \sa ::cuGraphicsUnmapResources
|
| 444 |
+
*/
|
| 445 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObject(GLuint buffer);
|
| 446 |
+
|
| 447 |
+
/**
|
| 448 |
+
* \brief Unregister an OpenGL buffer object
|
| 449 |
+
*
|
| 450 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 451 |
+
*
|
| 452 |
+
* Unregisters the buffer object specified by \p buffer. This
|
| 453 |
+
* releases any resources associated with the registered buffer.
|
| 454 |
+
* After this call, the buffer may no longer be mapped for access by
|
| 455 |
+
* CUDA.
|
| 456 |
+
*
|
| 457 |
+
* There must be a valid OpenGL context bound to the current thread
|
| 458 |
+
* when this function is called. This must be the same context, or a
|
| 459 |
+
* member of the same shareGroup, as the context that was bound when
|
| 460 |
+
* the buffer was registered.
|
| 461 |
+
*
|
| 462 |
+
* \param buffer - Name of the buffer object to unregister
|
| 463 |
+
*
|
| 464 |
+
* \return
|
| 465 |
+
* ::CUDA_SUCCESS,
|
| 466 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 467 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 468 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 469 |
+
* ::CUDA_ERROR_INVALID_VALUE
|
| 470 |
+
* \notefnerr
|
| 471 |
+
*
|
| 472 |
+
* \sa ::cuGraphicsUnregisterResource
|
| 473 |
+
*/
|
| 474 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnregisterBufferObject(GLuint buffer);
|
| 475 |
+
|
| 476 |
+
/**
|
| 477 |
+
* \brief Set the map flags for an OpenGL buffer object
|
| 478 |
+
*
|
| 479 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 480 |
+
*
|
| 481 |
+
* Sets the map flags for the buffer object specified by \p buffer.
|
| 482 |
+
*
|
| 483 |
+
* Changes to \p Flags will take effect the next time \p buffer is mapped.
|
| 484 |
+
* The \p Flags argument may be any of the following:
|
| 485 |
+
* - ::CU_GL_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
|
| 486 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 487 |
+
* read from and written to by CUDA kernels. This is the default value.
|
| 488 |
+
* - ::CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which
|
| 489 |
+
* access this resource will not write to this resource.
|
| 490 |
+
* - ::CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels
|
| 491 |
+
* which access this resource will not read from this resource and will
|
| 492 |
+
* write over the entire contents of the resource, so none of the data
|
| 493 |
+
* previously stored in the resource will be preserved.
|
| 494 |
+
*
|
| 495 |
+
* If \p buffer has not been registered for use with CUDA, then
|
| 496 |
+
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p buffer is presently
|
| 497 |
+
* mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
|
| 498 |
+
*
|
| 499 |
+
* There must be a valid OpenGL context bound to the current thread
|
| 500 |
+
* when this function is called. This must be the same context, or a
|
| 501 |
+
* member of the same shareGroup, as the context that was bound when
|
| 502 |
+
* the buffer was registered.
|
| 503 |
+
*
|
| 504 |
+
* \param buffer - Buffer object whose map flags are to be set
|
| 505 |
+
* \param Flags - Map flags
|
| 506 |
+
*
|
| 507 |
+
* \return
|
| 508 |
+
* ::CUDA_SUCCESS,
|
| 509 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 510 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 511 |
+
* ::CUDA_ERROR_ALREADY_MAPPED,
|
| 512 |
+
* ::CUDA_ERROR_INVALID_CONTEXT
|
| 513 |
+
* \notefnerr
|
| 514 |
+
*
|
| 515 |
+
* \sa ::cuGraphicsResourceSetMapFlags
|
| 516 |
+
*/
|
| 517 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLSetBufferObjectMapFlags(GLuint buffer, unsigned int Flags);
|
| 518 |
+
|
| 519 |
+
/**
|
| 520 |
+
* \brief Maps an OpenGL buffer object
|
| 521 |
+
*
|
| 522 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 523 |
+
*
|
| 524 |
+
* Maps the buffer object specified by \p buffer into the address space of the
|
| 525 |
+
* current CUDA context and returns in \p *dptr and \p *size the base pointer
|
| 526 |
+
* and size of the resulting mapping.
|
| 527 |
+
*
|
| 528 |
+
* There must be a valid OpenGL context bound to the current thread
|
| 529 |
+
* when this function is called. This must be the same context, or a
|
| 530 |
+
* member of the same shareGroup, as the context that was bound when
|
| 531 |
+
* the buffer was registered.
|
| 532 |
+
*
|
| 533 |
+
* Stream \p hStream in the current CUDA context is synchronized with
|
| 534 |
+
* the current GL context.
|
| 535 |
+
*
|
| 536 |
+
* \param dptr - Returned mapped base pointer
|
| 537 |
+
* \param size - Returned size of mapping
|
| 538 |
+
* \param buffer - The name of the buffer object to map
|
| 539 |
+
* \param hStream - Stream to synchronize
|
| 540 |
+
*
|
| 541 |
+
* \return
|
| 542 |
+
* ::CUDA_SUCCESS,
|
| 543 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 544 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 545 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 546 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 547 |
+
* ::CUDA_ERROR_MAP_FAILED
|
| 548 |
+
* \notefnerr
|
| 549 |
+
*
|
| 550 |
+
* \sa ::cuGraphicsMapResources
|
| 551 |
+
*/
|
| 552 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr *dptr, size_t *size, GLuint buffer, CUstream hStream);
|
| 553 |
+
|
| 554 |
+
/**
|
| 555 |
+
* \brief Unmaps an OpenGL buffer object
|
| 556 |
+
*
|
| 557 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 558 |
+
*
|
| 559 |
+
* Unmaps the buffer object specified by \p buffer for access by CUDA.
|
| 560 |
+
*
|
| 561 |
+
* There must be a valid OpenGL context bound to the current thread
|
| 562 |
+
* when this function is called. This must be the same context, or a
|
| 563 |
+
* member of the same shareGroup, as the context that was bound when
|
| 564 |
+
* the buffer was registered.
|
| 565 |
+
*
|
| 566 |
+
* Stream \p hStream in the current CUDA context is synchronized with
|
| 567 |
+
* the current GL context.
|
| 568 |
+
*
|
| 569 |
+
* \param buffer - Name of the buffer object to unmap
|
| 570 |
+
* \param hStream - Stream to synchronize
|
| 571 |
+
*
|
| 572 |
+
* \return
|
| 573 |
+
* ::CUDA_SUCCESS,
|
| 574 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 575 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 576 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 577 |
+
* ::CUDA_ERROR_INVALID_VALUE
|
| 578 |
+
* \notefnerr
|
| 579 |
+
*
|
| 580 |
+
* \sa ::cuGraphicsUnmapResources
|
| 581 |
+
*/
|
| 582 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObjectAsync(GLuint buffer, CUstream hStream);
|
| 583 |
+
|
| 584 |
+
/** @} */ /* END CUDA_GL_DEPRECATED */
|
| 585 |
+
/** @} */ /* END CUDA_GL */
|
| 586 |
+
|
| 587 |
+
|
| 588 |
+
#if defined(__CUDA_API_VERSION_INTERNAL)
|
| 589 |
+
#undef cuGLCtxCreate
|
| 590 |
+
#undef cuGLMapBufferObject
|
| 591 |
+
#undef cuGLMapBufferObjectAsync
|
| 592 |
+
#undef cuGLGetDevices
|
| 593 |
+
|
| 594 |
+
CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
|
| 595 |
+
CUresult CUDAAPI cuGLMapBufferObject_v2(CUdeviceptr *dptr, size_t *size, GLuint buffer);
|
| 596 |
+
CUresult CUDAAPI cuGLMapBufferObjectAsync_v2(CUdeviceptr *dptr, size_t *size, GLuint buffer, CUstream hStream);
|
| 597 |
+
CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
|
| 598 |
+
CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
|
| 599 |
+
CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
|
| 600 |
+
#endif /* __CUDA_API_VERSION_INTERNAL */
|
| 601 |
+
|
| 602 |
+
#ifdef __cplusplus
|
| 603 |
+
};
|
| 604 |
+
#endif
|
| 605 |
+
|
| 606 |
+
#undef __CUDA_DEPRECATED
|
| 607 |
+
|
| 608 |
+
#endif
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaGLTypedefs.h
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAGLTYPEDEFS_H
|
| 51 |
+
#define CUDAGLTYPEDEFS_H
|
| 52 |
+
|
| 53 |
+
// Dependent includes for cudaGL.h
|
| 54 |
+
#include <GL/gl.h>
|
| 55 |
+
|
| 56 |
+
#include <cudaGL.h>
|
| 57 |
+
|
| 58 |
+
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
|
| 59 |
+
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
|
| 60 |
+
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
|
| 61 |
+
#else
|
| 62 |
+
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
|
| 63 |
+
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
|
| 64 |
+
#endif
|
| 65 |
+
|
| 66 |
+
#ifdef __cplusplus
|
| 67 |
+
extern "C" {
|
| 68 |
+
#endif // __cplusplus
|
| 69 |
+
|
| 70 |
+
/*
|
| 71 |
+
* Macros for the latest version for each driver function in cudaGL.h
|
| 72 |
+
*/
|
| 73 |
+
#define PFN_cuGraphicsGLRegisterBuffer PFN_cuGraphicsGLRegisterBuffer_v3000
|
| 74 |
+
#define PFN_cuGraphicsGLRegisterImage PFN_cuGraphicsGLRegisterImage_v3000
|
| 75 |
+
#define PFN_cuWGLGetDevice PFN_cuWGLGetDevice_v2020
|
| 76 |
+
#define PFN_cuGLGetDevices PFN_cuGLGetDevices_v6050
|
| 77 |
+
#define PFN_cuGLCtxCreate PFN_cuGLCtxCreate_v3020
|
| 78 |
+
#define PFN_cuGLInit PFN_cuGLInit_v2000
|
| 79 |
+
#define PFN_cuGLRegisterBufferObject PFN_cuGLRegisterBufferObject_v2000
|
| 80 |
+
#define PFN_cuGLMapBufferObject __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000)
|
| 81 |
+
#define PFN_cuGLUnmapBufferObject PFN_cuGLUnmapBufferObject_v2000
|
| 82 |
+
#define PFN_cuGLUnregisterBufferObject PFN_cuGLUnregisterBufferObject_v2000
|
| 83 |
+
#define PFN_cuGLSetBufferObjectMapFlags PFN_cuGLSetBufferObjectMapFlags_v2030
|
| 84 |
+
#define PFN_cuGLMapBufferObjectAsync __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000)
|
| 85 |
+
#define PFN_cuGLUnmapBufferObjectAsync PFN_cuGLUnmapBufferObjectAsync_v2030
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
/**
|
| 89 |
+
* Type definitions for functions defined in cudaGL.h
|
| 90 |
+
*/
|
| 91 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
|
| 92 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
|
| 93 |
+
#ifdef _WIN32
|
| 94 |
+
typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu);
|
| 95 |
+
#endif
|
| 96 |
+
typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
|
| 97 |
+
typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
|
| 98 |
+
typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void);
|
| 99 |
+
typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer);
|
| 100 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
|
| 101 |
+
typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer);
|
| 102 |
+
typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer);
|
| 103 |
+
typedef CUresult (CUDAAPI *PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags);
|
| 104 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
|
| 105 |
+
typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream);
|
| 106 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
|
| 107 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
|
| 108 |
+
|
| 109 |
+
/*
|
| 110 |
+
* Type definitions for older versioned functions in cudaGL.h
|
| 111 |
+
*/
|
| 112 |
+
#if defined(__CUDA_API_VERSION_INTERNAL)
|
| 113 |
+
typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
|
| 114 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
|
| 115 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
|
| 116 |
+
typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
|
| 117 |
+
#endif
|
| 118 |
+
|
| 119 |
+
#ifdef __cplusplus
|
| 120 |
+
}
|
| 121 |
+
#endif // __cplusplus
|
| 122 |
+
|
| 123 |
+
#endif // file guard
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaTypedefs.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAUTypedefs.h
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAVDPAUTYPEDEFS_H
|
| 51 |
+
#define CUDAVDPAUTYPEDEFS_H
|
| 52 |
+
|
| 53 |
+
// Dependent includes for cudaVDPAU.h
|
| 54 |
+
#include <vdpau/vdpau.h>
|
| 55 |
+
|
| 56 |
+
#include <cudaVDPAU.h>
|
| 57 |
+
|
| 58 |
+
#ifdef __cplusplus
|
| 59 |
+
extern "C" {
|
| 60 |
+
#endif // __cplusplus
|
| 61 |
+
|
| 62 |
+
/*
|
| 63 |
+
* Macros for the latest version for each driver function in cudaVDPAU.h
|
| 64 |
+
*/
|
| 65 |
+
#define PFN_cuVDPAUGetDevice PFN_cuVDPAUGetDevice_v3010
|
| 66 |
+
#define PFN_cuVDPAUCtxCreate PFN_cuVDPAUCtxCreate_v3020
|
| 67 |
+
#define PFN_cuGraphicsVDPAURegisterVideoSurface PFN_cuGraphicsVDPAURegisterVideoSurface_v3010
|
| 68 |
+
#define PFN_cuGraphicsVDPAURegisterOutputSurface PFN_cuGraphicsVDPAURegisterOutputSurface_v3010
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
/**
|
| 72 |
+
* Type definitions for functions defined in cudaVDPAU.h
|
| 73 |
+
*/
|
| 74 |
+
typedef CUresult (CUDAAPI *PFN_cuVDPAUGetDevice_v3010)(CUdevice_v1 *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 75 |
+
typedef CUresult (CUDAAPI *PFN_cuVDPAUCtxCreate_v3020)(CUcontext *pCtx, unsigned int flags, CUdevice_v1 device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 76 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsVDPAURegisterVideoSurface_v3010)(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
|
| 77 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsVDPAURegisterOutputSurface_v3010)(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
|
| 78 |
+
|
| 79 |
+
/*
|
| 80 |
+
* Type definitions for older versioned functions in cudaVDPAU.h
|
| 81 |
+
*/
|
| 82 |
+
#if defined(__CUDA_API_VERSION_INTERNAL)
|
| 83 |
+
typedef CUresult (CUDAAPI *PFN_cuVDPAUCtxCreate_v3010)(CUcontext *pCtx, unsigned int flags, CUdevice_v1 device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 84 |
+
#endif
|
| 85 |
+
|
| 86 |
+
#ifdef __cplusplus
|
| 87 |
+
}
|
| 88 |
+
#endif // __cplusplus
|
| 89 |
+
|
| 90 |
+
#endif // file guard
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDA_AWBARRIER_H_
|
| 51 |
+
# define _CUDA_AWBARRIER_H_
|
| 52 |
+
|
| 53 |
+
# include "cuda_awbarrier_primitives.h"
|
| 54 |
+
|
| 55 |
+
# if !defined(_CUDA_AWBARRIER_SM_TARGET)
|
| 56 |
+
# error This file requires compute capability 7.0 or greater.
|
| 57 |
+
# endif
|
| 58 |
+
|
| 59 |
+
# if !defined(_CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER)
|
| 60 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 61 |
+
-std=c++11 compiler option.
|
| 62 |
+
# endif
|
| 63 |
+
|
| 64 |
+
_CUDA_AWBARRIER_BEGIN_NAMESPACE
|
| 65 |
+
|
| 66 |
+
// Arrive/wait barrier object. Every member function asserts (in debug builds)
// that the object lives in shared memory. The object is non-copyable; it is
// set up and torn down through the free functions init() and inval().
class awbarrier {
public:
    // Opaque handle returned by arrive() / arrive_and_drop() and consumed by
    // the token-based wait functions.
    class arrival_token {
    public:
        arrival_token() = default;
        ~arrival_token() = default;

        // Pending count decoded from the stored token value.
        _CUDA_AWBARRIER_QUALIFIER uint32_t pending_count() const;

    private:
        // Only awbarrier may mint tokens from a raw 64-bit value.
        _CUDA_AWBARRIER_QUALIFIER arrival_token(uint64_t token);

        uint64_t token;

        friend awbarrier;
    };

    awbarrier() = default;
    awbarrier(const awbarrier&) = delete;
    awbarrier& operator=(const awbarrier&) = delete;
    ~awbarrier() = default;

    // Arrival: both return a token for the phase that was arrived on.
    _CUDA_AWBARRIER_QUALIFIER arrival_token arrive();
    _CUDA_AWBARRIER_QUALIFIER arrival_token arrive_and_drop();

    // Bounded polling: true once the phase is observed complete, false when
    // roughly hint_cycles clock cycles elapsed first.
    _CUDA_AWBARRIER_QUALIFIER bool timed_wait(arrival_token token, uint32_t hint_cycles);
    _CUDA_AWBARRIER_QUALIFIER bool timed_wait_parity(bool phase, uint32_t hint_cycles);

    // Unbounded waits.
    _CUDA_AWBARRIER_QUALIFIER void wait(arrival_token token);
    _CUDA_AWBARRIER_QUALIFIER void arrive_and_wait();

    // Thin wrappers over the internal try_wait helpers.
    _CUDA_AWBARRIER_QUALIFIER bool try_wait(arrival_token token, uint32_t maxSleepNanosec);
    _CUDA_AWBARRIER_QUALIFIER bool try_wait_parity(bool phase, uint32_t maxSleepNanosec);

    // Largest expected count accepted by init().
    _CUDA_AWBARRIER_STATIC_QUALIFIER __host__ constexpr uint32_t max();

private:
    // Raw 64-bit barrier word handed to the internal helpers.
    uint64_t barrier;

    friend _CUDA_AWBARRIER_QUALIFIER void init(awbarrier* barrier, uint32_t expected_count);
    friend _CUDA_AWBARRIER_QUALIFIER void inval(awbarrier* barrier);
    friend class pipeline;
};
|
| 99 |
+
|
| 100 |
+
// Returns the pending count carried by this token.
// Below SM90 the value reported by the internal helper is shifted right by 15
// bits before being returned (init() and arrive() in this header scale counts
// by 1 << 15 on those targets).
_CUDA_AWBARRIER_QUALIFIER
uint32_t awbarrier::arrival_token::pending_count() const
{
    uint32_t count = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(this->token);
#if !(__CUDA_ARCH__ >= 900)
    count >>= 15;
#endif
    return count;
}
|
| 110 |
+
|
| 111 |
+
// Wraps a raw 64-bit token value produced by one of the arrive helpers.
_CUDA_AWBARRIER_QUALIFIER
awbarrier::arrival_token::arrival_token(uint64_t token)
    : token{token}
{}
|
| 116 |
+
|
| 117 |
+
// Initialises `barrier` for `expected_count` arrivals per phase.
//   barrier        - must point into shared memory (checked in debug builds).
//   expected_count - must be in [1, _CUDA_AWBARRIER_MAX_COUNT] (debug-checked).
// On SM90+ the count is passed through unchanged; on older targets the value
// handed to the internal helper is (expected_count << 15) + expected_count.
_CUDA_AWBARRIER_QUALIFIER
void init(awbarrier* barrier, uint32_t expected_count)
{
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
    _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT);

    uint32_t raw_count = expected_count;
#if !(__CUDA_ARCH__ >= 900)
    raw_count += (expected_count << 15);
#endif

    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(&barrier->barrier, raw_count);
}
|
| 131 |
+
|
| 132 |
+
// Invalidates `barrier` by forwarding its raw word to the internal helper.
// `barrier` must point into shared memory (checked in debug builds).
_CUDA_AWBARRIER_QUALIFIER
void inval(awbarrier* barrier)
{
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

    uint64_t* const raw_word = &barrier->barrier;
    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(raw_word);
}
|
| 139 |
+
|
| 140 |
+
// Arrives on the barrier and returns a token for the phase arrived on.
//  - SM90+: a single completing arrive supplies the token.
//  - older targets: a no-complete arrive of 1 << 15 supplies the token, then a
//    completing arrive is issued and its result is discarded.
_CUDA_AWBARRIER_QUALIFIER
awbarrier::arrival_token awbarrier::arrive()
{
    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

#if (__CUDA_ARCH__ >= 900)
    const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
#else
    const uint32_t scaled_arrival = 1 << 15;
    const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<false>(&this->barrier, scaled_arrival);
    (void)_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
#endif

    return arrival_token(token);
}
|
| 156 |
+
|
| 157 |
+
// Same sequence as arrive(), but uses the <true> (drop) variants of the
// internal arrive helpers. Returns a token for the phase arrived on.
//  - SM90+: a single completing arrive-and-drop supplies the token.
//  - older targets: a no-complete arrive-and-drop of 1 << 15 supplies the
//    token, then a completing arrive-and-drop is issued and its result ignored.
_CUDA_AWBARRIER_QUALIFIER
awbarrier::arrival_token awbarrier::arrive_and_drop()
{
    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

#if (__CUDA_ARCH__ >= 900)
    const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
#else
    const uint32_t scaled_arrival = 1 << 15;
    const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<true>(&this->barrier, scaled_arrival);
    (void)_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
#endif

    return arrival_token(token);
}
|
| 173 |
+
|
| 174 |
+
// Polls the barrier until the phase identified by `token` completes or until
// about `hint_cycles` clock64() cycles have elapsed.
// Returns true when completion was observed, false on timeout.
// The barrier is always tested once up front, so hint_cycles == 0 still does a
// single check. After 1024 cycles of pure spinning the loop starts sleeping,
// doubling the sleep from 32 ns while it is below 1 << 20 ns.
_CUDA_AWBARRIER_QUALIFIER
bool awbarrier::timed_wait(arrival_token token, uint32_t hint_cycles)
{
    constexpr uint64_t spin_only_cycles = 1024;
    constexpr uint32_t backoff_cap_ns = 1 << 20;

    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

    if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
        return true;
    }

    const uint64_t t0 = clock64();
    uint32_t backoff_ns = 32;

    for (uint64_t waited = 0; waited < hint_cycles; waited = clock64() - t0) {
        if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
            return true;
        }

        if (waited > spin_only_cycles) {
            __nanosleep(backoff_ns);
            if (backoff_ns < backoff_cap_ns) {
                backoff_ns *= 2;
            }
        }
    }

    return false;
}
|
| 206 |
+
|
| 207 |
+
// Parity flavour of timed_wait(): polls awbarrier_test_wait_parity() with
// `phase` until it reports completion or about `hint_cycles` clock64() cycles
// have elapsed. Returns true on completion, false on timeout.
// Uses the same schedule as timed_wait(): one unconditional check, 1024 cycles
// of spinning, then sleeps doubling from 32 ns while below 1 << 20 ns.
_CUDA_AWBARRIER_QUALIFIER
bool awbarrier::timed_wait_parity(bool phase, uint32_t hint_cycles)
{
    constexpr uint64_t spin_only_cycles = 1024;
    constexpr uint32_t backoff_cap_ns = 1 << 20;

    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

    if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait_parity(&this->barrier, phase)) {
        return true;
    }

    const uint64_t t0 = clock64();
    uint32_t backoff_ns = 32;

    for (uint64_t waited = 0; waited < hint_cycles; waited = clock64() - t0) {
        if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait_parity(&this->barrier, phase)) {
            return true;
        }

        if (waited > spin_only_cycles) {
            __nanosleep(backoff_ns);
            if (backoff_ns < backoff_cap_ns) {
                backoff_ns *= 2;
            }
        }
    }

    return false;
}
|
| 239 |
+
|
| 240 |
+
// Forwards to the internal awbarrier_try_wait() helper with this barrier's raw
// word, the raw token value and `maxSleepNanosec`; returns its result.
_CUDA_AWBARRIER_QUALIFIER
bool awbarrier::try_wait(arrival_token token, uint32_t maxSleepNanosec)
{
    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

    const bool completed =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_try_wait(&this->barrier, token.token, maxSleepNanosec);
    return completed;
}
|
| 247 |
+
|
| 248 |
+
// Forwards to the internal awbarrier_try_wait_parity() helper with this
// barrier's raw word, `phase` and `maxSleepNanosec`; returns its result.
_CUDA_AWBARRIER_QUALIFIER
bool awbarrier::try_wait_parity(bool phase, uint32_t maxSleepNanosec)
{
    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

    const bool completed =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_try_wait_parity(&this->barrier, phase, maxSleepNanosec);
    return completed;
}
|
| 255 |
+
|
| 256 |
+
// Blocks until the phase identified by `token` completes, by calling
// timed_wait() with the largest 32-bit cycle budget until it returns true.
_CUDA_AWBARRIER_QUALIFIER
void awbarrier::wait(arrival_token token)
{
    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

    bool completed;
    do {
        completed = timed_wait(token, ~0u);
    } while (!completed);
}
|
| 263 |
+
|
| 264 |
+
// Arrives on the barrier, then waits on the token that arrival returned.
_CUDA_AWBARRIER_QUALIFIER
void awbarrier::arrive_and_wait()
{
    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

    const arrival_token my_arrival = this->arrive();
    this->wait(my_arrival);
}
|
| 271 |
+
|
| 272 |
+
// Largest expected count init() accepts: the value of _CUDA_AWBARRIER_MAX_COUNT.
// Kept as a single return statement so the function stays a valid C++11 constexpr.
_CUDA_AWBARRIER_QUALIFIER __host__
constexpr uint32_t awbarrier::max()
{
    return static_cast<uint32_t>(_CUDA_AWBARRIER_MAX_COUNT);
}
|
| 277 |
+
|
| 278 |
+
_CUDA_AWBARRIER_END_NAMESPACE
|
| 279 |
+
|
| 280 |
+
#endif /* !_CUDA_AWBARRIER_H_ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h
ADDED
|
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDA_AWBARRIER_HELPERS_H_
|
| 51 |
+
#define _CUDA_AWBARRIER_HELPERS_H_
|
| 52 |
+
|
| 53 |
+
// ---- Namespaces -----------------------------------------------------------
// Public API lives in nvcuda::experimental; helpers in a nested
// __awbarrier_internal namespace.
#define _CUDA_AWBARRIER_NAMESPACE nvcuda::experimental
#define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
#define _CUDA_AWBARRIER_END_NAMESPACE } }

#define _CUDA_AWBARRIER_INTERNAL_NAMESPACE _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal
#define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal {
#define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE } _CUDA_AWBARRIER_END_NAMESPACE

// ---- Function qualifiers (overridable by defining them beforehand) ----------
#if !defined(_CUDA_AWBARRIER_QUALIFIER)
# define _CUDA_AWBARRIER_QUALIFIER inline __device__
#endif
#if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER)
# define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__
#endif

// ---- Target selection -------------------------------------------------------
// Device passes below compute capability 7.0 leave _CUDA_AWBARRIER_SM_TARGET
// undefined; passes without __CUDA_ARCH__ get the SM_70 value.
#if defined(__CUDA_ARCH__)
# if (__CUDA_ARCH__ >= 900)
#  define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_90
# elif (__CUDA_ARCH__ >= 800)
#  define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_80
# elif (__CUDA_ARCH__ >= 700)
#  define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
# endif
#else
# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
#endif

// Largest expected count accepted by the public init() wrapper.
#define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1)

// ---- Language level ---------------------------------------------------------
#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
# define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER
#endif

// ---- Debug assertions -------------------------------------------------------
// _CUDA_AWBARRIER_DEBUG defaults to 1 when __CUDACC_DEBUG__ is defined, else 0.
#if !defined(_CUDA_AWBARRIER_DEBUG)
# if defined(__CUDACC_DEBUG__)
#  define _CUDA_AWBARRIER_DEBUG 1
# else
#  define _CUDA_AWBARRIER_DEBUG 0
# endif
#endif

// NOTE: both macros expand to a complete statement, trailing ';' included.
#if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG)
# if !defined(__CUDACC_RTC__)
#  include <cassert>
# endif
# define _CUDA_AWBARRIER_ASSERT(x) assert((x));
# define _CUDA_AWBARRIER_ABORT() assert(0);
#else
# define _CUDA_AWBARRIER_ASSERT(x)
# define _CUDA_AWBARRIER_ABORT() __trap();
#endif

// ---- Fixed-width integers ---------------------------------------------------
// Declared by hand when __CUDACC_RTC__ is defined; <stdint.h> otherwise.
#if defined(__CUDACC_RTC__)
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef uint64_t uintptr_t;
#else
# include <stdint.h>
#endif

// implicitly provided by NVRTC
#ifndef __CUDACC_RTC__
#include <nv/target>
#endif /* !defined(__CUDACC_RTC__) */

// Raw barrier word and arrival token types used by the __mbarrier_* primitives.
typedef uint64_t __mbarrier_t;
typedef uint64_t __mbarrier_token_t;
|
| 121 |
+
|
| 122 |
+
_CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE
|
| 123 |
+
|
| 124 |
+
extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
|
| 125 |
+
|
| 126 |
+
// View of the 64-bit barrier word used by the SM70 software paths in this
// file: two 32-bit counters overlaid on the raw value.
union AWBarrier {
    struct {
        // Set to 0x40000000 - expected_count by awbarrier_init().
        uint32_t expected;
        // Set to 0x80000000 - expected_count by awbarrier_init(); bit 31 is
        // what awbarrier_test_wait() compares against the token.
        uint32_t pending;
    } split;

    // The whole barrier word.
    uint64_t raw;
};
|
| 133 |
+
|
| 134 |
+
// Initialises the raw barrier word for `expected_count` arrivals.
//   barrier        - must point into shared memory (debug-checked).
//   expected_count - must satisfy 0 < expected_count < (1 << 29) (debug-checked).
// SM80+ issues the PTX mbarrier.init instruction; SM70 seeds the two software
// counters of the AWBarrier view instead.
_CUDA_AWBARRIER_STATIC_QUALIFIER
void awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
    _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));

    // Hardware barrier path.
    NV_IF_TARGET(NV_PROVIDES_SM_80,
        asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
                :
                : "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count)
                : "memory");
        return;
    )
    // Software emulation path.
    NV_IF_TARGET(NV_PROVIDES_SM_70,
        AWBarrier* state = reinterpret_cast<AWBarrier*>(barrier);

        state->split.expected = 0x40000000 - expected_count;
        state->split.pending = 0x80000000 - expected_count;
        return;
    )
}
|
| 154 |
+
|
| 155 |
+
// Invalidates the barrier word. SM80+ issues the PTX mbarrier.inval
// instruction; on other targets the function does nothing.
// `barrier` must point into shared memory (debug-checked).
_CUDA_AWBARRIER_STATIC_QUALIFIER
void awbarrier_inval(uint64_t* barrier) {
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

    NV_IF_TARGET(NV_PROVIDES_SM_80,
        asm volatile ("mbarrier.inval.shared.b64 [%0];"
                :
                : "r"(__nvvm_get_smem_pointer(barrier))
                : "memory");
        return;
    )
    // No software state to tear down on the remaining targets.
}
|
| 168 |
+
|
| 169 |
+
// Extracts the pending count from an arrival token.
// SM80+ asks the hardware via PTX mbarrier.pending_count; SM70 reads the
// count from the upper 32 bits of the token, where the software arrive paths
// store the pre-arrival `pending` counter.
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint32_t awbarrier_token_pending_count(uint64_t token) {
    NV_IF_TARGET(NV_PROVIDES_SM_80,
        uint32_t __count;

        asm ("mbarrier.pending_count.b64 %0, %1;"
                : "=r"(__count)
                : "l"(token));
        return __count;
    )
    NV_IF_TARGET(NV_PROVIDES_SM_70,
        const uint32_t stored_pending = static_cast<uint32_t>(token >> 32);
        // Drop bit 31, then measure the distance to 0x80000000.
        return 0x80000000 - (stored_pending & 0x7fffffff);
    )
}
|
| 184 |
+
|
| 185 |
+
// Performs one completing arrival on the barrier and returns the token for
// the phase arrived on. With _Drop == true the arrive_drop flavour is used.
//   barrier - must point into shared memory (debug-checked).
// SM80+ maps to PTX mbarrier.arrive / mbarrier.arrive_drop. SM70 emulates the
// barrier with block-scope atomics on the AWBarrier counters; the statement
// order of that path is significant and is kept as in the original.
template<bool _Drop>
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint64_t awbarrier_arrive_drop(uint64_t* barrier) {
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

    NV_IF_TARGET(NV_PROVIDES_SM_80,
        uint64_t hw_token;

        if (_Drop) {
            asm volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];"
                    : "=l"(hw_token)
                    : "r"(__nvvm_get_smem_pointer(barrier))
                    : "memory");
        } else {
            asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
                    : "=l"(hw_token)
                    : "r"(__nvvm_get_smem_pointer(barrier))
                    : "memory");
        }

        return hw_token;
    )
    NV_IF_TARGET(NV_PROVIDES_SM_70,
        AWBarrier* state = reinterpret_cast<AWBarrier*>(barrier);

        // Spin while the low 31 bits of `pending` read as zero.
        // NOTE(review): presumably the window between a phase flip and the
        // re-arm below - confirm.
        while ((*reinterpret_cast<volatile uint32_t*>(&state->split.pending) & 0x7fffffff) == 0);

        if (_Drop) {
            (void)atomicAdd_block(&state->split.expected, 1);
        }

        __threadfence_block();

        const uint32_t before = atomicAdd_block(&state->split.pending, 1);
        const uint32_t after = before + 1;

        // Bit 31 of `pending` flipped: this arrival re-arms the counter.
        if ((before ^ after) & 0x80000000) {
            __threadfence_block();

            uint32_t rearm = *reinterpret_cast<volatile uint32_t*>(&state->split.expected);
            // Clear bit 30, then set it again only when bit 29 is set.
            rearm &= ~0x40000000;
            if (rearm & 0x20000000) {
                rearm |= 0x40000000;
            }
            atomicAdd_block(&state->split.pending, rearm);
        }

        // The token carries the pre-arrival `pending` value in its high word.
        return static_cast<uint64_t>(before) << 32;
    )
}
|
| 236 |
+
|
| 237 |
+
// Registers `count` arrivals using the noComplete flavour and returns the
// resulting token. With _Drop == true the arrive_drop flavour is used.
//   barrier - must point into shared memory (debug-checked).
//   count   - must satisfy 0 < count < (1 << 29) (debug-checked).
// SM80+ maps to PTX mbarrier.arrive(.drop).noComplete; SM70 adds `count` to
// the software counters and, unlike awbarrier_arrive_drop(), has no re-arm step.
template<bool _Drop>
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
    _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));

    NV_IF_TARGET(NV_PROVIDES_SM_80,
        uint64_t hw_token;

        if (_Drop) {
            asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;"
                    : "=l"(hw_token)
                    : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
                    : "memory");
        } else {
            asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
                    : "=l"(hw_token)
                    : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
                    : "memory");
        }

        return hw_token;
    )
    NV_IF_TARGET(NV_PROVIDES_SM_70,
        AWBarrier* state = reinterpret_cast<AWBarrier*>(barrier);

        // Spin while the low 31 bits of `pending` read as zero.
        while ((*reinterpret_cast<volatile uint32_t*>(&state->split.pending) & 0x7fffffff) == 0);

        if (_Drop) {
            (void)atomicAdd_block(&state->split.expected, count);
        }

        // The token carries the pre-arrival `pending` value in its high word.
        const uint32_t before = atomicAdd_block(&state->split.pending, count);
        return static_cast<uint64_t>(before) << 32;
    )
}
|
| 272 |
+
|
| 273 |
+
// Non-blocking check: has the phase identified by `token` completed?
//   barrier - must point into shared memory (debug-checked).
// SM80+ uses PTX mbarrier.test_wait; SM70 compares bit 31 of the token's
// stored `pending` value with bit 31 of the live `pending` counter.
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

    NV_IF_TARGET(NV_PROVIDES_SM_80,
        uint32_t __done;

        asm volatile ("{"
                " .reg .pred %%p;"
                " mbarrier.test_wait.shared.b64 %%p, [%1], %2;"
                " selp.b32 %0, 1, 0, %%p;"
                "}"
                : "=r"(__done)
                : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token)
                : "memory");
        return static_cast<bool>(__done);
    )
    NV_IF_TARGET(NV_PROVIDES_SM_70,
        volatile AWBarrier* state = reinterpret_cast<volatile AWBarrier*>(barrier);

        // A differing bit 31 means the phase has flipped since the token was taken.
        const uint32_t live_pending = state->split.pending;
        return ((token >> 32) ^ live_pending) & 0x80000000;
    )
}
|
| 296 |
+
|
| 297 |
+
// Non-blocking parity check via PTX mbarrier.test_wait.parity.
//   barrier      - must point into shared memory (debug-checked).
//   phase_parity - parity value passed to the instruction as a 32-bit operand.
// Only an SM90+ path exists; on any other target the function executes
// _CUDA_AWBARRIER_ABORT() and then returns false.
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool awbarrier_test_wait_parity(uint64_t* barrier, bool phase_parity) {
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

    NV_IF_TARGET(NV_PROVIDES_SM_90,
        uint32_t __done = 0;

        asm volatile ("{"
                ".reg .pred %%p;"
                "mbarrier.test_wait.parity.shared.b64 %%p, [%1], %2;"
                "selp.b32 %0, 1, 0, %%p;"
                "}"
                : "=r"(__done)
                : "r"(__nvvm_get_smem_pointer(barrier)), "r"(static_cast<uint32_t>(phase_parity))
                : "memory");

        return __done;
    )
    // Unsupported target.
    _CUDA_AWBARRIER_ABORT()
    return false;
}
|
| 318 |
+
|
| 319 |
+
// Token-based wait attempt via PTX mbarrier.try_wait.
//   barrier           - must point into shared memory (debug-checked).
//   token             - arrival token identifying the phase to wait for.
//   max_sleep_nanosec - passed to the instruction as its last operand.
// Only an SM90+ path exists; on any other target the function executes
// _CUDA_AWBARRIER_ABORT() and then returns false.
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool awbarrier_try_wait(uint64_t* barrier, uint64_t token, uint32_t max_sleep_nanosec) {
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

    NV_IF_TARGET(NV_PROVIDES_SM_90,
        uint32_t __done = 0;

        asm volatile ("{\n\t"
                ".reg .pred p;\n\t"
                "mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n\t"
                "selp.b32 %0, 1, 0, p;\n\t"
                "}"
                : "=r"(__done)
                : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token), "r"(max_sleep_nanosec)
                : "memory");

        return __done;
    )
    // Unsupported target.
    _CUDA_AWBARRIER_ABORT()
    return false;
}
|
| 340 |
+
|
| 341 |
+
// Parity-based wait attempt via PTX mbarrier.try_wait.parity.
//   barrier           - must point into shared memory (debug-checked).
//   phase_parity      - parity value passed to the instruction as a 32-bit operand.
//   max_sleep_nanosec - passed to the instruction as its last operand.
// Only an SM90+ path exists; on any other target the function executes
// _CUDA_AWBARRIER_ABORT() and then returns false.
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool awbarrier_try_wait_parity(uint64_t* barrier, bool phase_parity, uint32_t max_sleep_nanosec) {
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

    NV_IF_TARGET(NV_PROVIDES_SM_90,
        uint32_t __done = 0;

        asm volatile ("{\n\t"
                ".reg .pred p;\n\t"
                "mbarrier.try_wait.parity.shared.b64 p, [%1], %2, %3;\n\t"
                "selp.b32 %0, 1, 0, p;\n\t"
                "}"
                : "=r"(__done)
                : "r"(__nvvm_get_smem_pointer(barrier)), "r"(static_cast<uint32_t>(phase_parity)), "r"(max_sleep_nanosec)
                : "memory");

        return __done;
    )
    // Unsupported target.
    _CUDA_AWBARRIER_ABORT()
    return false;
}
|
| 362 |
+
|
| 363 |
+
_CUDA_AWBARRIER_END_INTERNAL_NAMESPACE
|
| 364 |
+
|
| 365 |
+
#endif /* !_CUDA_AWBARRIER_HELPERS_H_ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDA_AWBARRIER_PRIMITIVES_H_
|
| 51 |
+
#define _CUDA_AWBARRIER_PRIMITIVES_H_
|
| 52 |
+
|
| 53 |
+
#include "cuda_awbarrier_helpers.h"
|
| 54 |
+
|
| 55 |
+
#if !defined(_CUDA_AWBARRIER_SM_TARGET)
|
| 56 |
+
# error This file requires compute capability 7.0 or greater.
|
| 57 |
+
#endif
|
| 58 |
+
|
| 59 |
+
/*
 * Reports the value of the _CUDA_AWBARRIER_MAX_COUNT macro.
 * Carries an explicit __host__ qualifier on top of
 * _CUDA_AWBARRIER_STATIC_QUALIFIER (both macros are defined outside this
 * file; presumably in cuda_awbarrier_helpers.h -- confirm).
 */
_CUDA_AWBARRIER_STATIC_QUALIFIER __host__
uint32_t __mbarrier_maximum_count()
{
    const uint32_t max_count = _CUDA_AWBARRIER_MAX_COUNT;
    return max_count;
}
|
| 63 |
+
|
| 64 |
+
/*
 * Public wrapper: hands `barrier` and `expected_count` unchanged to the
 * internal awbarrier_init helper. Produces no return value.
 */
_CUDA_AWBARRIER_STATIC_QUALIFIER
void __mbarrier_init(__mbarrier_t* barrier, uint32_t expected_count)
{
    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(barrier, expected_count);
}
|
| 68 |
+
|
| 69 |
+
/*
 * Public wrapper: passes `barrier` to the internal awbarrier_inval helper.
 * Produces no return value.
 */
_CUDA_AWBARRIER_STATIC_QUALIFIER
void __mbarrier_inval(__mbarrier_t* barrier)
{
    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(barrier);
}
|
| 73 |
+
|
| 74 |
+
/*
 * Public wrapper: calls the internal awbarrier_arrive_drop helper with its
 * template flag set to false and returns the token that helper yields.
 */
_CUDA_AWBARRIER_STATIC_QUALIFIER
__mbarrier_token_t __mbarrier_arrive(__mbarrier_t* barrier)
{
    const __mbarrier_token_t arrival_token =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(barrier);
    return arrival_token;
}
|
| 78 |
+
|
| 79 |
+
/*
 * Public wrapper: calls the internal awbarrier_arrive_drop helper with its
 * template flag set to true and returns the token that helper yields.
 */
_CUDA_AWBARRIER_STATIC_QUALIFIER
__mbarrier_token_t __mbarrier_arrive_and_drop(__mbarrier_t* barrier)
{
    const __mbarrier_token_t arrival_token =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(barrier);
    return arrival_token;
}
|
| 83 |
+
|
| 84 |
+
/*
 * Public wrapper: forwards `barrier` and `token` to the internal
 * awbarrier_test_wait helper and returns its boolean result.
 */
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool __mbarrier_test_wait(__mbarrier_t* barrier, __mbarrier_token_t token)
{
    const bool wait_result =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(barrier, token);
    return wait_result;
}
|
| 88 |
+
|
| 89 |
+
/*
 * Public wrapper: forwards `token` to the internal
 * awbarrier_token_pending_count helper and returns the count it reports.
 */
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint32_t __mbarrier_token_pending_count(__mbarrier_token_t token)
{
    const uint32_t pending =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(token);
    return pending;
}
|
| 93 |
+
|
| 94 |
+
/*
 * Public wrapper: forwards `barrier` and `phase_parity` to the internal
 * awbarrier_test_wait_parity helper and returns its boolean result.
 */
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool __mbarrier_test_wait_parity(__mbarrier_t* barrier, bool phase_parity)
{
    const bool wait_result =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait_parity(barrier, phase_parity);
    return wait_result;
}
|
| 98 |
+
|
| 99 |
+
/*
 * Public wrapper: forwards `barrier`, `token` and `max_sleep_nanosec` to the
 * internal awbarrier_try_wait helper and returns its boolean result.
 */
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool __mbarrier_try_wait(__mbarrier_t* barrier, __mbarrier_token_t token, uint32_t max_sleep_nanosec)
{
    const bool wait_result =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_try_wait(barrier, token, max_sleep_nanosec);
    return wait_result;
}
|
| 103 |
+
|
| 104 |
+
/*
 * Public wrapper: forwards `barrier`, `phase_parity` and `max_sleep_nanosec`
 * to the internal awbarrier_try_wait_parity helper and returns its boolean
 * result.
 */
_CUDA_AWBARRIER_STATIC_QUALIFIER
bool __mbarrier_try_wait_parity(__mbarrier_t* barrier, bool phase_parity, uint32_t max_sleep_nanosec)
{
    const bool wait_result =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_try_wait_parity(barrier, phase_parity, max_sleep_nanosec);
    return wait_result;
}
|
| 108 |
+
|
| 109 |
+
#endif /* !_CUDA_AWBARRIER_PRIMITIVES_H_ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_device_runtime_api.h
ADDED
|
@@ -0,0 +1,889 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
|
| 51 |
+
#define __CUDA_DEVICE_RUNTIME_API_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__CUDACC__) && !defined(__CUDACC_RTC__)
|
| 54 |
+
#include <stdlib.h>
|
| 55 |
+
#endif
|
| 56 |
+
|
| 57 |
+
/*******************************************************************************
|
| 58 |
+
* *
|
| 59 |
+
* *
|
| 60 |
+
* *
|
| 61 |
+
*******************************************************************************/
|
| 62 |
+
|
| 63 |
+
#if !defined(CUDA_FORCE_CDP1_IF_SUPPORTED) && !defined(__CUDADEVRT_INTERNAL__) && !defined(_NVHPC_CUDA) && !(defined(_WIN32) && !defined(_WIN64))
|
| 64 |
+
#define __CUDA_INTERNAL_USE_CDP2
|
| 65 |
+
#endif
|
| 66 |
+
|
| 67 |
+
#if !defined(__CUDACC_RTC__)
|
| 68 |
+
|
| 69 |
+
#if !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
|
| 70 |
+
|
| 71 |
+
#if defined(__cplusplus)
|
| 72 |
+
extern "C" {
|
| 73 |
+
#endif
|
| 74 |
+
|
| 75 |
+
struct cudaFuncAttributes;
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
#ifndef __CUDA_INTERNAL_USE_CDP2
|
| 79 |
+
/* Stub emitted under the enclosing preprocessor guard: ignores both
 * arguments and unconditionally reports cudaErrorUnknown. */
inline __device__ cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s) {
    (void)p;
    (void)s;
    return cudaErrorUnknown;
}
|
| 83 |
+
|
| 84 |
+
/* Stub emitted under the enclosing preprocessor guard: ignores both
 * arguments and unconditionally reports cudaErrorUnknown. */
inline __device__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c) {
    (void)p;
    (void)c;
    return cudaErrorUnknown;
}
|
| 88 |
+
|
| 89 |
+
/* Stub emitted under the enclosing preprocessor guard: ignores all
 * arguments and unconditionally reports cudaErrorUnknown. */
inline __device__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
    (void)value;
    (void)attr;
    (void)device;
    return cudaErrorUnknown;
}
|
| 93 |
+
|
| 94 |
+
/* Stub emitted under the enclosing preprocessor guard: ignores its
 * argument and unconditionally reports cudaErrorUnknown. */
inline __device__ cudaError_t CUDARTAPI cudaGetDevice(int *device) {
    (void)device;
    return cudaErrorUnknown;
}
|
| 98 |
+
|
| 99 |
+
/* Stub emitted under the enclosing preprocessor guard: ignores all
 * arguments and unconditionally reports cudaErrorUnknown. */
inline __device__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize) {
    (void)numBlocks;
    (void)func;
    (void)blockSize;
    (void)dynamicSmemSize;
    return cudaErrorUnknown;
}
|
| 103 |
+
|
| 104 |
+
/* Stub emitted under the enclosing preprocessor guard: ignores all
 * arguments and unconditionally reports cudaErrorUnknown. */
inline __device__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags) {
    (void)numBlocks;
    (void)func;
    (void)blockSize;
    (void)dynamicSmemSize;
    (void)flags;
    return cudaErrorUnknown;
}
|
| 108 |
+
#else // __CUDA_INTERNAL_USE_CDP2
|
| 109 |
+
/* CDP2-named stub emitted under the enclosing preprocessor guard: ignores
 * both arguments and unconditionally reports cudaErrorUnknown. */
inline __device__ cudaError_t CUDARTAPI __cudaCDP2Malloc(void **p, size_t s) {
    (void)p;
    (void)s;
    return cudaErrorUnknown;
}
|
| 113 |
+
|
| 114 |
+
/* CDP2-named stub emitted under the enclosing preprocessor guard: ignores
 * both arguments and unconditionally reports cudaErrorUnknown. */
inline __device__ cudaError_t CUDARTAPI __cudaCDP2FuncGetAttributes(struct cudaFuncAttributes *p, const void *c) {
    (void)p;
    (void)c;
    return cudaErrorUnknown;
}
|
| 118 |
+
|
| 119 |
+
/* CDP2-named stub emitted under the enclosing preprocessor guard: ignores
 * all arguments and unconditionally reports cudaErrorUnknown. */
inline __device__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
    (void)value;
    (void)attr;
    (void)device;
    return cudaErrorUnknown;
}
|
| 123 |
+
|
| 124 |
+
/* CDP2-named stub emitted under the enclosing preprocessor guard: ignores
 * its argument and unconditionally reports cudaErrorUnknown. */
inline __device__ cudaError_t CUDARTAPI __cudaCDP2GetDevice(int *device) {
    (void)device;
    return cudaErrorUnknown;
}
|
| 128 |
+
|
| 129 |
+
/* CDP2-named stub emitted under the enclosing preprocessor guard: ignores
 * all arguments and unconditionally reports cudaErrorUnknown. */
inline __device__ cudaError_t CUDARTAPI __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize) {
    (void)numBlocks;
    (void)func;
    (void)blockSize;
    (void)dynamicSmemSize;
    return cudaErrorUnknown;
}
|
| 133 |
+
|
| 134 |
+
/* CDP2-named stub emitted under the enclosing preprocessor guard: ignores
 * all arguments and unconditionally reports cudaErrorUnknown. */
inline __device__ cudaError_t CUDARTAPI __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags) {
    (void)numBlocks;
    (void)func;
    (void)blockSize;
    (void)dynamicSmemSize;
    (void)flags;
    return cudaErrorUnknown;
}
|
| 138 |
+
#endif // __CUDA_INTERNAL_USE_CDP2
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
#if defined(__cplusplus)
|
| 142 |
+
}
|
| 143 |
+
#endif
|
| 144 |
+
|
| 145 |
+
#endif /* !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
|
| 146 |
+
|
| 147 |
+
#endif /* !defined(__CUDACC_RTC__) */
|
| 148 |
+
|
| 149 |
+
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
|
| 150 |
+
# define __DEPRECATED__(msg)
|
| 151 |
+
#elif defined(_WIN32)
|
| 152 |
+
# define __DEPRECATED__(msg) __declspec(deprecated(msg))
|
| 153 |
+
#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
|
| 154 |
+
# define __DEPRECATED__(msg) __attribute__((deprecated))
|
| 155 |
+
#else
|
| 156 |
+
# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
|
| 157 |
+
#endif
|
| 158 |
+
|
| 159 |
+
#if defined(__CUDA_ARCH__) && !defined(__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING)
|
| 160 |
+
# define __CDPRT_DEPRECATED(func_name) __DEPRECATED__("Use of "#func_name" from device code is deprecated. Moreover, such use will cause this module to fail to load on sm_90+ devices. If calls to "#func_name" from device code cannot be removed for older devices at this time, you may guard them with __CUDA_ARCH__ macros to remove them only for sm_90+ devices, making sure to generate code for compute_90 for the macros to take effect. Note that this mitigation will no longer work when support for "#func_name" from device code is eventually dropped for all devices. Disable this warning with -D__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING.")
|
| 161 |
+
#else
|
| 162 |
+
# define __CDPRT_DEPRECATED(func_name)
|
| 163 |
+
#endif
|
| 164 |
+
|
| 165 |
+
#if defined(__cplusplus) && defined(__CUDACC__) /* Visible to nvcc front-end only */
|
| 166 |
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only
|
| 167 |
+
|
| 168 |
+
#include "driver_types.h"
|
| 169 |
+
#include "crt/host_defines.h"
|
| 170 |
+
|
| 171 |
+
#define cudaStreamGraphTailLaunch (cudaStream_t)0x0100000000000000
|
| 172 |
+
#define cudaStreamGraphFireAndForget (cudaStream_t)0x0200000000000000
|
| 173 |
+
#define cudaStreamGraphFireAndForgetAsSibling (cudaStream_t)0x0300000000000000
|
| 174 |
+
|
| 175 |
+
#ifdef __CUDA_INTERNAL_USE_CDP2
|
| 176 |
+
#define cudaStreamTailLaunch ((cudaStream_t)0x3) /**< Per-grid stream with a tail launch semantics. Only applicable when used with CUDA Dynamic Parallelism. */
|
| 177 |
+
#define cudaStreamFireAndForget ((cudaStream_t)0x4) /**< Per-grid stream with a fire-and-forget synchronization behavior. Only applicable when used with CUDA Dynamic Parallelism. */
|
| 178 |
+
#endif
|
| 179 |
+
|
| 180 |
+
extern "C"
|
| 181 |
+
{
|
| 182 |
+
|
| 183 |
+
// Symbols beginning with __cudaCDP* should not be used outside
|
| 184 |
+
// this header file. Instead, compile with -DCUDA_FORCE_CDP1_IF_SUPPORTED if
|
| 185 |
+
// CDP1 support is required.
|
| 186 |
+
|
| 187 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaDeviceSynchronizeDeprecationAvoidance(void);
|
| 188 |
+
|
| 189 |
+
#ifndef __CUDA_INTERNAL_USE_CDP2
|
| 190 |
+
//// CDP1 endpoints
|
| 191 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
|
| 192 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
|
| 193 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
|
| 194 |
+
extern __DEPRECATED__("cudaDeviceGetSharedMemConfig deprecated") __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
|
| 195 |
+
#if (__CUDA_ARCH__ < 900) && (defined(CUDA_FORCE_CDP1_IF_SUPPORTED) || (defined(_WIN32) && !defined(_WIN64)))
|
| 196 |
+
// cudaDeviceSynchronize is removed on sm_90+
|
| 197 |
+
extern __device__ __cudart_builtin__ __CDPRT_DEPRECATED(cudaDeviceSynchronize) cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
|
| 198 |
+
#endif
|
| 199 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
|
| 200 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
|
| 201 |
+
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
|
| 202 |
+
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
|
| 203 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
|
| 204 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
|
| 205 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
|
| 206 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
|
| 207 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
|
| 208 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
|
| 209 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
|
| 210 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
|
| 211 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
|
| 212 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
|
| 213 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
|
| 214 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
|
| 215 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
|
| 216 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
|
| 217 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
|
| 218 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
|
| 219 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
|
| 220 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
|
| 221 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
|
| 222 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
|
| 223 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
|
| 224 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
|
| 225 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
|
| 226 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
|
| 227 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
|
| 228 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
|
| 229 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
|
| 230 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
|
| 231 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
|
| 232 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
|
| 233 |
+
#endif // __CUDA_INTERNAL_USE_CDP2
|
| 234 |
+
|
| 235 |
+
//// CDP2 endpoints
|
| 236 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
|
| 237 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetLimit(size_t *pValue, enum cudaLimit limit);
|
| 238 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
|
| 239 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
|
| 240 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2GetLastError(void);
|
| 241 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2PeekAtLastError(void);
|
| 242 |
+
extern __device__ __cudart_builtin__ const char* CUDARTAPI __cudaCDP2GetErrorString(cudaError_t error);
|
| 243 |
+
extern __device__ __cudart_builtin__ const char* CUDARTAPI __cudaCDP2GetErrorName(cudaError_t error);
|
| 244 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2GetDeviceCount(int *count);
|
| 245 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2GetDevice(int *device);
|
| 246 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2StreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
|
| 247 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2StreamDestroy(cudaStream_t stream);
|
| 248 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2StreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
|
| 249 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2StreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
|
| 250 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
|
| 251 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventRecord(cudaEvent_t event, cudaStream_t stream);
|
| 252 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
|
| 253 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
|
| 254 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
|
| 255 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventDestroy(cudaEvent_t event);
|
| 256 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2FuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
|
| 257 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Free(void *devPtr);
|
| 258 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Malloc(void **devPtr, size_t size);
|
| 259 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2MemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
|
| 260 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2MemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
|
| 261 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
|
| 262 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
|
| 263 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
|
| 264 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
|
| 265 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2MemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
|
| 266 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2MemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
|
| 267 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
|
| 268 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
|
| 269 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
|
| 270 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
|
| 271 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2RuntimeGetVersion(int *runtimeVersion);
|
| 272 |
+
extern __device__ __cudart_builtin__ void * CUDARTAPI __cudaCDP2GetParameterBuffer(size_t alignment, size_t size);
|
| 273 |
+
extern __device__ __cudart_builtin__ void * CUDARTAPI __cudaCDP2GetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
|
| 274 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2LaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
|
| 275 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2LaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
|
| 276 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2LaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
|
| 277 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2LaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
|
| 278 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
|
| 279 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream);
|
| 283 |
+
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
|
| 284 |
+
// Variant of cudaGraphLaunch compiled only when
// CUDA_API_PER_THREAD_DEFAULT_STREAM is defined: a zero stream handle is
// replaced by cudaStreamPerThread before delegating to cudaGraphLaunch; any
// other handle is passed through untouched.
static inline __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphLaunch_ptsz(cudaGraphExec_t graphExec, cudaStream_t stream)
{
    const cudaStream_t launch_stream = (stream == 0) ? cudaStreamPerThread : stream;
    return cudaGraphLaunch(graphExec, launch_stream);
}
|
| 291 |
+
#endif
|
| 292 |
+
|
| 293 |
+
/**
|
| 294 |
+
* \ingroup CUDART_GRAPH
|
| 295 |
+
* \brief Get the currently running device graph id.
|
| 296 |
+
*
|
| 297 |
+
* Get the currently running device graph id.
|
| 298 |
+
* \return Returns the current device graph id, 0 if the call is outside of a device graph.
|
| 299 |
+
* \sa cudaGraphLaunch
|
| 300 |
+
*/
|
| 301 |
+
static inline __device__ __cudart_builtin__ cudaGraphExec_t CUDARTAPI cudaGetCurrentGraphExec(void)
|
| 302 |
+
{
|
| 303 |
+
unsigned long long current_graph_exec;
|
| 304 |
+
asm ("mov.u64 %0, %%current_graph_exec;" : "=l"(current_graph_exec));
|
| 305 |
+
return (cudaGraphExec_t)current_graph_exec;
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
/**
|
| 309 |
+
* \ingroup CUDART_GRAPH
|
| 310 |
+
* \brief Updates the kernel parameters of the given kernel node
|
| 311 |
+
*
|
| 312 |
+
* Updates \p size bytes in the kernel parameters of \p node at \p offset to
|
| 313 |
+
* the contents of \p value. \p node must be device-updatable, and must reside upon the same
|
| 314 |
+
* device as the calling kernel.
|
| 315 |
+
*
|
| 316 |
+
* If this function is called for the node's immediate dependent and that dependent is configured
|
| 317 |
+
* for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
|
| 318 |
+
* kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
|
| 319 |
+
* that the update is visible to that dependent node before it is launched.
|
| 320 |
+
*
|
| 321 |
+
* \param node - The node to update
|
| 322 |
+
* \param offset - The offset into the params at which to make the update
|
| 323 |
+
* \param value - Buffer containing the params to write
|
| 324 |
+
* \param size - Size in bytes to update
|
| 325 |
+
*
|
| 326 |
+
* \return
|
| 327 |
+
* cudaSuccess,
|
| 328 |
+
* cudaErrorInvalidValue
|
| 329 |
+
* \notefnerr
|
| 330 |
+
*
|
| 331 |
+
* \sa
|
| 332 |
+
* ::cudaGraphKernelNodeSetEnabled,
|
| 333 |
+
* ::cudaGraphKernelNodeSetGridDim,
|
| 334 |
+
* ::cudaGraphKernelNodeUpdatesApply
|
| 335 |
+
*/
|
| 336 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParam(cudaGraphDeviceNode_t node, size_t offset, const void *value , size_t size);
|
| 337 |
+
|
| 338 |
+
/**
|
| 339 |
+
* \ingroup CUDART_GRAPH
|
| 340 |
+
* \brief Enables or disables the given kernel node
|
| 341 |
+
*
|
| 342 |
+
* Enables or disables \p node based upon \p enable. If \p enable is true, the node will be enabled;
|
| 343 |
+
* if it is false, the node will be disabled. Disabled nodes will act as a NOP during execution.
|
| 344 |
+
* \p node must be device-updatable, and must reside upon the same device as the calling kernel.
|
| 345 |
+
*
|
| 346 |
+
* If this function is called for the node's immediate dependent and that dependent is configured
|
| 347 |
+
* for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
|
| 348 |
+
* kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
|
| 349 |
+
* that the update is visible to that dependent node before it is launched.
|
| 350 |
+
*
|
| 351 |
+
* \param node - The node to update
|
| 352 |
+
* \param enable - Whether to enable or disable the node
|
| 353 |
+
*
|
| 354 |
+
* \return
|
| 355 |
+
* cudaSuccess,
|
| 356 |
+
* cudaErrorInvalidValue
|
| 357 |
+
* \notefnerr
|
| 358 |
+
*
|
| 359 |
+
* \sa
|
| 360 |
+
* ::cudaGraphKernelNodeSetParam,
|
| 361 |
+
* ::cudaGraphKernelNodeSetGridDim,
|
| 362 |
+
* ::cudaGraphKernelNodeUpdatesApply
|
| 363 |
+
*/
|
| 364 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetEnabled(cudaGraphDeviceNode_t node, bool enable);
|
| 365 |
+
|
| 366 |
+
/**
|
| 367 |
+
* \ingroup CUDART_GRAPH
|
| 368 |
+
* \brief Updates the grid dimensions of the given kernel node
|
| 369 |
+
*
|
| 370 |
+
* Sets the grid dimensions of \p node to \p gridDim. \p node must be device-updatable,
|
| 371 |
+
* and must reside upon the same device as the calling kernel.
|
| 372 |
+
*
|
| 373 |
+
* If this function is called for the node's immediate dependent and that dependent is configured
|
| 374 |
+
* for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
|
| 375 |
+
* kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
|
| 376 |
+
* that the update is visible to that dependent node before it is launched.
|
| 377 |
+
*
|
| 378 |
+
* \param node - The node to update
|
| 379 |
+
* \param gridDim - The grid dimensions to set
|
| 380 |
+
*
|
| 381 |
+
* \return
|
| 382 |
+
* cudaSuccess,
|
| 383 |
+
* cudaErrorInvalidValue
|
| 384 |
+
* \notefnerr
|
| 385 |
+
*
|
| 386 |
+
* \sa
|
| 387 |
+
* ::cudaGraphKernelNodeSetParam,
|
| 388 |
+
* ::cudaGraphKernelNodeSetEnabled,
|
| 389 |
+
* ::cudaGraphKernelNodeUpdatesApply
|
| 390 |
+
*/
|
| 391 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetGridDim(cudaGraphDeviceNode_t node, dim3 gridDim);
|
| 392 |
+
|
| 393 |
+
/**
|
| 394 |
+
* \ingroup CUDART_GRAPH
|
| 395 |
+
* \brief Batch applies multiple kernel node updates
|
| 396 |
+
*
|
| 397 |
+
* Batch applies one or more kernel node updates based on the information provided in \p updates.
|
| 398 |
+
* \p updateCount specifies the number of updates to apply. Each entry in \p updates must specify
|
| 399 |
+
* a node to update, the type of update to apply, and the parameters for that type of update. See
|
| 400 |
+
* the documentation for ::cudaGraphKernelNodeUpdate for more detail.
|
| 401 |
+
*
|
| 402 |
+
* If this function is called for the node's immediate dependent and that dependent is configured
|
| 403 |
+
* for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
|
| 404 |
+
* kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
|
| 405 |
+
* that the update is visible to that dependent node before it is launched.
|
| 406 |
+
*
|
| 407 |
+
* \param updates - The updates to apply
|
| 408 |
+
* \param updateCount - The number of updates to apply
|
| 409 |
+
*
|
| 410 |
+
* \return
|
| 411 |
+
* cudaSuccess,
|
| 412 |
+
* cudaErrorInvalidValue
|
| 413 |
+
* \notefnerr
|
| 414 |
+
*
|
| 415 |
+
* \sa
|
| 416 |
+
* ::cudaGraphKernelNodeSetParam,
|
| 417 |
+
* ::cudaGraphKernelNodeSetEnabled,
|
| 418 |
+
* ::cudaGraphKernelNodeSetGridDim
|
| 419 |
+
*/
|
| 420 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeUpdatesApply(const cudaGraphKernelNodeUpdate *updates, size_t updateCount);
|
| 421 |
+
|
| 422 |
+
/**
|
| 423 |
+
* \ingroup CUDART_EXECUTION
|
| 424 |
+
* \brief Programmatic dependency trigger
|
| 425 |
+
*
|
| 426 |
+
* This device function ensures the programmatic launch completion edges /
|
| 427 |
+
* events are fulfilled. See
|
| 428 |
+
* ::cudaLaunchAttributeID::cudaLaunchAttributeProgrammaticStreamSerialization
|
| 429 |
+
* and ::cudaLaunchAttributeID::cudaLaunchAttributeProgrammaticEvent for more
|
| 430 |
+
* information. The event / edge kick off only happens when every CTA
|
| 431 |
+
* in the grid has either exited or called this function at least once,
|
| 432 |
+
* otherwise the kick off happens automatically after all warps finish
|
| 433 |
+
* execution but before the grid completes. The kick off only enables
|
| 434 |
+
* scheduling of the secondary kernel. It provides no memory visibility
|
| 435 |
+
* guarantee itself. The user could enforce memory visibility by inserting a
|
| 436 |
+
* memory fence of the correct scope.
|
| 437 |
+
*/
|
| 438 |
+
static inline __device__ __cudart_builtin__ void CUDARTAPI cudaTriggerProgrammaticLaunchCompletion(void)
|
| 439 |
+
{
|
| 440 |
+
asm volatile("griddepcontrol.launch_dependents;":::);
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
/**
|
| 444 |
+
* \ingroup CUDART_EXECUTION
|
| 445 |
+
* \brief Programmatic grid dependency synchronization
|
| 446 |
+
*
|
| 447 |
+
* This device function will block the thread until all direct grid
|
| 448 |
+
* dependencies have completed. This API is intended to be used in conjunction with
|
| 449 |
+
* programmatic / launch event / dependency. See
|
| 450 |
+
* ::cudaLaunchAttributeID::cudaLaunchAttributeProgrammaticStreamSerialization
|
| 451 |
+
* and ::cudaLaunchAttributeID::cudaLaunchAttributeProgrammaticEvent for more
|
| 452 |
+
* information.
|
| 453 |
+
*/
|
| 454 |
+
static inline __device__ __cudart_builtin__ void CUDARTAPI cudaGridDependencySynchronize(void)
|
| 455 |
+
{
|
| 456 |
+
asm volatile("griddepcontrol.wait;":::"memory");
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
/**
|
| 460 |
+
* \ingroup CUDART_GRAPH
|
| 461 |
+
* \brief Sets the condition value associated with a conditional node.
|
| 462 |
+
*
|
| 463 |
+
* Sets the condition value associated with a conditional node.
|
| 464 |
+
* \sa cudaGraphConditionalHandleCreate
|
| 465 |
+
*/
|
| 466 |
+
extern __device__ __cudart_builtin__ void CUDARTAPI cudaGraphSetConditional(cudaGraphConditionalHandle handle, unsigned int value);
|
| 467 |
+
|
| 468 |
+
//// CG API
|
| 469 |
+
extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
|
| 470 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags);
|
| 471 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronizeGrid(unsigned long long handle, unsigned int flags);
|
| 472 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
|
| 473 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
//// CDP API
|
| 477 |
+
|
| 478 |
+
#ifdef __CUDA_ARCH__
|
| 479 |
+
|
| 480 |
+
#ifdef __CUDA_INTERNAL_USE_CDP2
|
| 481 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
|
| 482 |
+
{
|
| 483 |
+
return __cudaCDP2DeviceGetAttribute(value, attr, device);
|
| 484 |
+
}
|
| 485 |
+
|
| 486 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit)
|
| 487 |
+
{
|
| 488 |
+
return __cudaCDP2DeviceGetLimit(pValue, limit);
|
| 489 |
+
}
|
| 490 |
+
|
| 491 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig)
|
| 492 |
+
{
|
| 493 |
+
return __cudaCDP2DeviceGetCacheConfig(pCacheConfig);
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig)
|
| 497 |
+
{
|
| 498 |
+
return __cudaCDP2DeviceGetSharedMemConfig(pConfig);
|
| 499 |
+
}
|
| 500 |
+
|
| 501 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void)
|
| 502 |
+
{
|
| 503 |
+
return __cudaCDP2GetLastError();
|
| 504 |
+
}
|
| 505 |
+
|
| 506 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void)
|
| 507 |
+
{
|
| 508 |
+
return __cudaCDP2PeekAtLastError();
|
| 509 |
+
}
|
| 510 |
+
|
| 511 |
+
static __inline__ __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error)
|
| 512 |
+
{
|
| 513 |
+
return __cudaCDP2GetErrorString(error);
|
| 514 |
+
}
|
| 515 |
+
|
| 516 |
+
static __inline__ __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error)
|
| 517 |
+
{
|
| 518 |
+
return __cudaCDP2GetErrorName(error);
|
| 519 |
+
}
|
| 520 |
+
|
| 521 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count)
|
| 522 |
+
{
|
| 523 |
+
return __cudaCDP2GetDeviceCount(count);
|
| 524 |
+
}
|
| 525 |
+
|
| 526 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device)
|
| 527 |
+
{
|
| 528 |
+
return __cudaCDP2GetDevice(device);
|
| 529 |
+
}
|
| 530 |
+
|
| 531 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags)
|
| 532 |
+
{
|
| 533 |
+
return __cudaCDP2StreamCreateWithFlags(pStream, flags);
|
| 534 |
+
}
|
| 535 |
+
|
| 536 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream)
|
| 537 |
+
{
|
| 538 |
+
return __cudaCDP2StreamDestroy(stream);
|
| 539 |
+
}
|
| 540 |
+
|
| 541 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
|
| 542 |
+
{
|
| 543 |
+
return __cudaCDP2StreamWaitEvent(stream, event, flags);
|
| 544 |
+
}
|
| 545 |
+
|
| 546 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
|
| 547 |
+
{
|
| 548 |
+
return __cudaCDP2StreamWaitEvent_ptsz(stream, event, flags);
|
| 549 |
+
}
|
| 550 |
+
|
| 551 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags)
|
| 552 |
+
{
|
| 553 |
+
return __cudaCDP2EventCreateWithFlags(event, flags);
|
| 554 |
+
}
|
| 555 |
+
|
| 556 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream)
|
| 557 |
+
{
|
| 558 |
+
return __cudaCDP2EventRecord(event, stream);
|
| 559 |
+
}
|
| 560 |
+
|
| 561 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream)
|
| 562 |
+
{
|
| 563 |
+
return __cudaCDP2EventRecord_ptsz(event, stream);
|
| 564 |
+
}
|
| 565 |
+
|
| 566 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags)
|
| 567 |
+
{
|
| 568 |
+
return __cudaCDP2EventRecordWithFlags(event, stream, flags);
|
| 569 |
+
}
|
| 570 |
+
|
| 571 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags)
|
| 572 |
+
{
|
| 573 |
+
return __cudaCDP2EventRecordWithFlags_ptsz(event, stream, flags);
|
| 574 |
+
}
|
| 575 |
+
|
| 576 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event)
|
| 577 |
+
{
|
| 578 |
+
return __cudaCDP2EventDestroy(event);
|
| 579 |
+
}
|
| 580 |
+
|
| 581 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func)
|
| 582 |
+
{
|
| 583 |
+
return __cudaCDP2FuncGetAttributes(attr, func);
|
| 584 |
+
}
|
| 585 |
+
|
| 586 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr)
|
| 587 |
+
{
|
| 588 |
+
return __cudaCDP2Free(devPtr);
|
| 589 |
+
}
|
| 590 |
+
|
| 591 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size)
|
| 592 |
+
{
|
| 593 |
+
return __cudaCDP2Malloc(devPtr, size);
|
| 594 |
+
}
|
| 595 |
+
|
| 596 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream)
|
| 597 |
+
{
|
| 598 |
+
return __cudaCDP2MemcpyAsync(dst, src, count, kind, stream);
|
| 599 |
+
}
|
| 600 |
+
|
| 601 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream)
|
| 602 |
+
{
|
| 603 |
+
return __cudaCDP2MemcpyAsync_ptsz(dst, src, count, kind, stream);
|
| 604 |
+
}
|
| 605 |
+
|
| 606 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream)
|
| 607 |
+
{
|
| 608 |
+
return __cudaCDP2Memcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
|
| 609 |
+
}
|
| 610 |
+
|
| 611 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream)
|
| 612 |
+
{
|
| 613 |
+
return __cudaCDP2Memcpy2DAsync_ptsz(dst, dpitch, src, spitch, width, height, kind, stream);
|
| 614 |
+
}
|
| 615 |
+
|
| 616 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream)
|
| 617 |
+
{
|
| 618 |
+
return __cudaCDP2Memcpy3DAsync(p, stream);
|
| 619 |
+
}
|
| 620 |
+
|
| 621 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream)
|
| 622 |
+
{
|
| 623 |
+
return __cudaCDP2Memcpy3DAsync_ptsz(p, stream);
|
| 624 |
+
}
|
| 625 |
+
|
| 626 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream)
|
| 627 |
+
{
|
| 628 |
+
return __cudaCDP2MemsetAsync(devPtr, value, count, stream);
|
| 629 |
+
}
|
| 630 |
+
|
| 631 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream)
|
| 632 |
+
{
|
| 633 |
+
return __cudaCDP2MemsetAsync_ptsz(devPtr, value, count, stream);
|
| 634 |
+
}
|
| 635 |
+
|
| 636 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream)
|
| 637 |
+
{
|
| 638 |
+
return __cudaCDP2Memset2DAsync(devPtr, pitch, value, width, height, stream);
|
| 639 |
+
}
|
| 640 |
+
|
| 641 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream)
|
| 642 |
+
{
|
| 643 |
+
return __cudaCDP2Memset2DAsync_ptsz(devPtr, pitch, value, width, height, stream);
|
| 644 |
+
}
|
| 645 |
+
|
| 646 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream)
|
| 647 |
+
{
|
| 648 |
+
return __cudaCDP2Memset3DAsync(pitchedDevPtr, value, extent, stream);
|
| 649 |
+
}
|
| 650 |
+
|
| 651 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream)
|
| 652 |
+
{
|
| 653 |
+
return __cudaCDP2Memset3DAsync_ptsz(pitchedDevPtr, value, extent, stream);
|
| 654 |
+
}
|
| 655 |
+
|
| 656 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion)
|
| 657 |
+
{
|
| 658 |
+
return __cudaCDP2RuntimeGetVersion(runtimeVersion);
|
| 659 |
+
}
|
| 660 |
+
|
| 661 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
|
| 662 |
+
{
|
| 663 |
+
return __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSmemSize);
|
| 664 |
+
}
|
| 665 |
+
|
| 666 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
|
| 667 |
+
{
|
| 668 |
+
return __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func, blockSize, dynamicSmemSize, flags);
|
| 669 |
+
}
|
| 670 |
+
#endif // __CUDA_INTERNAL_USE_CDP2
|
| 671 |
+
|
| 672 |
+
#endif // __CUDA_ARCH__
|
| 673 |
+
|
| 674 |
+
|
| 675 |
+
/**
|
| 676 |
+
* \ingroup CUDART_EXECUTION
|
| 677 |
+
* \brief Obtains a parameter buffer
|
| 678 |
+
*
|
| 679 |
+
* Obtains a parameter buffer which can be filled with parameters for a kernel launch.
|
| 680 |
+
* Parameters passed to ::cudaLaunchDevice must be allocated via this function.
|
| 681 |
+
*
|
| 682 |
+
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
|
| 683 |
+
* CUDA user code should use <<< >>> to launch kernels.
|
| 684 |
+
*
|
| 685 |
+
* \param alignment - Specifies alignment requirement of the parameter buffer
|
| 686 |
+
* \param size - Specifies size requirement in bytes
|
| 687 |
+
*
|
| 688 |
+
* \return
|
| 689 |
+
* Returns pointer to the allocated parameterBuffer
|
| 690 |
+
* \notefnerr
|
| 691 |
+
*
|
| 692 |
+
* \sa cudaLaunchDevice
|
| 693 |
+
*/
|
| 694 |
+
#ifdef __CUDA_INTERNAL_USE_CDP2
|
| 695 |
+
static __inline__ __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size)
|
| 696 |
+
{
|
| 697 |
+
return __cudaCDP2GetParameterBuffer(alignment, size);
|
| 698 |
+
}
|
| 699 |
+
#else
|
| 700 |
+
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
|
| 701 |
+
#endif
|
| 702 |
+
|
| 703 |
+
|
| 704 |
+
#ifdef __CUDA_INTERNAL_USE_CDP2
|
| 705 |
+
static __inline__ __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize)
|
| 706 |
+
{
|
| 707 |
+
return __cudaCDP2GetParameterBufferV2(func, gridDimension, blockDimension, sharedMemSize);
|
| 708 |
+
}
|
| 709 |
+
#else
|
| 710 |
+
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
|
| 711 |
+
#endif
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
#ifdef __CUDA_INTERNAL_USE_CDP2
|
| 715 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
|
| 716 |
+
{
|
| 717 |
+
return __cudaCDP2LaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
|
| 718 |
+
}
|
| 719 |
+
|
| 720 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream)
|
| 721 |
+
{
|
| 722 |
+
return __cudaCDP2LaunchDeviceV2_ptsz(parameterBuffer, stream);
|
| 723 |
+
}
|
| 724 |
+
#else
|
| 725 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
|
| 726 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
|
| 727 |
+
#endif
|
| 728 |
+
|
| 729 |
+
|
| 730 |
+
/**
|
| 731 |
+
* \ingroup CUDART_EXECUTION
|
| 732 |
+
* \brief Launches a specified kernel
|
| 733 |
+
*
|
| 734 |
+
* Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
|
| 735 |
+
* by calling ::cudaGetParameterBuffer().
|
| 736 |
+
*
|
| 737 |
+
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
|
| 738 |
+
* CUDA user code should use <<< >>> to launch the kernels.
|
| 739 |
+
*
|
| 740 |
+
* \param func - Pointer to the kernel to be launched
|
| 741 |
+
* \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
|
| 742 |
+
* \param gridDimension - Specifies grid dimensions
|
| 743 |
+
* \param blockDimension - Specifies block dimensions
|
| 744 |
+
* \param sharedMemSize - Specifies size of shared memory
|
| 745 |
+
* \param stream - Specifies the stream to be used
|
| 746 |
+
*
|
| 747 |
+
* \return
|
| 748 |
+
* ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
|
| 749 |
+
* ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
|
| 750 |
+
* \notefnerr
|
| 751 |
+
* \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
|
| 752 |
+
* Guide for the detailed descriptions of launch configuration and parameter layout respectively.
|
| 753 |
+
*
|
| 754 |
+
* \sa cudaGetParameterBuffer
|
| 755 |
+
*/
|
| 756 |
+
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
|
| 757 |
+
// When compiling for the device and per thread default stream is enabled, add
|
| 758 |
+
// a static inline redirect to the per thread stream entry points.
|
| 759 |
+
|
| 760 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
|
| 761 |
+
cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
|
| 762 |
+
{
|
| 763 |
+
#ifdef __CUDA_INTERNAL_USE_CDP2
|
| 764 |
+
return __cudaCDP2LaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
|
| 765 |
+
#else
|
| 766 |
+
return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
|
| 767 |
+
#endif
|
| 768 |
+
}
|
| 769 |
+
|
| 770 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
|
| 771 |
+
cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
|
| 772 |
+
{
|
| 773 |
+
#ifdef __CUDA_INTERNAL_USE_CDP2
|
| 774 |
+
return __cudaCDP2LaunchDeviceV2_ptsz(parameterBuffer, stream);
|
| 775 |
+
#else
|
| 776 |
+
return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
|
| 777 |
+
#endif
|
| 778 |
+
}
|
| 779 |
+
#else // defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
|
| 780 |
+
#ifdef __CUDA_INTERNAL_USE_CDP2
|
| 781 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
|
| 782 |
+
{
|
| 783 |
+
return __cudaCDP2LaunchDevice(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
|
| 784 |
+
}
|
| 785 |
+
|
| 786 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
|
| 787 |
+
{
|
| 788 |
+
return __cudaCDP2LaunchDeviceV2(parameterBuffer, stream);
|
| 789 |
+
}
|
| 790 |
+
#else // __CUDA_INTERNAL_USE_CDP2
|
| 791 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
|
| 792 |
+
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
|
| 793 |
+
#endif // __CUDA_INTERNAL_USE_CDP2
|
| 794 |
+
#endif // defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
|
| 795 |
+
|
| 796 |
+
|
| 797 |
+
// These symbols should not be used outside of this header file.
|
| 798 |
+
#define __cudaCDP2DeviceGetAttribute
|
| 799 |
+
#define __cudaCDP2DeviceGetLimit
|
| 800 |
+
#define __cudaCDP2DeviceGetCacheConfig
|
| 801 |
+
#define __cudaCDP2DeviceGetSharedMemConfig
|
| 802 |
+
#define __cudaCDP2GetLastError
|
| 803 |
+
#define __cudaCDP2PeekAtLastError
|
| 804 |
+
#define __cudaCDP2GetErrorString
|
| 805 |
+
#define __cudaCDP2GetErrorName
|
| 806 |
+
#define __cudaCDP2GetDeviceCount
|
| 807 |
+
#define __cudaCDP2GetDevice
|
| 808 |
+
#define __cudaCDP2StreamCreateWithFlags
|
| 809 |
+
#define __cudaCDP2StreamDestroy
|
| 810 |
+
#define __cudaCDP2StreamWaitEvent
|
| 811 |
+
#define __cudaCDP2StreamWaitEvent_ptsz
|
| 812 |
+
#define __cudaCDP2EventCreateWithFlags
|
| 813 |
+
#define __cudaCDP2EventRecord
|
| 814 |
+
#define __cudaCDP2EventRecord_ptsz
|
| 815 |
+
#define __cudaCDP2EventRecordWithFlags
|
| 816 |
+
#define __cudaCDP2EventRecordWithFlags_ptsz
|
| 817 |
+
#define __cudaCDP2EventDestroy
|
| 818 |
+
#define __cudaCDP2FuncGetAttributes
|
| 819 |
+
#define __cudaCDP2Free
|
| 820 |
+
#define __cudaCDP2Malloc
|
| 821 |
+
#define __cudaCDP2MemcpyAsync
|
| 822 |
+
#define __cudaCDP2MemcpyAsync_ptsz
|
| 823 |
+
#define __cudaCDP2Memcpy2DAsync
|
| 824 |
+
#define __cudaCDP2Memcpy2DAsync_ptsz
|
| 825 |
+
#define __cudaCDP2Memcpy3DAsync
|
| 826 |
+
#define __cudaCDP2Memcpy3DAsync_ptsz
|
| 827 |
+
#define __cudaCDP2MemsetAsync
|
| 828 |
+
#define __cudaCDP2MemsetAsync_ptsz
|
| 829 |
+
#define __cudaCDP2Memset2DAsync
|
| 830 |
+
#define __cudaCDP2Memset2DAsync_ptsz
|
| 831 |
+
#define __cudaCDP2Memset3DAsync
|
| 832 |
+
#define __cudaCDP2Memset3DAsync_ptsz
|
| 833 |
+
#define __cudaCDP2RuntimeGetVersion
|
| 834 |
+
#define __cudaCDP2GetParameterBuffer
|
| 835 |
+
#define __cudaCDP2GetParameterBufferV2
|
| 836 |
+
#define __cudaCDP2LaunchDevice_ptsz
|
| 837 |
+
#define __cudaCDP2LaunchDeviceV2_ptsz
|
| 838 |
+
#define __cudaCDP2LaunchDevice
|
| 839 |
+
#define __cudaCDP2LaunchDeviceV2
|
| 840 |
+
#define __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessor
|
| 841 |
+
#define __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessorWithFlags
|
| 842 |
+
|
| 843 |
+
}
|
| 844 |
+
|
| 845 |
+
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
|
| 846 |
+
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
|
| 847 |
+
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
|
| 848 |
+
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
|
| 849 |
+
|
| 850 |
+
/**
|
| 851 |
+
* \ingroup CUDART_GRAPH
|
| 852 |
+
* \brief Updates the kernel parameters of the given kernel node
|
| 853 |
+
*
|
| 854 |
+
* Updates the kernel parameters of \p node at \p offset to \p value. \p node must be
|
| 855 |
+
* device-updatable, and must reside upon the same device as the calling kernel.
|
| 856 |
+
*
|
| 857 |
+
* If this function is called for the node's immediate dependent and that dependent is configured
|
| 858 |
+
* for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
|
| 859 |
+
* kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
|
| 860 |
+
* that the update is visible to that dependent node before it is launched.
|
| 861 |
+
*
|
| 862 |
+
* \param node - The node to update
|
| 863 |
+
* \param offset - The offset into the params at which to make the update
|
| 864 |
+
* \param value - Parameter value to write
|
| 865 |
+
*
|
| 866 |
+
* \return
|
| 867 |
+
* cudaSuccess,
|
| 868 |
+
* cudaErrorInvalidValue
|
| 869 |
+
* \notefnerr
|
| 870 |
+
*
|
| 871 |
+
* \sa
|
| 872 |
+
* ::cudaGraphKernelNodeSetEnabled,
|
| 873 |
+
* ::cudaGraphKernelNodeSetGridDim,
|
| 874 |
+
* ::cudaGraphKernelNodeUpdatesApply
|
| 875 |
+
*/
|
| 876 |
+
template <typename T>
|
| 877 |
+
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParam(cudaGraphDeviceNode_t node, size_t offset, const T &value)
|
| 878 |
+
{
|
| 879 |
+
return cudaGraphKernelNodeSetParam(node, offset, &value, sizeof(T));
|
| 880 |
+
}
|
| 881 |
+
|
| 882 |
+
#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
|
| 883 |
+
#endif /* defined(__cplusplus) && defined(__CUDACC__) */
|
| 884 |
+
|
| 885 |
+
#undef __DEPRECATED__
|
| 886 |
+
#undef __CDPRT_DEPRECATED
|
| 887 |
+
#undef __CUDA_INTERNAL_USE_CDP2
|
| 888 |
+
|
| 889 |
+
#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_egl_interop.h
ADDED
|
@@ -0,0 +1,642 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_EGL_INTEROP_H__)
|
| 51 |
+
#define __CUDA_EGL_INTEROP_H__
|
| 52 |
+
|
| 53 |
+
#include "cuda_runtime_api.h"
|
| 54 |
+
#include "cuda_runtime.h"
|
| 55 |
+
#include "cudart_platform.h"
|
| 56 |
+
#include "EGL/egl.h"
|
| 57 |
+
#include "EGL/eglext.h"
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus)
|
| 60 |
+
extern "C" {
|
| 61 |
+
#endif /* __cplusplus */
|
| 62 |
+
|
| 63 |
+
/**
|
| 64 |
+
* \addtogroup CUDART_TYPES
|
| 65 |
+
* @{
|
| 66 |
+
*/
|
| 67 |
+
|
| 68 |
+
/**
|
| 69 |
+
* Maximum number of planes per frame
|
| 70 |
+
*/
|
| 71 |
+
#define CUDA_EGL_MAX_PLANES 3
|
| 72 |
+
|
| 73 |
+
/**
|
| 74 |
+
* CUDA EglFrame type - array or pointer
|
| 75 |
+
*/
|
| 76 |
+
typedef enum cudaEglFrameType_enum
|
| 77 |
+
{
|
| 78 |
+
cudaEglFrameTypeArray = 0, /**< Frame type CUDA array */
|
| 79 |
+
cudaEglFrameTypePitch = 1, /**< Frame type CUDA pointer */
|
| 80 |
+
} cudaEglFrameType;
|
| 81 |
+
|
| 82 |
+
/**
|
| 83 |
+
* Resource location flags- sysmem or vidmem
|
| 84 |
+
*
|
| 85 |
+
* For CUDA context on iGPU, since video and system memory are equivalent -
|
| 86 |
+
* these flags will not have an effect on the execution.
|
| 87 |
+
*
|
| 88 |
+
* For CUDA context on dGPU, applications can use the flag ::cudaEglResourceLocationFlags
|
| 89 |
+
* to give a hint about the desired location.
|
| 90 |
+
*
|
| 91 |
+
* ::cudaEglResourceLocationSysmem - the frame data is made resident on the system memory
|
| 92 |
+
* to be accessed by CUDA.
|
| 93 |
+
*
|
| 94 |
+
* ::cudaEglResourceLocationVidmem - the frame data is made resident on the dedicated
|
| 95 |
+
* video memory to be accessed by CUDA.
|
| 96 |
+
*
|
| 97 |
+
* There may be an additional latency due to new allocation and data migration,
|
| 98 |
+
* if the frame is produced on a different memory.
|
| 99 |
+
*/
|
| 100 |
+
typedef enum cudaEglResourceLocationFlags_enum {
|
| 101 |
+
cudaEglResourceLocationSysmem = 0x00, /**< Resource location sysmem */
|
| 102 |
+
cudaEglResourceLocationVidmem = 0x01, /**< Resource location vidmem */
|
| 103 |
+
} cudaEglResourceLocationFlags;
|
| 104 |
+
|
| 105 |
+
/**
|
| 106 |
+
* CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
|
| 107 |
+
*/
|
| 108 |
+
typedef enum cudaEglColorFormat_enum {
|
| 109 |
+
cudaEglColorFormatYUV420Planar = 0, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 110 |
+
cudaEglColorFormatYUV420SemiPlanar = 1, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
|
| 111 |
+
cudaEglColorFormatYUV422Planar = 2, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 112 |
+
cudaEglColorFormatYUV422SemiPlanar = 3, /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
|
| 113 |
+
cudaEglColorFormatARGB = 6, /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
|
| 114 |
+
cudaEglColorFormatRGBA = 7, /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
|
| 115 |
+
cudaEglColorFormatL = 8, /**< single luminance channel in one surface. */
|
| 116 |
+
cudaEglColorFormatR = 9, /**< single color channel in one surface. */
|
| 117 |
+
cudaEglColorFormatYUV444Planar = 10, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
|
| 118 |
+
cudaEglColorFormatYUV444SemiPlanar = 11, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
|
| 119 |
+
cudaEglColorFormatYUYV422 = 12, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
|
| 120 |
+
cudaEglColorFormatUYVY422 = 13, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
|
| 121 |
+
cudaEglColorFormatABGR = 14, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
|
| 122 |
+
cudaEglColorFormatBGRA = 15, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
|
| 123 |
+
cudaEglColorFormatA = 16, /**< Alpha color format - one channel in one surface. */
|
| 124 |
+
cudaEglColorFormatRG = 17, /**< R/G color format - two channels in one surface with GR byte ordering */
|
| 125 |
+
cudaEglColorFormatAYUV = 18, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
|
| 126 |
+
cudaEglColorFormatYVU444SemiPlanar = 19, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 127 |
+
cudaEglColorFormatYVU422SemiPlanar = 20, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 128 |
+
cudaEglColorFormatYVU420SemiPlanar = 21, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 129 |
+
cudaEglColorFormatY10V10U10_444SemiPlanar = 22, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 130 |
+
cudaEglColorFormatY10V10U10_420SemiPlanar = 23, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 131 |
+
cudaEglColorFormatY12V12U12_444SemiPlanar = 24, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 132 |
+
cudaEglColorFormatY12V12U12_420SemiPlanar = 25, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 133 |
+
cudaEglColorFormatVYUY_ER = 26, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
|
| 134 |
+
cudaEglColorFormatUYVY_ER = 27, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
|
| 135 |
+
cudaEglColorFormatYUYV_ER = 28, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
|
| 136 |
+
cudaEglColorFormatYVYU_ER = 29, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
|
| 137 |
+
cudaEglColorFormatYUVA_ER = 31, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
|
| 138 |
+
cudaEglColorFormatAYUV_ER = 32, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
|
| 139 |
+
cudaEglColorFormatYUV444Planar_ER = 33, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
|
| 140 |
+
cudaEglColorFormatYUV422Planar_ER = 34, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 141 |
+
cudaEglColorFormatYUV420Planar_ER = 35, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 142 |
+
cudaEglColorFormatYUV444SemiPlanar_ER = 36, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 143 |
+
cudaEglColorFormatYUV422SemiPlanar_ER = 37, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 144 |
+
cudaEglColorFormatYUV420SemiPlanar_ER = 38, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 145 |
+
cudaEglColorFormatYVU444Planar_ER = 39, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
|
| 146 |
+
cudaEglColorFormatYVU422Planar_ER = 40, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 147 |
+
cudaEglColorFormatYVU420Planar_ER = 41, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 148 |
+
cudaEglColorFormatYVU444SemiPlanar_ER = 42, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 149 |
+
cudaEglColorFormatYVU422SemiPlanar_ER = 43, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 150 |
+
cudaEglColorFormatYVU420SemiPlanar_ER = 44, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 151 |
+
cudaEglColorFormatBayerRGGB = 45, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
|
| 152 |
+
cudaEglColorFormatBayerBGGR = 46, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
|
| 153 |
+
cudaEglColorFormatBayerGRBG = 47, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
|
| 154 |
+
cudaEglColorFormatBayerGBRG = 48, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
|
| 155 |
+
cudaEglColorFormatBayer10RGGB = 49, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 156 |
+
cudaEglColorFormatBayer10BGGR = 50, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 157 |
+
cudaEglColorFormatBayer10GRBG = 51, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 158 |
+
cudaEglColorFormatBayer10GBRG = 52, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 159 |
+
cudaEglColorFormatBayer12RGGB = 53, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 160 |
+
cudaEglColorFormatBayer12BGGR = 54, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 161 |
+
cudaEglColorFormatBayer12GRBG = 55, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 162 |
+
cudaEglColorFormatBayer12GBRG = 56, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 163 |
+
cudaEglColorFormatBayer14RGGB = 57, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 164 |
+
cudaEglColorFormatBayer14BGGR = 58, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 165 |
+
cudaEglColorFormatBayer14GRBG = 59, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 166 |
+
cudaEglColorFormatBayer14GBRG = 60, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 167 |
+
cudaEglColorFormatBayer20RGGB = 61, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 168 |
+
cudaEglColorFormatBayer20BGGR = 62, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 169 |
+
cudaEglColorFormatBayer20GRBG = 63, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 170 |
+
cudaEglColorFormatBayer20GBRG = 64, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 171 |
+
cudaEglColorFormatYVU444Planar = 65, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
|
| 172 |
+
cudaEglColorFormatYVU422Planar = 66, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 173 |
+
cudaEglColorFormatYVU420Planar = 67, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 174 |
+
cudaEglColorFormatBayerIspRGGB = 68, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
|
| 175 |
+
cudaEglColorFormatBayerIspBGGR = 69, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
|
| 176 |
+
cudaEglColorFormatBayerIspGRBG = 70, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
|
| 177 |
+
cudaEglColorFormatBayerIspGBRG = 71, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
|
| 178 |
+
cudaEglColorFormatBayerBCCR = 72, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
|
| 179 |
+
cudaEglColorFormatBayerRCCB = 73, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
|
| 180 |
+
cudaEglColorFormatBayerCRBC = 74, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
|
| 181 |
+
cudaEglColorFormatBayerCBRC = 75, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
|
| 182 |
+
cudaEglColorFormatBayer10CCCC = 76, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 183 |
+
cudaEglColorFormatBayer12BCCR = 77, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 184 |
+
cudaEglColorFormatBayer12RCCB = 78, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 185 |
+
cudaEglColorFormatBayer12CRBC = 79, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 186 |
+
cudaEglColorFormatBayer12CBRC = 80, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 187 |
+
cudaEglColorFormatBayer12CCCC = 81, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 188 |
+
cudaEglColorFormatY = 82, /**< Color format for single Y plane. */
|
| 189 |
+
cudaEglColorFormatYUV420SemiPlanar_2020 = 83, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 190 |
+
cudaEglColorFormatYVU420SemiPlanar_2020 = 84, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 191 |
+
cudaEglColorFormatYUV420Planar_2020 = 85, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 192 |
+
cudaEglColorFormatYVU420Planar_2020 = 86, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 193 |
+
cudaEglColorFormatYUV420SemiPlanar_709 = 87, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 194 |
+
cudaEglColorFormatYVU420SemiPlanar_709 = 88, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 195 |
+
cudaEglColorFormatYUV420Planar_709 = 89, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 196 |
+
cudaEglColorFormatYVU420Planar_709 = 90, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 197 |
+
cudaEglColorFormatY10V10U10_420SemiPlanar_709 = 91, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 198 |
+
cudaEglColorFormatY10V10U10_420SemiPlanar_2020 = 92, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 199 |
+
cudaEglColorFormatY10V10U10_422SemiPlanar_2020 = 93, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
|
| 200 |
+
cudaEglColorFormatY10V10U10_422SemiPlanar = 94, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
|
| 201 |
+
cudaEglColorFormatY10V10U10_422SemiPlanar_709 = 95, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
|
| 202 |
+
cudaEglColorFormatY_ER = 96, /**< Extended Range Color format for single Y plane. */
|
| 203 |
+
cudaEglColorFormatY_709_ER = 97, /**< Extended Range Color format for single Y plane. */
|
| 204 |
+
cudaEglColorFormatY10_ER = 98, /**< Extended Range Color format for single Y10 plane. */
|
| 205 |
+
cudaEglColorFormatY10_709_ER = 99, /**< Extended Range Color format for single Y10 plane. */
|
| 206 |
+
cudaEglColorFormatY12_ER = 100, /**< Extended Range Color format for single Y12 plane. */
|
| 207 |
+
cudaEglColorFormatY12_709_ER = 101, /**< Extended Range Color format for single Y12 plane. */
|
| 208 |
+
cudaEglColorFormatYUVA = 102, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
|
| 209 |
+
cudaEglColorFormatYVYU = 104, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
|
| 210 |
+
cudaEglColorFormatVYUY = 105, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
|
| 211 |
+
cudaEglColorFormatY10V10U10_420SemiPlanar_ER = 106, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 212 |
+
cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER = 107, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 213 |
+
cudaEglColorFormatY10V10U10_444SemiPlanar_ER = 108, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 214 |
+
cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER = 109, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 215 |
+
cudaEglColorFormatY12V12U12_420SemiPlanar_ER = 110, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 216 |
+
cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER = 111, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 217 |
+
cudaEglColorFormatY12V12U12_444SemiPlanar_ER = 112, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 218 |
+
cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER = 113, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 219 |
+
} cudaEglColorFormat;
|
| 220 |
+
|
| 221 |
+
/**
|
| 222 |
+
* CUDA EGL Plane Descriptor - structure defining each plane of a CUDA EGLFrame
|
| 223 |
+
*/
|
| 224 |
+
typedef struct cudaEglPlaneDesc_st {
|
| 225 |
+
unsigned int width; /**< Width of plane */
|
| 226 |
+
unsigned int height; /**< Height of plane */
|
| 227 |
+
unsigned int depth; /**< Depth of plane */
|
| 228 |
+
unsigned int pitch; /**< Pitch of plane */
|
| 229 |
+
unsigned int numChannels; /**< Number of channels for the plane */
|
| 230 |
+
struct cudaChannelFormatDesc channelDesc; /**< Channel Format Descriptor */
|
| 231 |
+
unsigned int reserved[4]; /**< Reserved for future use */
|
| 232 |
+
} cudaEglPlaneDesc;
|
| 233 |
+
|
| 234 |
+
/**
|
| 235 |
+
* CUDA EGLFrame Descriptor - structure defining one frame of EGL.
|
| 236 |
+
*
|
| 237 |
+
* Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
|
| 238 |
+
* Each plane of EGLFrame is represented by ::cudaEglPlaneDesc which is defined as:
|
| 239 |
+
* \code
|
| 240 |
+
* typedef struct cudaEglPlaneDesc_st {
|
| 241 |
+
* unsigned int width;
|
| 242 |
+
* unsigned int height;
|
| 243 |
+
* unsigned int depth;
|
| 244 |
+
* unsigned int pitch;
|
| 245 |
+
* unsigned int numChannels;
|
| 246 |
+
* struct cudaChannelFormatDesc channelDesc;
|
| 247 |
+
* unsigned int reserved[4];
|
| 248 |
+
* } cudaEglPlaneDesc;
|
| 249 |
+
* \endcode
|
| 250 |
+
|
| 251 |
+
*/
|
| 252 |
+
typedef struct cudaEglFrame_st {
|
| 253 |
+
union {
|
| 254 |
+
cudaArray_t pArray[CUDA_EGL_MAX_PLANES]; /**< Array of CUDA arrays corresponding to each plane*/
|
| 255 |
+
struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES]; /**< Array of Pointers corresponding to each plane*/
|
| 256 |
+
} frame;
|
| 257 |
+
cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES]; /**< CUDA EGL Plane Descriptor ::cudaEglPlaneDesc*/
|
| 258 |
+
unsigned int planeCount; /**< Number of planes */
|
| 259 |
+
cudaEglFrameType frameType; /**< Array or Pitch */
|
| 260 |
+
cudaEglColorFormat eglColorFormat; /**< CUDA EGL Color Format*/
|
| 261 |
+
} cudaEglFrame;
|
| 262 |
+
|
| 263 |
+
/**
|
| 264 |
+
* CUDA EGLStream Connection
|
| 265 |
+
*/
|
| 266 |
+
typedef struct CUeglStreamConnection_st *cudaEglStreamConnection;
|
| 267 |
+
|
| 268 |
+
/** @} */ /* END CUDART_TYPES */
|
| 269 |
+
|
| 270 |
+
/**
|
| 271 |
+
* \addtogroup CUDART_EGL EGL Interoperability
|
| 272 |
+
* This section describes the EGL interoperability functions of the CUDA
|
| 273 |
+
* runtime application programming interface.
|
| 274 |
+
*
|
| 275 |
+
* @{
|
| 276 |
+
*/
|
| 277 |
+
|
| 278 |
+
/**
|
| 279 |
+
* \brief Registers an EGL image
|
| 280 |
+
*
|
| 281 |
+
* Registers the EGLImageKHR specified by \p image for access by
|
| 282 |
+
* CUDA. A handle to the registered object is returned as \p pCudaResource.
|
| 283 |
+
* Additional Mapping/Unmapping is not required for the registered resource and
|
| 284 |
+
* ::cudaGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
|
| 285 |
+
*
|
| 286 |
+
* The application will be responsible for synchronizing access to shared objects.
|
| 287 |
+
* The application must ensure that any pending operations which access the objects have completed
|
| 288 |
+
* before passing control to CUDA. This may be accomplished by issuing and waiting for
|
| 289 |
+
* glFinish command on all GLcontexts (for OpenGL and likewise for other APIs).
|
| 290 |
+
* The application will also be responsible for ensuring that any pending operation on the
|
| 291 |
+
* registered CUDA resource has completed prior to executing subsequent commands in other APIs
|
| 292 |
+
* accessing the same memory objects.
|
| 293 |
+
* This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
|
| 294 |
+
*
|
| 295 |
+
* The surface's intended usage is specified using \p flags, as follows:
|
| 296 |
+
*
|
| 297 |
+
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
|
| 298 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 299 |
+
* read from and written to by CUDA. This is the default value.
|
| 300 |
+
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
|
| 301 |
+
* will not write to this resource.
|
| 302 |
+
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
|
| 303 |
+
* CUDA will not read from this resource and will write over the
|
| 304 |
+
* entire contents of the resource, so none of the data previously
|
| 305 |
+
* stored in the resource will be preserved.
|
| 306 |
+
*
|
| 307 |
+
* The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
|
| 308 |
+
* typedef void* EGLImageKHR
|
| 309 |
+
*
|
| 310 |
+
* \param pCudaResource - Pointer to the returned object handle
|
| 311 |
+
* \param image - An EGLImageKHR image which can be used to create target resource.
|
| 312 |
+
* \param flags - Map flags
|
| 313 |
+
*
|
| 314 |
+
* \return
|
| 315 |
+
* ::cudaSuccess,
|
| 316 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 317 |
+
* ::cudaErrorInvalidValue,
|
| 318 |
+
* ::cudaErrorUnknown
|
| 319 |
+
*
|
| 320 |
+
* \sa
|
| 321 |
+
* ::cudaGraphicsUnregisterResource,
|
| 322 |
+
* ::cudaGraphicsResourceGetMappedEglFrame,
|
| 323 |
+
* ::cuGraphicsEGLRegisterImage
|
| 324 |
+
*/
|
| 325 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsEGLRegisterImage(struct cudaGraphicsResource **pCudaResource, EGLImageKHR image, unsigned int flags);
|
| 326 |
+
|
| 327 |
+
/**
|
| 328 |
+
* \brief Connect CUDA to EGLStream as a consumer.
|
| 329 |
+
*
|
| 330 |
+
* Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream.
|
| 331 |
+
*
|
| 332 |
+
* The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
|
| 333 |
+
* API to another.
|
| 334 |
+
*
|
| 335 |
+
* \param conn - Pointer to the returned connection handle
|
| 336 |
+
* \param eglStream - EGLStreamKHR handle
|
| 337 |
+
*
|
| 338 |
+
* \return
|
| 339 |
+
* ::cudaSuccess,
|
| 340 |
+
* ::cudaErrorInvalidValue,
|
| 341 |
+
* ::cudaErrorUnknown
|
| 342 |
+
*
|
| 343 |
+
* \sa
|
| 344 |
+
* ::cudaEGLStreamConsumerDisconnect,
|
| 345 |
+
* ::cudaEGLStreamConsumerAcquireFrame,
|
| 346 |
+
* ::cudaEGLStreamConsumerReleaseFrame,
|
| 347 |
+
* ::cuEGLStreamConsumerConnect
|
| 348 |
+
*/
|
| 349 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnect(cudaEglStreamConnection *conn, EGLStreamKHR eglStream);
|
| 350 |
+
|
| 351 |
+
/**
|
| 352 |
+
* \brief Connect CUDA to EGLStream as a consumer with given flags.
|
| 353 |
+
*
|
| 354 |
+
* Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream with specified \p flags defined by
|
| 355 |
+
* ::cudaEglResourceLocationFlags.
|
| 356 |
+
*
|
| 357 |
+
* The flags specify whether the consumer wants to access frames from system memory or video memory.
|
| 358 |
+
* Default is ::cudaEglResourceLocationVidmem.
|
| 359 |
+
*
|
| 360 |
+
* \param conn - Pointer to the returned connection handle
|
| 361 |
+
* \param eglStream - EGLStreamKHR handle
|
| 362 |
+
* \param flags - Flags denote intended location - system or video.
|
| 363 |
+
*
|
| 364 |
+
* \return
|
| 365 |
+
* ::cudaSuccess,
|
| 366 |
+
* ::cudaErrorInvalidValue,
|
| 367 |
+
* ::cudaErrorUnknown
|
| 368 |
+
*
|
| 369 |
+
* \sa
|
| 370 |
+
* ::cudaEGLStreamConsumerDisconnect,
|
| 371 |
+
* ::cudaEGLStreamConsumerAcquireFrame,
|
| 372 |
+
* ::cudaEGLStreamConsumerReleaseFrame,
|
| 373 |
+
* ::cuEGLStreamConsumerConnectWithFlags
|
| 374 |
+
*/
|
| 375 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnectWithFlags(cudaEglStreamConnection *conn, EGLStreamKHR eglStream, unsigned int flags);
|
| 376 |
+
|
| 377 |
+
/**
|
| 378 |
+
* \brief Disconnect CUDA as a consumer to EGLStream.
|
| 379 |
+
*
|
| 380 |
+
* Disconnect CUDA as a consumer to EGLStreamKHR.
|
| 381 |
+
*
|
| 382 |
+
* \param conn - Connection to disconnect.
|
| 383 |
+
*
|
| 384 |
+
* \return
|
| 385 |
+
* ::cudaSuccess,
|
| 386 |
+
* ::cudaErrorInvalidValue,
|
| 387 |
+
* ::cudaErrorUnknown
|
| 388 |
+
*
|
| 389 |
+
* \sa
|
| 390 |
+
* ::cudaEGLStreamConsumerConnect,
|
| 391 |
+
* ::cudaEGLStreamConsumerAcquireFrame,
|
| 392 |
+
* ::cudaEGLStreamConsumerReleaseFrame,
|
| 393 |
+
* ::cuEGLStreamConsumerDisconnect
|
| 394 |
+
*/
|
| 395 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerDisconnect(cudaEglStreamConnection *conn);
|
| 396 |
+
|
| 397 |
+
/**
|
| 398 |
+
* \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
|
| 399 |
+
*
|
| 400 |
+
* Acquire an image frame from EGLStreamKHR.
|
| 401 |
+
* ::cudaGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
|
| 402 |
+
* ::cudaEglFrame.
|
| 403 |
+
*
|
| 404 |
+
* \param conn - Connection on which to acquire
|
| 405 |
+
* \param pCudaResource - CUDA resource on which the EGLStream frame will be mapped for use.
|
| 406 |
+
* \param pStream - CUDA stream for synchronization and any data migrations
|
| 407 |
+
* implied by ::cudaEglResourceLocationFlags.
|
| 408 |
+
* \param timeout - Desired timeout in usec.
|
| 409 |
+
*
|
| 410 |
+
* \return
|
| 411 |
+
* ::cudaSuccess,
|
| 412 |
+
* ::cudaErrorInvalidValue,
|
| 413 |
+
* ::cudaErrorUnknown,
|
| 414 |
+
* ::cudaErrorLaunchTimeout
|
| 415 |
+
*
|
| 416 |
+
* \sa
|
| 417 |
+
* ::cudaEGLStreamConsumerConnect,
|
| 418 |
+
* ::cudaEGLStreamConsumerDisconnect,
|
| 419 |
+
* ::cudaEGLStreamConsumerReleaseFrame,
|
| 420 |
+
* ::cuEGLStreamConsumerAcquireFrame
|
| 421 |
+
*/
|
| 422 |
+
|
| 423 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerAcquireFrame(cudaEglStreamConnection *conn,
|
| 424 |
+
cudaGraphicsResource_t *pCudaResource, cudaStream_t *pStream, unsigned int timeout);
|
| 425 |
+
/**
|
| 426 |
+
* \brief Releases the last frame acquired from the EGLStream.
|
| 427 |
+
*
|
| 428 |
+
* Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
|
| 429 |
+
*
|
| 430 |
+
* \param conn - Connection on which to release
|
| 431 |
+
* \param pCudaResource - CUDA resource whose corresponding frame is to be released
|
| 432 |
+
* \param pStream - CUDA stream on which release will be done.
|
| 433 |
+
*
|
| 434 |
+
* \return
|
| 435 |
+
* ::cudaSuccess,
|
| 436 |
+
* ::cudaErrorInvalidValue,
|
| 437 |
+
* ::cudaErrorUnknown
|
| 438 |
+
*
|
| 439 |
+
* \sa
|
| 440 |
+
* ::cudaEGLStreamConsumerConnect,
|
| 441 |
+
* ::cudaEGLStreamConsumerDisconnect,
|
| 442 |
+
* ::cudaEGLStreamConsumerAcquireFrame,
|
| 443 |
+
* ::cuEGLStreamConsumerReleaseFrame
|
| 444 |
+
*/
|
| 445 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerReleaseFrame(cudaEglStreamConnection *conn,
|
| 446 |
+
cudaGraphicsResource_t pCudaResource, cudaStream_t *pStream);
|
| 447 |
+
|
| 448 |
+
/**
|
| 449 |
+
* \brief Connect CUDA to EGLStream as a producer.
|
| 450 |
+
*
|
| 451 |
+
* Connect CUDA as a producer to EGLStreamKHR specified by \p eglStream.
|
| 452 |
+
*
|
| 453 |
+
* The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
|
| 454 |
+
* API to another.
|
| 455 |
+
*
|
| 456 |
+
* \param conn - Pointer to the returned connection handle
|
| 457 |
+
* \param eglStream - EGLStreamKHR handle
|
| 458 |
+
* \param width - width of the image to be submitted to the stream
|
| 459 |
+
* \param height - height of the image to be submitted to the stream
|
| 460 |
+
*
|
| 461 |
+
* \return
|
| 462 |
+
* ::cudaSuccess,
|
| 463 |
+
* ::cudaErrorInvalidValue,
|
| 464 |
+
* ::cudaErrorUnknown
|
| 465 |
+
*
|
| 466 |
+
* \sa
|
| 467 |
+
* ::cudaEGLStreamProducerDisconnect,
|
| 468 |
+
* ::cudaEGLStreamProducerPresentFrame,
|
| 469 |
+
* ::cudaEGLStreamProducerReturnFrame,
|
| 470 |
+
* ::cuEGLStreamProducerConnect
|
| 471 |
+
*/
|
| 472 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerConnect(cudaEglStreamConnection *conn,
|
| 473 |
+
EGLStreamKHR eglStream, EGLint width, EGLint height);
|
| 474 |
+
|
| 475 |
+
/**
|
| 476 |
+
* \brief Disconnect CUDA as a producer to EGLStream.
|
| 477 |
+
*
|
| 478 |
+
* Disconnect CUDA as a producer to EGLStreamKHR.
|
| 479 |
+
*
|
| 480 |
+
* \param conn - Connection to disconnect.
|
| 481 |
+
*
|
| 482 |
+
* \return
|
| 483 |
+
* ::cudaSuccess,
|
| 484 |
+
* ::cudaErrorInvalidValue,
|
| 485 |
+
* ::cudaErrorUnknown
|
| 486 |
+
*
|
| 487 |
+
* \sa
|
| 488 |
+
* ::cudaEGLStreamProducerConnect,
|
| 489 |
+
* ::cudaEGLStreamProducerPresentFrame,
|
| 490 |
+
* ::cudaEGLStreamProducerReturnFrame,
|
| 491 |
+
* ::cuEGLStreamProducerDisconnect
|
| 492 |
+
*/
|
| 493 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerDisconnect(cudaEglStreamConnection *conn);
|
| 494 |
+
|
| 495 |
+
/**
|
| 496 |
+
* \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
|
| 497 |
+
*
|
| 498 |
+
* The ::cudaEglFrame is defined as:
|
| 499 |
+
* \code
|
| 500 |
+
* typedef struct cudaEglFrame_st {
|
| 501 |
+
* union {
|
| 502 |
+
* cudaArray_t pArray[CUDA_EGL_MAX_PLANES];
|
| 503 |
+
* struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES];
|
| 504 |
+
* } frame;
|
| 505 |
+
* cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
|
| 506 |
+
* unsigned int planeCount;
|
| 507 |
+
* cudaEglFrameType frameType;
|
| 508 |
+
* cudaEglColorFormat eglColorFormat;
|
| 509 |
+
* } cudaEglFrame;
|
| 510 |
+
* \endcode
|
| 511 |
+
*
|
| 512 |
+
* For ::cudaEglFrame of type ::cudaEglFrameTypePitch, the application may present a sub-region of a memory
|
| 513 |
+
* allocation. In that case, ::cudaPitchedPtr::ptr will specify the start address of the sub-region in
|
| 514 |
+
* the allocation and ::cudaEglPlaneDesc will specify the dimensions of the sub-region.
|
| 515 |
+
*
|
| 516 |
+
* \param conn - Connection on which to present the CUDA array
|
| 517 |
+
* \param eglframe - CUDA Eglstream Producer Frame handle to be sent to the consumer over EglStream.
|
| 518 |
+
* \param pStream - CUDA stream on which to present the frame.
|
| 519 |
+
*
|
| 520 |
+
* \return
|
| 521 |
+
* ::cudaSuccess,
|
| 522 |
+
* ::cudaErrorInvalidValue,
|
| 523 |
+
* ::cudaErrorUnknown
|
| 524 |
+
*
|
| 525 |
+
* \sa
|
| 526 |
+
* ::cudaEGLStreamProducerConnect,
|
| 527 |
+
* ::cudaEGLStreamProducerDisconnect,
|
| 528 |
+
* ::cudaEGLStreamProducerReturnFrame,
|
| 529 |
+
* ::cuEGLStreamProducerPresentFrame
|
| 530 |
+
*/
|
| 531 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection *conn,
|
| 532 |
+
cudaEglFrame eglframe, cudaStream_t *pStream);
|
| 533 |
+
|
| 534 |
+
/**
|
| 535 |
+
* \brief Return the CUDA eglFrame to the EGLStream last released by the consumer.
|
| 536 |
+
*
|
| 537 |
+
* This API can potentially return cudaErrorLaunchTimeout if the consumer has not
|
| 538 |
+
* returned a frame to EGL stream. If timeout is returned the application can retry.
|
| 539 |
+
*
|
| 540 |
+
* \param conn - Connection on which to return the frame
|
| 541 |
+
* \param eglframe - CUDA Eglstream Producer Frame handle returned from the consumer over EglStream.
|
| 542 |
+
* \param pStream - CUDA stream on which to return the frame.
|
| 543 |
+
*
|
| 544 |
+
* \return
|
| 545 |
+
* ::cudaSuccess,
|
| 546 |
+
* ::cudaErrorLaunchTimeout,
|
| 547 |
+
* ::cudaErrorInvalidValue,
|
| 548 |
+
* ::cudaErrorUnknown
|
| 549 |
+
*
|
| 550 |
+
* \sa
|
| 551 |
+
* ::cudaEGLStreamProducerConnect,
|
| 552 |
+
* ::cudaEGLStreamProducerDisconnect,
|
| 553 |
+
* ::cudaEGLStreamProducerPresentFrame,
|
| 554 |
+
* ::cuEGLStreamProducerReturnFrame
|
| 555 |
+
*/
|
| 556 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection *conn,
|
| 557 |
+
cudaEglFrame *eglframe, cudaStream_t *pStream);
|
| 558 |
+
|
| 559 |
+
/**
|
| 560 |
+
* \brief Get an eglFrame through which to access a registered EGL graphics resource.
|
| 561 |
+
*
|
| 562 |
+
* Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
|
| 563 |
+
* \p resource may be accessed.
|
| 564 |
+
* This API can only be called for EGL graphics resources.
|
| 565 |
+
*
|
| 566 |
+
* The ::cudaEglFrame is defined as
|
| 567 |
+
* \code
|
| 568 |
+
* typedef struct cudaEglFrame_st {
|
| 569 |
+
* union {
|
| 570 |
+
* cudaArray_t pArray[CUDA_EGL_MAX_PLANES];
|
| 571 |
+
* struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES];
|
| 572 |
+
* } frame;
|
| 573 |
+
* cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
|
| 574 |
+
* unsigned int planeCount;
|
| 575 |
+
* cudaEglFrameType frameType;
|
| 576 |
+
* cudaEglColorFormat eglColorFormat;
|
| 577 |
+
* } cudaEglFrame;
|
| 578 |
+
* \endcode
|
| 579 |
+
*
|
| 580 |
+
*
|
| 581 |
+
* \param eglFrame - Returned eglFrame.
|
| 582 |
+
* \param resource - Registered resource to access.
|
| 583 |
+
* \param index - Index for cubemap surfaces.
|
| 584 |
+
* \param mipLevel - Mipmap level for the subresource to access.
|
| 585 |
+
*
|
| 586 |
+
* \return
|
| 587 |
+
* ::cudaSuccess,
|
| 588 |
+
* ::cudaErrorInvalidValue,
|
| 589 |
+
* ::cudaErrorUnknown
|
| 590 |
+
*
|
| 591 |
+
* \note Note that in case of multiplanar \p *eglFrame, pitch of only first plane (unsigned int cudaEglPlaneDesc::pitch) is to be considered by the application.
|
| 592 |
+
*
|
| 593 |
+
* \sa
|
| 594 |
+
* ::cudaGraphicsSubResourceGetMappedArray,
|
| 595 |
+
* ::cudaGraphicsResourceGetMappedPointer,
|
| 596 |
+
* ::cuGraphicsResourceGetMappedEglFrame
|
| 597 |
+
*/
|
| 598 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame,
|
| 599 |
+
cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel);
|
| 600 |
+
|
| 601 |
+
/**
|
| 602 |
+
* \brief Creates an event from EGLSync object
|
| 603 |
+
*
|
| 604 |
+
* Creates an event *phEvent from an EGLSyncKHR eglSync with the flags specified
|
| 605 |
+
* via \p flags. Valid flags include:
|
| 606 |
+
* - ::cudaEventDefault: Default event creation flag.
|
| 607 |
+
* - ::cudaEventBlockingSync: Specifies that the created event should use blocking
|
| 608 |
+
* synchronization. A CPU thread that uses ::cudaEventSynchronize() to wait on
|
| 609 |
+
* an event created with this flag will block until the event has actually
|
| 610 |
+
* been completed.
|
| 611 |
+
*
|
| 612 |
+
* ::cudaEventRecord and TimingData are not supported for events created from EGLSync.
|
| 613 |
+
*
|
| 614 |
+
* The EGLSyncKHR is an opaque handle to an EGL sync object.
|
| 615 |
+
* typedef void* EGLSyncKHR
|
| 616 |
+
*
|
| 617 |
+
* \param phEvent - Returns newly created event
|
| 618 |
+
* \param eglSync - Opaque handle to EGLSync object
|
| 619 |
+
* \param flags - Event creation flags
|
| 620 |
+
*
|
| 621 |
+
* \return
|
| 622 |
+
* ::cudaSuccess,
|
| 623 |
+
* ::cudaErrorInitializationError,
|
| 624 |
+
* ::cudaErrorInvalidValue,
|
| 625 |
+
* ::cudaErrorLaunchFailure,
|
| 626 |
+
* ::cudaErrorMemoryAllocation
|
| 627 |
+
*
|
| 628 |
+
* \sa
|
| 629 |
+
* ::cudaEventQuery,
|
| 630 |
+
* ::cudaEventSynchronize,
|
| 631 |
+
* ::cudaEventDestroy
|
| 632 |
+
*/
|
| 633 |
+
extern __host__ cudaError_t CUDARTAPI cudaEventCreateFromEGLSync(cudaEvent_t *phEvent, EGLSyncKHR eglSync, unsigned int flags);
|
| 634 |
+
|
| 635 |
+
/** @} */ /* END CUDART_EGL */
|
| 636 |
+
|
| 637 |
+
#if defined(__cplusplus)
|
| 638 |
+
}
|
| 639 |
+
#endif /* __cplusplus */
|
| 640 |
+
|
| 641 |
+
#endif /* __CUDA_EGL_INTEROP_H__ */
|
| 642 |
+
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.hpp
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.h
ADDED
|
@@ -0,0 +1,367 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef __CUDA_FP8_H__
|
| 51 |
+
#define __CUDA_FP8_H__
|
| 52 |
+
|
| 53 |
+
/* Set up function decorations */
|
| 54 |
+
#if defined(__CUDACC__)
|
| 55 |
+
#define __CUDA_FP8_DECL__ static __device__ __inline__
|
| 56 |
+
#define __CUDA_HOSTDEVICE_FP8__ __host__ __device__
|
| 57 |
+
#define __CUDA_HOSTDEVICE_FP8_DECL__ static __host__ __device__ __inline__
|
| 58 |
+
#else /* !defined(__CUDACC__) */
|
| 59 |
+
#if defined(__GNUC__)
|
| 60 |
+
#define __CUDA_HOSTDEVICE_FP8_DECL__ static __attribute__((unused))
|
| 61 |
+
#else
|
| 62 |
+
#define __CUDA_HOSTDEVICE_FP8_DECL__ static
|
| 63 |
+
#endif /* defined(__GNUC__) */
|
| 64 |
+
#define __CUDA_HOSTDEVICE_FP8__
|
| 65 |
+
#endif /* defined(__CUDACC_) */
|
| 66 |
+
|
| 67 |
+
#if !defined(_MSC_VER) && __cplusplus >= 201103L
|
| 68 |
+
#define __CPP_VERSION_AT_LEAST_11_FP8
|
| 69 |
+
#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
|
| 70 |
+
#define __CPP_VERSION_AT_LEAST_11_FP8
|
| 71 |
+
#endif
|
| 72 |
+
|
| 73 |
+
/* bring in __half_raw data type */
|
| 74 |
+
#include "cuda_fp16.h"
|
| 75 |
+
/* bring in __nv_bfloat16_raw data type */
|
| 76 |
+
#include "cuda_bf16.h"
|
| 77 |
+
/* bring in float2, double4, etc vector types */
|
| 78 |
+
#include "vector_types.h"
|
| 79 |
+
|
| 80 |
+
/**
|
| 81 |
+
* \defgroup CUDA_MATH_INTRINSIC_FP8 FP8 Intrinsics
|
| 82 |
+
* This section describes fp8 intrinsic functions.
|
| 83 |
+
* To use these functions, include the header file \p cuda_fp8.h in your
|
| 84 |
+
* program.
|
| 85 |
+
* The following macros are available to help users selectively enable/disable
|
| 86 |
+
* various definitions present in the header file:
|
| 87 |
+
* - \p __CUDA_NO_FP8_CONVERSIONS__ - If defined, this macro will prevent any
|
| 88 |
+
* use of the C++ type conversions (converting constructors and conversion
|
| 89 |
+
* operators) defined in the header.
|
| 90 |
+
* - \p __CUDA_NO_FP8_CONVERSION_OPERATORS__ - If defined, this macro will
|
| 91 |
+
* prevent any use of the C++ conversion operators from \p fp8 to other types.
|
| 92 |
+
*/
|
| 93 |
+
|
| 94 |
+
/**
|
| 95 |
+
* \defgroup CUDA_MATH_FP8_MISC FP8 Conversion and Data Movement
|
| 96 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 97 |
+
* To use these functions, include the header file \p cuda_fp8.h in your
|
| 98 |
+
* program.
|
| 99 |
+
*/
|
| 100 |
+
|
| 101 |
+
/**
|
| 102 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 103 |
+
* \brief 8-bit \p unsigned \p integer
|
| 104 |
+
* type abstraction used for \p fp8 floating-point
|
| 105 |
+
* numbers storage.
|
| 106 |
+
*/
|
| 107 |
+
typedef unsigned char __nv_fp8_storage_t;
|
| 108 |
+
|
| 109 |
+
/**
|
| 110 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 111 |
+
* \brief 16-bit \p unsigned \p integer
|
| 112 |
+
* type abstraction used for storage of pairs of
|
| 113 |
+
* \p fp8 floating-point numbers.
|
| 114 |
+
*/
|
| 115 |
+
typedef unsigned short int __nv_fp8x2_storage_t;
|
| 116 |
+
|
| 117 |
+
/**
|
| 118 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 119 |
+
* \brief 32-bit \p unsigned \p integer
|
| 120 |
+
* type abstraction used for storage of tetrads of
|
| 121 |
+
* \p fp8 floating-point numbers.
|
| 122 |
+
*/
|
| 123 |
+
typedef unsigned int __nv_fp8x4_storage_t;
|
| 124 |
+
|
| 125 |
+
/**
|
| 126 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 127 |
+
* \brief Enumerates the modes applicable when
|
| 128 |
+
* performing a narrowing conversion to \p fp8 destination types.
|
| 129 |
+
*/
|
| 130 |
+
typedef enum __nv_saturation_t {
|
| 131 |
+
/**
|
| 132 |
+
* Means no saturation to finite is performed when conversion
|
| 133 |
+
* results in rounding values outside the range of destination
|
| 134 |
+
* type.
|
| 135 |
+
* NOTE: for fp8 type of e4m3 kind, the results that are larger
|
| 136 |
+
* than the maximum representable finite number of the target
|
| 137 |
+
* format become NaN.
|
| 138 |
+
*/
|
| 139 |
+
__NV_NOSAT,
|
| 140 |
+
/**
|
| 141 |
+
* Means input larger than the maximum representable
|
| 142 |
+
* finite number MAXNORM of the target format round to the
|
| 143 |
+
* MAXNORM of the same sign as input.
|
| 144 |
+
*/
|
| 145 |
+
__NV_SATFINITE,
|
| 146 |
+
} __nv_saturation_t;
|
| 147 |
+
|
| 148 |
+
/**
|
| 149 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 150 |
+
* \brief Enumerates the possible
|
| 151 |
+
* interpretations of the 8-bit values when referring to them as
|
| 152 |
+
* \p fp8 types.
|
| 153 |
+
*/
|
| 154 |
+
typedef enum __nv_fp8_interpretation_t {
|
| 155 |
+
__NV_E4M3, /**< Stands for \p fp8 numbers of \p e4m3 kind. */
|
| 156 |
+
__NV_E5M2, /**< Stands for \p fp8 numbers of \p e5m2 kind. */
|
| 157 |
+
} __nv_fp8_interpretation_t;
|
| 158 |
+
|
| 159 |
+
/* Forward-declaration of C-style APIs */
|
| 160 |
+
|
| 161 |
+
/**
|
| 162 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 163 |
+
* \brief Converts input \p double precision \p x to \p fp8 type of the
|
| 164 |
+
* requested kind using round-to-nearest-even rounding and requested saturation
|
| 165 |
+
* mode.
|
| 166 |
+
*
|
| 167 |
+
* \details Converts input \p x to \p fp8 type of the kind specified by
|
| 168 |
+
* \p fp8_interpretation parameter,
|
| 169 |
+
* using round-to-nearest-even rounding and
|
| 170 |
+
* saturation mode specified by \p saturate parameter.
|
| 171 |
+
*
|
| 172 |
+
* \returns
|
| 173 |
+
* - The \p __nv_fp8_storage_t value holds the result of conversion.
|
| 174 |
+
*/
|
| 175 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
|
| 176 |
+
__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
|
| 177 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 178 |
+
|
| 179 |
+
/**
|
| 180 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 181 |
+
* \brief Converts input vector of two \p double precision numbers packed
|
| 182 |
+
* in \p double2 \p x into a vector of two values of \p fp8 type of
|
| 183 |
+
* the requested kind using round-to-nearest-even rounding and requested
|
| 184 |
+
* saturation mode.
|
| 185 |
+
*
|
| 186 |
+
* \details Converts input vector \p x to a vector of two \p fp8 values of the
|
| 187 |
+
* kind specified by \p fp8_interpretation parameter, using
|
| 188 |
+
* round-to-nearest-even rounding and saturation mode specified by \p saturate
|
| 189 |
+
* parameter.
|
| 190 |
+
*
|
| 191 |
+
* \returns
|
| 192 |
+
* - The \p __nv_fp8x2_storage_t value holds the result of conversion.
|
| 193 |
+
*/
|
| 194 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
|
| 195 |
+
__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
|
| 196 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 197 |
+
|
| 198 |
+
/**
|
| 199 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 200 |
+
* \brief Converts input \p single precision \p x to \p fp8 type of the
|
| 201 |
+
* requested kind using round-to-nearest-even rounding and requested saturation
|
| 202 |
+
* mode.
|
| 203 |
+
*
|
| 204 |
+
* \details Converts input \p x to \p fp8 type of the kind specified by
|
| 205 |
+
* \p fp8_interpretation parameter,
|
| 206 |
+
* using round-to-nearest-even rounding and
|
| 207 |
+
* saturation mode specified by \p saturate parameter.
|
| 208 |
+
*
|
| 209 |
+
* \returns
|
| 210 |
+
* - The \p __nv_fp8_storage_t value holds the result of conversion.
|
| 211 |
+
*/
|
| 212 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
|
| 213 |
+
__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
|
| 214 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 215 |
+
|
| 216 |
+
/**
|
| 217 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 218 |
+
* \brief Converts input vector of two \p single precision numbers packed
|
| 219 |
+
* in \p float2 \p x into a vector of two values of \p fp8 type of
|
| 220 |
+
* the requested kind using round-to-nearest-even rounding and requested
|
| 221 |
+
* saturation mode.
|
| 222 |
+
*
|
| 223 |
+
* \details Converts input vector \p x to a vector of two \p fp8 values of the
|
| 224 |
+
* kind specified by \p fp8_interpretation parameter, using
|
| 225 |
+
* round-to-nearest-even rounding and saturation mode specified by \p saturate
|
| 226 |
+
* parameter.
|
| 227 |
+
*
|
| 228 |
+
* \returns
|
| 229 |
+
* - The \p __nv_fp8x2_storage_t value holds the result of conversion.
|
| 230 |
+
*/
|
| 231 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
|
| 232 |
+
__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
|
| 233 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 234 |
+
|
| 235 |
+
/**
|
| 236 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 237 |
+
* \brief Converts input \p half precision \p x to \p fp8 type of the requested
|
| 238 |
+
* kind using round-to-nearest-even rounding and requested saturation mode.
|
| 239 |
+
*
|
| 240 |
+
* \details Converts input \p x to \p fp8 type of the kind specified by
|
| 241 |
+
* \p fp8_interpretation parameter,
|
| 242 |
+
* using round-to-nearest-even rounding and
|
| 243 |
+
* saturation mode specified by \p saturate parameter.
|
| 244 |
+
*
|
| 245 |
+
* \returns
|
| 246 |
+
* - The \p __nv_fp8_storage_t value holds the result of conversion.
|
| 247 |
+
*/
|
| 248 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
|
| 249 |
+
__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
|
| 250 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 251 |
+
|
| 252 |
+
/**
|
| 253 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 254 |
+
* \brief Converts input vector of two \p half precision numbers packed
|
| 255 |
+
* in \p __half2_raw \p x into a vector of two values of \p fp8 type of
|
| 256 |
+
* the requested kind using round-to-nearest-even rounding and requested
|
| 257 |
+
* saturation mode.
|
| 258 |
+
*
|
| 259 |
+
* \details Converts input vector \p x to a vector of two \p fp8 values of the
|
| 260 |
+
* kind specified by \p fp8_interpretation parameter, using
|
| 261 |
+
* round-to-nearest-even rounding and saturation mode specified by \p saturate
|
| 262 |
+
* parameter.
|
| 263 |
+
*
|
| 264 |
+
* \returns
|
| 265 |
+
* - The \p __nv_fp8x2_storage_t value holds the result of conversion.
|
| 266 |
+
*/
|
| 267 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
|
| 268 |
+
const __half2_raw x, const __nv_saturation_t saturate,
|
| 269 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 270 |
+
|
| 271 |
+
/**
|
| 272 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 273 |
+
* \brief Converts input \p nv_bfloat16 precision \p x to \p fp8 type of the
|
| 274 |
+
* requested kind using round-to-nearest-even rounding and requested saturation
|
| 275 |
+
* mode.
|
| 276 |
+
*
|
| 277 |
+
* \details Converts input \p x to \p fp8 type of the kind specified by
|
| 278 |
+
* \p fp8_interpretation parameter,
|
| 279 |
+
* using round-to-nearest-even rounding and
|
| 280 |
+
* saturation mode specified by \p saturate parameter.
|
| 281 |
+
*
|
| 282 |
+
* \returns
|
| 283 |
+
* - The \p __nv_fp8_storage_t value holds the result of conversion.
|
| 284 |
+
*/
|
| 285 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
|
| 286 |
+
const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
|
| 287 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 288 |
+
|
| 289 |
+
/**
|
| 290 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 291 |
+
* \brief Converts input vector of two \p nv_bfloat16 precision numbers packed
|
| 292 |
+
* in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp8 type of
|
| 293 |
+
* the requested kind using round-to-nearest-even rounding and requested
|
| 294 |
+
* saturation mode.
|
| 295 |
+
*
|
| 296 |
+
* \details Converts input vector \p x to a vector of two \p fp8 values of the
|
| 297 |
+
* kind specified by \p fp8_interpretation parameter, using
|
| 298 |
+
* round-to-nearest-even rounding and saturation mode specified by \p saturate
|
| 299 |
+
* parameter.
|
| 300 |
+
*
|
| 301 |
+
* \returns
|
| 302 |
+
* - The \p __nv_fp8x2_storage_t value holds the result of conversion.
|
| 303 |
+
*/
|
| 304 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
|
| 305 |
+
__nv_cvt_bfloat16raw2_to_fp8x2(
|
| 306 |
+
const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
|
| 307 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 308 |
+
|
| 309 |
+
/**
|
| 310 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 311 |
+
* \brief Converts input \p fp8 \p x of the specified kind
|
| 312 |
+
* to \p half precision.
|
| 313 |
+
*
|
| 314 |
+
* \details Converts input \p x of \p fp8 type of the kind specified by
|
| 315 |
+
* \p fp8_interpretation parameter
|
| 316 |
+
* to \p half precision.
|
| 317 |
+
*
|
| 318 |
+
* \returns
|
| 319 |
+
* - The \p __half_raw value holds the result of conversion.
|
| 320 |
+
*/
|
| 321 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
|
| 322 |
+
__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
|
| 323 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 324 |
+
/**
|
| 325 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 326 |
+
* \brief Converts input vector of two \p fp8 values of the specified kind
|
| 327 |
+
* to a vector of two \p half precision values packed in \p __half2_raw
|
| 328 |
+
* structure.
|
| 329 |
+
*
|
| 330 |
+
* \details Converts input vector \p x of \p fp8 type of the kind specified by
|
| 331 |
+
* \p fp8_interpretation parameter
|
| 332 |
+
* to a vector of two \p half precision values and returns as \p __half2_raw
|
| 333 |
+
* structure.
|
| 334 |
+
*
|
| 335 |
+
* \returns
|
| 336 |
+
* - The \p __half2_raw value holds the result of conversion.
|
| 337 |
+
*/
|
| 338 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
|
| 339 |
+
__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
|
| 340 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 341 |
+
|
| 342 |
+
#if defined(__cplusplus)
|
| 343 |
+
|
| 344 |
+
#define __CUDA_FP8_TYPES_EXIST__
|
| 345 |
+
|
| 346 |
+
/* Forward-declaration of structures defined in "cuda_fp8.hpp" */
|
| 347 |
+
struct __nv_fp8_e5m2;
|
| 348 |
+
struct __nv_fp8x2_e5m2;
|
| 349 |
+
struct __nv_fp8x4_e5m2;
|
| 350 |
+
|
| 351 |
+
struct __nv_fp8_e4m3;
|
| 352 |
+
struct __nv_fp8x2_e4m3;
|
| 353 |
+
struct __nv_fp8x4_e4m3;
|
| 354 |
+
|
| 355 |
+
#endif /* defined(__cplusplus) */
|
| 356 |
+
|
| 357 |
+
#include "cuda_fp8.hpp"
|
| 358 |
+
|
| 359 |
+
#undef __CUDA_FP8_DECL__
|
| 360 |
+
#undef __CUDA_HOSTDEVICE_FP8__
|
| 361 |
+
#undef __CUDA_HOSTDEVICE_FP8_DECL__
|
| 362 |
+
|
| 363 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 364 |
+
#undef __CPP_VERSION_AT_LEAST_11_FP8
|
| 365 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 366 |
+
|
| 367 |
+
#endif /* end of include guard: __CUDA_FP8_H__ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.hpp
ADDED
|
@@ -0,0 +1,1750 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2022-2023 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_FP8_HPP__)
|
| 51 |
+
#define __CUDA_FP8_HPP__
|
| 52 |
+
|
| 53 |
+
#if !defined(__CUDA_FP8_H__)
|
| 54 |
+
#error "Do not include this file directly. Instead, include cuda_fp8.h."
|
| 55 |
+
#endif
|
| 56 |
+
|
| 57 |
+
/* C++ header for std::memcpy (used for type punning in host-side
|
| 58 |
+
* implementations). When compiling as a CUDA source file memcpy is provided
|
| 59 |
+
* implicitly. !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
|
| 60 |
+
*/
|
| 61 |
+
#if defined(__cplusplus) && !defined(__CUDACC__)
|
| 62 |
+
#include <cstring>
|
| 63 |
+
#elif !defined(__cplusplus) && !defined(__CUDACC__)
|
| 64 |
+
#include <string.h>
|
| 65 |
+
#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
|
| 66 |
+
|
| 67 |
+
/* Set up structure-alignment attribute */
|
| 68 |
+
#if !(defined __CUDA_ALIGN__)
|
| 69 |
+
#if defined(__CUDACC__)
|
| 70 |
+
#define __CUDA_ALIGN__(align) __align__(align)
|
| 71 |
+
#else
|
| 72 |
+
/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas"
|
| 73 |
+
* is available) */
|
| 74 |
+
#if __cplusplus >= 201103L
|
| 75 |
+
#define __CUDA_ALIGN__(n) \
|
| 76 |
+
alignas(n) /* C++11 kindly gives us a keyword for this */
|
| 77 |
+
#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP8)*/
|
| 78 |
+
#if defined(__GNUC__)
|
| 79 |
+
#define __CUDA_ALIGN__(n) __attribute__((aligned(n)))
|
| 80 |
+
#elif defined(_MSC_VER)
|
| 81 |
+
#define __CUDA_ALIGN__(n) __declspec(align(n))
|
| 82 |
+
#else
|
| 83 |
+
#define __CUDA_ALIGN__(n)
|
| 84 |
+
#endif /* defined(__GNUC__) */
|
| 85 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 86 |
+
#endif /* defined(__CUDACC__) */
|
| 87 |
+
#endif /* !(defined __CUDA_ALIGN__) */
|
| 88 |
+
|
| 89 |
+
#if !(defined __CPP_VERSION_AT_LEAST_11_FP8)
|
| 90 |
+
/* need c++11 for explicit operators */
|
| 91 |
+
#define __CUDA_NO_FP8_CONVERSION_OPERATORS__
|
| 92 |
+
#endif
|
| 93 |
+
|
| 94 |
+
/*
 * Converts a double precision value to an fp8 encoding of the requested kind
 * (E4M3 when fp8_interpretation == __NV_E4M3, E5M2 otherwise) using
 * round-to-nearest-even rounding.
 *
 *  - Magnitudes not exceeding half of the smallest fp8 denormal become zero.
 *  - NaN inputs become 0x7F for E4M3, or 0x7E combined with the truncated
 *    mantissa bits for E5M2.
 *  - Magnitudes above the overflow threshold (infinity included) become the
 *    largest finite value when saturate == __NV_SATFINITE; otherwise they
 *    become NaN (0x7F) for E4M3 or infinity (0x7C) for E5M2.
 *
 * In every case the sign bit of x is copied into bit 7 of the result.
 */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
                       const __nv_fp8_interpretation_t fp8_interpretation) {
    const int is_e4m3 = (fp8_interpretation == __NV_E4M3) ? 1 : 0;

    /* Parameters of the target format. */
    const unsigned short int exp_bias =
        (unsigned short int)(is_e4m3 ? 7U : 15U);
    const unsigned long long int significand_bits = is_e4m3 ? 4ULL : 3ULL;
    const unsigned char mantissa_mask =
        (unsigned char)(is_e4m3 ? 0x7U : 0x3U);
    const unsigned char max_normal = (unsigned char)(is_e4m3 ? 0x7EU : 0x7BU);

    /* Thresholds, expressed as double precision bit patterns. */
    const unsigned long long int dp_infinity = 0x7FF0000000000000ULL;
    /* mindenorm/2: 2^-10 for E4M3, 2^-17 for E5M2 */
    const unsigned long long int half_min_denormal =
        is_e4m3 ? 0x3F50000000000000ULL : 0x3EE0000000000000ULL;
    /* maxnorm + 1/2ulp: 0x1.Cp+8 + 0x1p+4 for E4M3; 0x1.Ep+15 for E5M2,
     * lowered by one so that the same strict comparison serves both kinds */
    const unsigned long long int overflow_threshold =
        is_e4m3 ? 0x407D000000000000ULL : (0x40EE000000000000ULL - 1ULL);
    /* minnorm: 2^-6 for E4M3, 2^-14 for E5M2 */
    const unsigned long long int min_normal =
        is_e4m3 ? 0x3F90000000000000ULL : 0x3F10000000000000ULL;

    /* Reinterpret the input as its raw 64-bit pattern. */
    unsigned long long int bits;
#if defined(__CUDACC__) || (!defined __cplusplus)
    (void)memcpy(&bits, &x, sizeof(x));
#else
    (void)std::memcpy(&bits, &x, sizeof(x));
#endif

    /* 1/2 LSB of the target format, positioned in the double precision
     * mantissa; used to detect midpoints while rounding */
    const unsigned long long int half_ulp =
        (unsigned long long int)1ULL << (53ULL - significand_bits - 1ULL);
    const unsigned long long int magnitude = bits & 0x7FFFFFFFFFFFFFFFULL;
    /* sign bit, already placed where the target format expects it */
    const unsigned char sign_bit = (unsigned char)((bits >> 63ULL) << 7U);
    /* exponent re-biased for the target format */
    const unsigned char biased_exp =
        (unsigned char)((((unsigned short int)(bits >> 52ULL)) & 0x7FFU) -
                        1023U + exp_bias);
    /* mantissa truncated (rounded towards zero) to the target width */
    const unsigned char fraction =
        (unsigned char)((unsigned char)(bits >> (53ULL - significand_bits)) &
                        mantissa_mask);

    unsigned char encoded;

    if (magnitude <= half_min_denormal) {
        /* zero or underflow */
        encoded = 0U;
    } else if (magnitude > dp_infinity) {
        /* NaN; E5M2 yields a quiet NaN that keeps the truncated mantissa */
        encoded = (unsigned char)(is_e4m3 ? 0x7FU : (0x7EU | fraction));
    } else if (magnitude > overflow_threshold) {
        if (saturate == __NV_SATFINITE) {
            encoded = max_normal;
        } else {
            /* __NV_NOSAT: E4M3 has no Inf, so it reports NaN instead */
            encoded = (unsigned char)(is_e4m3 ? 0x7FU : 0x7CU);
        }
    } else {
        /* Bits discarded by the conversion, and the value those bits take
         * when the input sits exactly between two representable results. */
        unsigned long long int dropped;
        unsigned long long int tie;

        if (magnitude >= min_normal) {
            encoded = (unsigned char)((biased_exp << (significand_bits - 1U)) |
                                      fraction);
            tie = half_ulp;
            dropped = bits & ((half_ulp << 1ULL) - 1ULL);
        } else {
            /* Denormal range: shift the significand right, which discards
             * additional low-order bits. */
            const unsigned char shift = (unsigned char)(1U - biased_exp);
            /* mantissa with the implicit leading bit made explicit */
            const unsigned char significand =
                (unsigned char)(fraction |
                                (unsigned char)(1U
                                                << (significand_bits - 1U)));
            encoded = (unsigned char)(significand >> shift);
            tie = half_ulp << shift;
            /* discarded bits, including the implicit leading bit */
            dropped =
                (bits | ((unsigned long long int)1ULL << (53ULL - 1ULL))) &
                ((half_ulp << (shift + 1ULL)) - 1ULL);
        }

        /* Round-to-nearest-even adjustment. In the normal range the low bit
         * of the assembled result is the low mantissa bit, so testing the
         * result's parity covers both ranges. */
        if ((dropped > tie) || ((dropped == tie) && ((encoded & 1U) != 0U))) {
            encoded = (unsigned char)(encoded + 1U);
        }
    }

    return (__nv_fp8_storage_t)(encoded | sign_bit);
}
|
| 207 |
+
|
| 208 |
+
/*
 * Converts both components of a double2 to fp8 with __nv_cvt_double_to_fp8
 * and packs the two results: the encoding of x.y is shifted left by 8 bits
 * and the encoding of x.x is OR-ed into the low bits.
 */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
                          const __nv_fp8_interpretation_t fp8_interpretation) {
    const __nv_fp8_storage_t hi =
        __nv_cvt_double_to_fp8(x.y, saturate, fp8_interpretation);
    const __nv_fp8_storage_t lo =
        __nv_cvt_double_to_fp8(x.x, saturate, fp8_interpretation);
    const __nv_fp8x2_storage_t shifted_hi =
        (__nv_fp8x2_storage_t)(((__nv_fp8x2_storage_t)hi) << 8U);
    return (__nv_fp8x2_storage_t)(shifted_hi | lo);
}
|
| 219 |
+
|
| 220 |
+
/*
 * Converts a single precision value to an fp8 encoding of the requested
 * kind.
 *
 * When compiled for __CUDA_ARCH__ >= 890 and saturate == __NV_SATFINITE, the
 * paired cvt.rn.satfinite instruction is used with 0.0f as the second input
 * and the packed result is narrowed to __nv_fp8_storage_t.
 *
 * In every other case a NaN input is first replaced by the canonical pattern
 * 0x7FFFFFFF, the value is widened to double, and the conversion is
 * delegated to __nv_cvt_double_to_fp8.
 */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
                      const __nv_fp8_interpretation_t fp8_interpretation) {
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
    if (saturate == __NV_SATFINITE) {
        __nv_fp8x2_storage_t packed;
        if (fp8_interpretation == __NV_E5M2) {
            asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
                : "=h"(packed)
                : "f"(x), "f"(0.0f));
        } else {
            asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
                : "=h"(packed)
                : "f"(x), "f"(0.0f));
        }
        return (__nv_fp8_storage_t)packed;
    }
#endif
    {
        unsigned int raw;
        float canonical;

#if defined(__CUDACC__) || (!defined __cplusplus)
        (void)memcpy(&raw, &x, sizeof(x));
#else
        (void)std::memcpy(&raw, &x, sizeof(x));
#endif

        /* any NaN is replaced by the canonical NaN pattern */
        if ((raw & 0x7FFFFFFFU) > 0x7F800000U) {
            raw = 0x7FFFFFFFU;
        }

#if defined(__CUDACC__) || (!defined __cplusplus)
        (void)memcpy(&canonical, &raw, sizeof(raw));
#else
        (void)std::memcpy(&canonical, &raw, sizeof(raw));
#endif

        return __nv_cvt_double_to_fp8((double)canonical, saturate,
                                      fp8_interpretation);
    }
}
|
| 265 |
+
|
| 266 |
+
/*
 * Converts both components of a float2 to fp8 and packs the two results.
 *
 * When compiled for __CUDA_ARCH__ >= 890 and saturate == __NV_SATFINITE, a
 * single paired cvt.rn.satfinite instruction produces the packed value.
 * Otherwise each component goes through __nv_cvt_float_to_fp8; the encoding
 * of x.y is shifted left by 8 bits and the encoding of x.x is OR-ed into the
 * low bits.
 */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
                         const __nv_fp8_interpretation_t fp8_interpretation) {
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
    if (saturate == __NV_SATFINITE) {
        __nv_fp8x2_storage_t packed;
        if (fp8_interpretation == __NV_E5M2) {
            asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
                : "=h"(packed)
                : "f"(x.x), "f"(x.y));
        } else {
            asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
                : "=h"(packed)
                : "f"(x.x), "f"(x.y));
        }
        return packed;
    }
#endif
    {
        const __nv_fp8_storage_t hi =
            __nv_cvt_float_to_fp8(x.y, saturate, fp8_interpretation);
        const __nv_fp8_storage_t lo =
            __nv_cvt_float_to_fp8(x.x, saturate, fp8_interpretation);
        const __nv_fp8x2_storage_t shifted_hi =
            (__nv_fp8x2_storage_t)(((__nv_fp8x2_storage_t)hi) << 8U);
        return (__nv_fp8x2_storage_t)(shifted_hi | lo);
    }
}
|
| 293 |
+
|
| 294 |
+
/*
 * Widens the half precision bit pattern stored in x.x to a float.
 *
 * When compiled for __CUDA_ARCH__ >= 530 the cvt.f32.f16 instruction is
 * used. Otherwise the float is assembled from the sign, exponent and
 * mantissa fields by hand: half denormals are normalized, Inf maps to Inf,
 * and any NaN maps to a NaN with the sign cleared and all mantissa bits set.
 */
__CUDA_HOSTDEVICE_FP8_DECL__ float
__internal_halfraw_to_float(const __half_raw x) {
    float result;
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
    asm("{cvt.f32.f16 %0, %1;}\n" : "=f"(result) : "h"(x.x));
#else
    const unsigned int half_bits = (unsigned int)x.x;
    unsigned int sign_bit = (half_bits >> 15U) & 1U;
    unsigned int exp_field = (half_bits >> 10U) & 0x1fU;
    /* 10-bit half mantissa moved to the top of the 23-bit float mantissa */
    unsigned int fraction = (half_bits & 0x3ffU) << 13U;

    if (exp_field == 0x1fU) {
        /* NaN or Inf */
        exp_field = 0xffU;
        if (fraction != 0U) {
            /* NaN: discard the sign and set every mantissa bit */
            sign_bit = 0U;
            fraction = 0x7fffffU;
        }
    } else if (exp_field != 0U) {
        /* normal number: only the exponent bias changes */
        exp_field += 0x70U;
    } else if (fraction != 0U) {
        /* Denormal: shift left until the leading one has moved past the top
         * mantissa bit, lowering the exponent once per shift. */
        exp_field = 0x71U;
        while ((fraction & 0x400000U) == 0U) {
            fraction <<= 1U;
            --exp_field;
        }
        fraction <<= 1U;
        --exp_field;
        fraction &= 0x7fffffU; /* the leading one becomes implicit */
    }
    /* (zero needs no adjustment: exponent and mantissa are already 0) */

    const unsigned int float_bits =
        ((sign_bit << 31U) | (exp_field << 23U) | fraction);
#if defined(__CUDACC__) || (!defined __cplusplus)
    (void)memcpy(&result, &float_bits, sizeof(float_bits));
#else
    (void)std::memcpy(&result, &float_bits, sizeof(float_bits));
#endif
#endif /* (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) */
    return result;
}
|
| 332 |
+
|
| 333 |
+
/*
 * Widens the two half precision bit patterns held in x to a float2 by
 * converting each one with __internal_halfraw_to_float: x.x produces the
 * .x component of the result and x.y produces the .y component.
 */
__CUDA_HOSTDEVICE_FP8_DECL__ float2
__internal_halfraw2_to_float2(const __half2_raw x) {
    __half_raw first;
    __half_raw second;
    float2 widened;

    first.x = x.x;
    second.x = x.y;
    widened.x = __internal_halfraw_to_float(first);
    widened.y = __internal_halfraw_to_float(second);
    return widened;
}
|
| 343 |
+
|
| 344 |
+
/*
 * Converts a half precision bit pattern to an fp8 encoding of the requested
 * kind.
 *
 * When compiled for __CUDA_ARCH__ >= 890 and saturate == __NV_SATFINITE, the
 * 16-bit input is widened to 32 bits, passed to the paired
 * cvt.rn.satfinite f16x2 instruction, and the packed result is narrowed to
 * __nv_fp8_storage_t. Otherwise the value is widened to float and converted
 * with __nv_cvt_float_to_fp8.
 */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
                        const __nv_fp8_interpretation_t fp8_interpretation) {
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
    if (saturate == __NV_SATFINITE) {
        const unsigned int widened_bits = (unsigned int)(x.x);
        __nv_fp8x2_storage_t packed;
        if (fp8_interpretation == __NV_E5M2) {
            asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
                : "=h"(packed)
                : "r"(widened_bits));
        } else {
            asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
                : "=h"(packed)
                : "r"(widened_bits));
        }
        return (__nv_fp8_storage_t)packed;
    }
#endif
    return __nv_cvt_float_to_fp8(__internal_halfraw_to_float(x), saturate,
                                 fp8_interpretation);
}
|
| 370 |
+
|
| 371 |
+
/* Converts a pair of raw half-precision values to packed fp8x2 storage.
 *
 * x                  - two raw half lanes; x.x maps to the low byte of the
 *                      result and x.y to the high byte.
 * saturate           - saturation mode forwarded to the conversion.
 * fp8_interpretation - __NV_E5M2 selects e5m2; anything else is treated as
 *                      e4m3.
 *
 * When compiled for __CUDA_ARCH__ >= 890 and saturate is __NV_SATFINITE, a
 * single packed cvt instruction handles both lanes. Otherwise each lane is
 * converted with __nv_cvt_halfraw_to_fp8 and the bytes are combined. */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
    const __half2_raw x, const __nv_saturation_t saturate,
    const __nv_fp8_interpretation_t fp8_interpretation) {
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
    if (saturate == __NV_SATFINITE) {
        __nv_fp8x2_storage_t packed_out;
        unsigned int packed_in;
        /* Reinterpret the two half lanes as one 32-bit register value. */
        (void)memcpy(&packed_in, &x, sizeof(x));

        if (fp8_interpretation == __NV_E5M2) {
            asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
                : "=h"(packed_out)
                : "r"(packed_in));
        } else {
            asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
                : "=h"(packed_out)
                : "r"(packed_in));
        }
        return packed_out;
    }
#endif
    /* Generic path: convert the low lane, then the high lane. */
    __half_raw lane;
    lane.x = x.x;
    const __nv_fp8_storage_t low_byte =
        __nv_cvt_halfraw_to_fp8(lane, saturate, fp8_interpretation);
    lane.x = x.y;
    const __nv_fp8_storage_t high_byte =
        __nv_cvt_halfraw_to_fp8(lane, saturate, fp8_interpretation);

    /* High lane goes to bits 15..8, low lane to bits 7..0. */
    return (__nv_fp8x2_storage_t)(((unsigned int)high_byte << 8U) |
                                  (unsigned int)low_byte);
}
|
| 405 |
+
|
| 406 |
+
/* Widens a raw bfloat16 value to float by placing its 16 bits in the upper
 * half of a 32-bit word (lower half zero) and reinterpreting that word as a
 * float. */
__CUDA_HOSTDEVICE_FP8_DECL__ float
__internal_bf16raw_to_float(const __nv_bfloat16_raw x) {
    unsigned int word = (unsigned int)x.x;
    word <<= 16U;

    float value;
    /* Bit-exact reinterpretation; std:: qualification is only needed for
     * plain C++ host compilers. */
#if defined(__CUDACC__) || (!defined __cplusplus)
    (void)memcpy(&value, &word, sizeof(word));
#else
    (void)std::memcpy(&value, &word, sizeof(word));
#endif
    return value;
}
|
| 417 |
+
|
| 418 |
+
/* Narrows a float to raw bfloat16 by keeping only the upper 16 bits of its
 * bit pattern; the lower 16 bits are discarded without rounding. */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw
__internal_float_to_bf16raw_rz(const float x) {
    unsigned int word;
    /* Bit-exact reinterpretation of the float as a 32-bit word. */
#if defined(__CUDACC__) || (!defined __cplusplus)
    (void)memcpy(&word, &x, sizeof(x));
#else
    (void)std::memcpy(&word, &x, sizeof(x));
#endif

    __nv_bfloat16_raw truncated;
    truncated.x = (unsigned short int)(word >> 16U);
    return truncated;
}
|
| 430 |
+
|
| 431 |
+
/* Converts one raw bfloat16 value to fp8 storage by widening it to float and
 * delegating to __nv_cvt_float_to_fp8 with the caller's saturation mode and
 * fp8 interpretation. */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
    const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
    const __nv_fp8_interpretation_t fp8_interpretation) {
    const float widened = __internal_bf16raw_to_float(x);
    return __nv_cvt_float_to_fp8(widened, saturate, fp8_interpretation);
}
|
| 439 |
+
|
| 440 |
+
/* Converts a pair of raw bfloat16 values to packed fp8x2 storage.
 * Each lane is converted with __nv_cvt_bfloat16raw_to_fp8; x.y ends up in the
 * high byte of the result and x.x in the low byte. */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
__nv_cvt_bfloat16raw2_to_fp8x2(
    const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
    const __nv_fp8_interpretation_t fp8_interpretation) {
    __nv_bfloat16_raw lane;

    /* High lane first, then low lane. */
    lane.x = x.y;
    const __nv_fp8_storage_t high_byte =
        __nv_cvt_bfloat16raw_to_fp8(lane, saturate, fp8_interpretation);
    lane.x = x.x;
    const __nv_fp8_storage_t low_byte =
        __nv_cvt_bfloat16raw_to_fp8(lane, saturate, fp8_interpretation);

    return (__nv_fp8x2_storage_t)(((unsigned int)high_byte << 8U) |
                                  (unsigned int)low_byte);
}
|
| 456 |
+
|
| 457 |
+
/* Forward declaration: the scalar conversion below uses the packed variant on
 * architectures that have the hardware instruction. */
__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
                           const __nv_fp8_interpretation_t fp8_interpretation);

/* Converts one fp8 storage value to raw half precision.
 *
 * x                  - fp8 bits to convert.
 * fp8_interpretation - __NV_E5M2 selects e5m2; anything else is treated as
 *                      e4m3.
 *
 * NaN inputs of either kind produce the half pattern 0x7FFF (sign dropped).
 * For __CUDA_ARCH__ >= 890 the packed conversion is used and its low lane is
 * returned; otherwise the conversion is done with integer bit manipulation. */
__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
                        const __nv_fp8_interpretation_t fp8_interpretation) {
    __half_raw res;
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
    const __half2_raw pair = __nv_cvt_fp8x2_to_halfraw2(
        (__nv_fp8x2_storage_t)x, fp8_interpretation);
    res.x = pair.x;
#else
    /* Move the fp8 byte into the upper byte of a 16-bit word. */
    unsigned short int bits =
        (unsigned short int)((unsigned short int)x << 8U);

    if (fp8_interpretation == __NV_E5M2) {
        /* The shifted e5m2 pattern is used as the half pattern directly;
         * only NaN is replaced by the canonical NaN. */
        if ((bits & 0x7FFFU) > 0x7C00U) {
            bits = 0x7FFFU;
        }
    } else if ((bits & 0x7F00U) == 0x7F00U) {
        /* e4m3 NaN (magnitude bits all ones): canonical half NaN, sign
         * discarded. */
        bits = 0x7FFFU;
    } else {
        /* e4m3 finite value: split the fields and rebuild a half pattern. */
        const unsigned short int sign_bit =
            (unsigned short int)(bits & 0x8000U);
        /* Exponent field moved into half position and increased by 8
         * (0x2000 == 8 << 10). */
        unsigned short int exp_field =
            (unsigned short int)(((bits & 0x7800U) >> 1U) + 0x2000U);
        unsigned short int frac_field =
            (unsigned short int)((bits & 0x0700U) >> 1U);

        if (exp_field == 0x2000U) {
            /* Source exponent bits were zero: zero or denormal. */
            if (frac_field == 0U) {
                exp_field = 0U;
            } else {
                /* Normalize: shift until the leading bit reaches 0x0400,
                 * lowering the exponent once per extra shift. */
                frac_field = (unsigned short int)(frac_field << 1U);
                while ((frac_field & 0x0400U) == 0U) {
                    frac_field = (unsigned short int)(frac_field << 1U);
                    exp_field = (unsigned short int)(exp_field - 0x0400U);
                }
                /* Drop the now-implicit leading bit. */
                frac_field &= 0x03FFU;
            }
        }

        bits = (unsigned short int)((sign_bit | exp_field) | frac_field);
    }
    res.x = bits;
#endif
    return res;
}
|
| 512 |
+
|
| 513 |
+
/* Converts packed fp8x2 storage to a pair of raw half-precision values.
 *
 * x                  - two fp8 values; the low byte maps to result.x and the
 *                      high byte to result.y.
 * fp8_interpretation - __NV_E5M2 selects e5m2; anything else is treated as
 *                      e4m3.
 *
 * For __CUDA_ARCH__ >= 890 a single packed cvt instruction is used; otherwise
 * each byte is converted with __nv_cvt_fp8_to_halfraw. */
__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
                           const __nv_fp8_interpretation_t fp8_interpretation) {
    __half2_raw res;
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
    unsigned int packed_halves;
    if (fp8_interpretation == __NV_E5M2) {
        asm("{cvt.rn.f16x2.e5m2x2 %0, %1;}\n" : "=r"(packed_halves) : "h"(x));
    } else {
        asm("{cvt.rn.f16x2.e4m3x2 %0, %1;}\n" : "=r"(packed_halves) : "h"(x));
    }
    /* Reinterpret the 32-bit register as two half lanes. */
    (void)memcpy(&res, &packed_halves, sizeof(packed_halves));
#else
    const __nv_fp8_storage_t low_byte = (__nv_fp8_storage_t)x;
    const __nv_fp8_storage_t high_byte = (__nv_fp8_storage_t)(x >> 8U);

    const __half_raw low_half =
        __nv_cvt_fp8_to_halfraw(low_byte, fp8_interpretation);
    const __half_raw high_half =
        __nv_cvt_fp8_to_halfraw(high_byte, fp8_interpretation);

    res.x = low_half.x;
    res.y = high_half.x;
#endif
    return res;
}
|
| 534 |
+
|
| 535 |
+
/* All other definitions in this file are only visible to C++ compilers */
|
| 536 |
+
#if defined(__cplusplus)
|
| 537 |
+
|
| 538 |
+
/**
|
| 539 |
+
* \defgroup CUDA_MATH_FP8_E5M2_STRUCT C++ struct for handling fp8 data type of e5m2 kind.
|
| 540 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 541 |
+
*/
|
| 542 |
+
|
| 543 |
+
/**
|
| 544 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 545 |
+
* \brief __nv_fp8_e5m2 datatype
|
| 546 |
+
*
|
| 547 |
+
* \details This structure implements the datatype for handling
|
| 548 |
+
* \p fp8 floating-point numbers of \p e5m2 kind:
|
| 549 |
+
* with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
|
| 550 |
+
*
|
| 551 |
+
* The structure implements converting constructors and operators.
|
| 552 |
+
*/
|
| 553 |
+
struct __CUDA_ALIGN__(1) __nv_fp8_e5m2 {
|
| 554 |
+
public:
|
| 555 |
+
/**
|
| 556 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 557 |
+
* Storage variable contains the \p fp8 floating-point data.
|
| 558 |
+
*/
|
| 559 |
+
__nv_fp8_storage_t __x;
|
| 560 |
+
|
| 561 |
+
/**
|
| 562 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 563 |
+
* Constructor by default.
|
| 564 |
+
*/
|
| 565 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 566 |
+
__nv_fp8_e5m2() = default;
|
| 567 |
+
#else
|
| 568 |
+
__CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2() {}
|
| 569 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 570 |
+
|
| 571 |
+
#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
|
| 572 |
+
|
| 573 |
+
/* Construct from wider FP types */
|
| 574 |
+
/* Note we do avoid constructor init-list because of special host/device
|
| 575 |
+
* compilation rules */
|
| 576 |
+
|
| 577 |
+
/**
|
| 578 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 579 |
+
* Constructor from \p __half data type, relies on \p __NV_SATFINITE
|
| 580 |
+
* behavior for out-of-range values.
|
| 581 |
+
*/
|
| 582 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __half f) {
|
| 583 |
+
__x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
|
| 584 |
+
__NV_SATFINITE, __NV_E5M2);
|
| 585 |
+
}
|
| 586 |
+
/**
|
| 587 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 588 |
+
* Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
|
| 589 |
+
* behavior for out-of-range values.
|
| 590 |
+
*/
|
| 591 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __nv_bfloat16 f) {
|
| 592 |
+
__x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
|
| 593 |
+
__NV_SATFINITE, __NV_E5M2);
|
| 594 |
+
}
|
| 595 |
+
/**
|
| 596 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 597 |
+
* Constructor from \p float data type, relies on \p __NV_SATFINITE behavior
|
| 598 |
+
* for out-of-range values.
|
| 599 |
+
*/
|
| 600 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const float f) {
|
| 601 |
+
__x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
|
| 602 |
+
}
|
| 603 |
+
/**
|
| 604 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 605 |
+
* Constructor from \p double data type, relies on \p __NV_SATFINITE
|
| 606 |
+
* behavior for out-of-range values.
|
| 607 |
+
*/
|
| 608 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const double f) {
|
| 609 |
+
__x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
|
| 610 |
+
}
|
| 611 |
+
|
| 612 |
+
/* Converts from integral */
|
| 613 |
+
|
| 614 |
+
/**
|
| 615 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 616 |
+
* Constructor from \p unsigned \p short \p int data type, relies on \p
|
| 617 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 618 |
+
*/
|
| 619 |
+
explicit __CUDA_HOSTDEVICE_FP8__
|
| 620 |
+
__nv_fp8_e5m2(const unsigned short int val) {
|
| 621 |
+
__x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
|
| 622 |
+
}
|
| 623 |
+
/**
|
| 624 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 625 |
+
* Constructor from \p unsigned \p int data type, relies on \p
|
| 626 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 627 |
+
*/
|
| 628 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const unsigned int val) {
|
| 629 |
+
__x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
|
| 630 |
+
}
|
| 631 |
+
/**
|
| 632 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 633 |
+
* Constructor from \p unsigned \p long \p int data type, relies on \p
|
| 634 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 635 |
+
*/
|
| 636 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const unsigned long int val) {
|
| 637 |
+
__x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
|
| 638 |
+
}
|
| 639 |
+
/**
|
| 640 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 641 |
+
* Constructor from \p unsigned \p long \p long \p int data type, relies on
|
| 642 |
+
* \p __NV_SATFINITE behavior for out-of-range values.
|
| 643 |
+
*/
|
| 644 |
+
explicit __CUDA_HOSTDEVICE_FP8__
|
| 645 |
+
__nv_fp8_e5m2(const unsigned long long int val) {
|
| 646 |
+
__x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
|
| 647 |
+
}
|
| 648 |
+
|
| 649 |
+
/**
|
| 650 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 651 |
+
* Constructor from \p short \p int data type.
|
| 652 |
+
*/
|
| 653 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const short int val) {
|
| 654 |
+
__x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
|
| 655 |
+
}
|
| 656 |
+
/**
|
| 657 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 658 |
+
* Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
|
| 659 |
+
* for out-of-range values.
|
| 660 |
+
*/
|
| 661 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const int val) {
|
| 662 |
+
__x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
|
| 663 |
+
}
|
| 664 |
+
/**
|
| 665 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 666 |
+
* Constructor from \p long \p int data type, relies on \p __NV_SATFINITE behavior
|
| 667 |
+
* for out-of-range values.
|
| 668 |
+
*/
|
| 669 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const long int val) {
|
| 670 |
+
__x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
|
| 671 |
+
}
|
| 672 |
+
/**
|
| 673 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 674 |
+
* Constructor from \p long \p long \p int data type, relies on \p
|
| 675 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 676 |
+
*/
|
| 677 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const long long int val) {
|
| 678 |
+
__x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
|
| 679 |
+
}
|
| 680 |
+
|
| 681 |
+
#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
|
| 682 |
+
/* Widening FP converts */
|
| 683 |
+
/**
|
| 684 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 685 |
+
* Conversion operator to \p __half data type.
|
| 686 |
+
*/
|
| 687 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
|
| 688 |
+
return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E5M2));
|
| 689 |
+
}
|
| 690 |
+
/**
|
| 691 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 692 |
+
* Conversion operator to \p float data type.
|
| 693 |
+
*/
|
| 694 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
|
| 695 |
+
return __internal_halfraw_to_float(
|
| 696 |
+
__nv_cvt_fp8_to_halfraw(__x, __NV_E5M2));
|
| 697 |
+
}
|
| 698 |
+
/**
|
| 699 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 700 |
+
* Conversion operator to \p __nv_bfloat16 data type.
|
| 701 |
+
*/
|
| 702 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
|
| 703 |
+
return static_cast<__nv_bfloat16>(
|
| 704 |
+
__internal_float_to_bf16raw_rz(float(*this)));
|
| 705 |
+
}
|
| 706 |
+
/**
|
| 707 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 708 |
+
* Conversion operator to \p double data type.
|
| 709 |
+
*/
|
| 710 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
|
| 711 |
+
return static_cast<double>(float(*this));
|
| 712 |
+
}
|
| 713 |
+
|
| 714 |
+
/* Convert to integral */
|
| 715 |
+
|
| 716 |
+
/**
|
| 717 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 718 |
+
* Conversion operator to \p unsigned \p char data type.
|
| 719 |
+
* Clamps negative and too large inputs to the output range.
|
| 720 |
+
* \p NaN inputs convert to \p zero.
|
| 721 |
+
*/
|
| 722 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
|
| 723 |
+
unsigned char i;
|
| 724 |
+
const float f = float(*this);
|
| 725 |
+
const unsigned char max_val = 0xFFU;
|
| 726 |
+
const unsigned char min_val = 0U;
|
| 727 |
+
const unsigned char bits = (*this).__x;
|
| 728 |
+
// saturation fixup
|
| 729 |
+
if ((bits & 0x7FU) > 0x7CU) {
|
| 730 |
+
// NaN
|
| 731 |
+
i = 0;
|
| 732 |
+
} else if (f > static_cast<float>(max_val)) {
|
| 733 |
+
// saturate maximum
|
| 734 |
+
i = max_val;
|
| 735 |
+
} else if (f < static_cast<float>(min_val)) {
|
| 736 |
+
// saturate minimum
|
| 737 |
+
i = min_val;
|
| 738 |
+
} else {
|
| 739 |
+
// normal value
|
| 740 |
+
i = static_cast<unsigned char>(f);
|
| 741 |
+
}
|
| 742 |
+
return i;
|
| 743 |
+
}
|
| 744 |
+
/**
|
| 745 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 746 |
+
* Conversion operator to \p unsigned \p short \p int data type.
|
| 747 |
+
* Clamps negative and too large inputs to the output range.
|
| 748 |
+
* \p NaN inputs convert to \p zero.
|
| 749 |
+
*/
|
| 750 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
|
| 751 |
+
return __half2ushort_rz(__half(*this));
|
| 752 |
+
}
|
| 753 |
+
/**
|
| 754 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 755 |
+
* Conversion operator to \p unsigned \p int data type.
|
| 756 |
+
* Clamps negative and too large inputs to the output range.
|
| 757 |
+
* \p NaN inputs convert to \p zero.
|
| 758 |
+
*/
|
| 759 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
|
| 760 |
+
return __half2uint_rz(__half(*this));
|
| 761 |
+
}
|
| 762 |
+
/**
|
| 763 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 764 |
+
* Conversion operator to \p unsigned \p long \p int data type.
|
| 765 |
+
* Clamps negative and too large inputs to the output range.
|
| 766 |
+
* \p NaN inputs convert to \p zero if output type is 32-bit.
|
| 767 |
+
* \p NaN inputs convert to \p 0x8000000000000000ULL if output type is 64-bit.
|
| 768 |
+
*/
|
| 769 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long int() const {
|
| 770 |
+
unsigned long retval;
|
| 771 |
+
/* Suppress VS warning: warning C4127: conditional expression is constant */
|
| 772 |
+
#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
|
| 773 |
+
#pragma warning (push)
|
| 774 |
+
#pragma warning (disable: 4127)
|
| 775 |
+
#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
|
| 776 |
+
if (sizeof(unsigned long) == sizeof(unsigned long long))
|
| 777 |
+
#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
|
| 778 |
+
#pragma warning (pop)
|
| 779 |
+
#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
|
| 780 |
+
{
|
| 781 |
+
retval = static_cast<unsigned long>(__half2ull_rz(__half(*this)));
|
| 782 |
+
}
|
| 783 |
+
else
|
| 784 |
+
{
|
| 785 |
+
retval = static_cast<unsigned long>(__half2uint_rz(__half(*this)));
|
| 786 |
+
}
|
| 787 |
+
return retval;
|
| 788 |
+
}
|
| 789 |
+
/**
|
| 790 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 791 |
+
* Conversion operator to \p unsigned \p long \p long \p int data type.
|
| 792 |
+
* Clamps negative and too large inputs to the output range.
|
| 793 |
+
* \p NaN inputs convert to \p 0x8000000000000000ULL.
|
| 794 |
+
*/
|
| 795 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
|
| 796 |
+
return __half2ull_rz(__half(*this));
|
| 797 |
+
}
|
| 798 |
+
|
| 799 |
+
/**
|
| 800 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 801 |
+
* Conversion operator to \p signed \p char data type.
|
| 802 |
+
* Clamps too large inputs to the output range.
|
| 803 |
+
* \p NaN inputs convert to \p zero.
|
| 804 |
+
*/
|
| 805 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
|
| 806 |
+
signed char i;
|
| 807 |
+
const float f = float(*this);
|
| 808 |
+
const signed char max_val = (signed char)0x7FU;
|
| 809 |
+
const signed char min_val = (signed char)0x80U;
|
| 810 |
+
const unsigned char bits = (*this).__x;
|
| 811 |
+
// saturation fixup
|
| 812 |
+
if ((bits & 0x7FU) > 0x7CU) {
|
| 813 |
+
// NaN
|
| 814 |
+
i = 0;
|
| 815 |
+
} else if (f > static_cast<float>(max_val)) {
|
| 816 |
+
// saturate maximum
|
| 817 |
+
i = max_val;
|
| 818 |
+
} else if (f < static_cast<float>(min_val)) {
|
| 819 |
+
// saturate minimum
|
| 820 |
+
i = min_val;
|
| 821 |
+
} else {
|
| 822 |
+
// normal value
|
| 823 |
+
i = static_cast<signed char>(f);
|
| 824 |
+
}
|
| 825 |
+
return i;
|
| 826 |
+
}
|
| 827 |
+
|
| 828 |
+
/**
|
| 829 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 830 |
+
* Conversion operator to an implementation defined \p char data type.
|
| 831 |
+
*
|
| 832 |
+
* Detects signedness of the \p char type and proceeds accordingly, see
|
| 833 |
+
* further details in signed and unsigned char operators.
|
| 834 |
+
|
| 835 |
+
* Clamps inputs to the output range.
|
| 836 |
+
* \p NaN inputs convert to \p zero.
|
| 837 |
+
*/
|
| 838 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator char() const {
|
| 839 |
+
char value;
|
| 840 |
+
/* Suppress VS warning: warning C4127: conditional expression is constant */
|
| 841 |
+
#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
|
| 842 |
+
#pragma warning (push)
|
| 843 |
+
#pragma warning (disable: 4127)
|
| 844 |
+
#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
|
| 845 |
+
if (((char)-1) < (char)0)
|
| 846 |
+
#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
|
| 847 |
+
#pragma warning (pop)
|
| 848 |
+
#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
|
| 849 |
+
{
|
| 850 |
+
value = static_cast<char>(static_cast<signed char>(*this));
|
| 851 |
+
}
|
| 852 |
+
else
|
| 853 |
+
{
|
| 854 |
+
value = static_cast<char>(static_cast<unsigned char>(*this));
|
| 855 |
+
}
|
| 856 |
+
return value;
|
| 857 |
+
}
|
| 858 |
+
|
| 859 |
+
/**
|
| 860 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 861 |
+
* Conversion operator to \p short \p int data type.
|
| 862 |
+
* Clamps too large inputs to the output range.
|
| 863 |
+
* \p NaN inputs convert to \p zero.
|
| 864 |
+
*/
|
| 865 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
|
| 866 |
+
return __half2short_rz(__half(*this));
|
| 867 |
+
}
|
| 868 |
+
/**
|
| 869 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 870 |
+
* Conversion operator to \p int data type.
|
| 871 |
+
* Clamps too large inputs to the output range.
|
| 872 |
+
* \p NaN inputs convert to \p zero.
|
| 873 |
+
*/
|
| 874 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
|
| 875 |
+
return __half2int_rz(__half(*this));
|
| 876 |
+
}
|
| 877 |
+
/**
|
| 878 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 879 |
+
* Conversion operator to \p long \p int data type.
|
| 880 |
+
* Clamps too large inputs to the output range.
|
| 881 |
+
* \p NaN inputs convert to \p zero if output type is 32-bit.
|
| 882 |
+
* \p NaN inputs convert to \p 0x8000000000000000ULL if output type is 64-bit.
|
| 883 |
+
*/
|
| 884 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator long int() const {
|
| 885 |
+
long retval;
|
| 886 |
+
/* Suppress VS warning: warning C4127: conditional expression is constant */
|
| 887 |
+
#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
|
| 888 |
+
#pragma warning (push)
|
| 889 |
+
#pragma warning (disable: 4127)
|
| 890 |
+
#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
|
| 891 |
+
if (sizeof(long) == sizeof(long long))
|
| 892 |
+
#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
|
| 893 |
+
#pragma warning (pop)
|
| 894 |
+
#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
|
| 895 |
+
{
|
| 896 |
+
retval = static_cast<long>(__half2ll_rz(__half(*this)));
|
| 897 |
+
}
|
| 898 |
+
else
|
| 899 |
+
{
|
| 900 |
+
retval = static_cast<long>(__half2int_rz(__half(*this)));
|
| 901 |
+
}
|
| 902 |
+
return retval;
|
| 903 |
+
}
|
| 904 |
+
/**
|
| 905 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 906 |
+
* Conversion operator to \p long \p long \p int data type.
|
| 907 |
+
* Clamps too large inputs to the output range.
|
| 908 |
+
* \p NaN inputs convert to \p 0x8000000000000000LL.
|
| 909 |
+
*/
|
| 910 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
|
| 911 |
+
return __half2ll_rz(__half(*this));
|
| 912 |
+
}
|
| 913 |
+
|
| 914 |
+
/**
|
| 915 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 916 |
+
* Conversion operator to \p bool data type.
|
| 917 |
+
* +0 and -0 inputs convert to \p false.
|
| 918 |
+
* Non-zero inputs convert to \p true.
|
| 919 |
+
*/
|
| 920 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
|
| 921 |
+
return (__x & 0x7FU) != 0U;
|
| 922 |
+
}
|
| 923 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
|
| 924 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
|
| 925 |
+
};
|
| 926 |
+
|
| 927 |
+
/**
|
| 928 |
+
* \defgroup CUDA_MATH_FP8X2_E5M2_STRUCT C++ struct for handling vector type of two fp8 values of e5m2 kind.
|
| 929 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 930 |
+
*/
|
| 931 |
+
|
| 932 |
+
/**
|
| 933 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 934 |
+
* \brief __nv_fp8x2_e5m2 datatype
|
| 935 |
+
*
|
| 936 |
+
* \details This structure implements the datatype for handling two
|
| 937 |
+
* \p fp8 floating-point numbers of \p e5m2 kind each:
|
| 938 |
+
* with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
|
| 939 |
+
*
|
| 940 |
+
* The structure implements converting constructors and operators.
|
| 941 |
+
*/
|
| 942 |
+
struct __CUDA_ALIGN__(2) __nv_fp8x2_e5m2 {
|
| 943 |
+
public:
|
| 944 |
+
/**
|
| 945 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 946 |
+
* Storage variable contains the vector of two \p fp8 floating-point data
|
| 947 |
+
* values.
|
| 948 |
+
*/
|
| 949 |
+
__nv_fp8x2_storage_t __x;
|
| 950 |
+
|
| 951 |
+
/**
|
| 952 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 953 |
+
* Constructor by default.
|
| 954 |
+
*/
|
| 955 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 956 |
+
__nv_fp8x2_e5m2() = default;
|
| 957 |
+
#else
|
| 958 |
+
__CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2() {}
|
| 959 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 960 |
+
|
| 961 |
+
#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
|
| 962 |
+
|
| 963 |
+
/* Construct from wider types */
|
| 964 |
+
|
| 965 |
+
/**
|
| 966 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 967 |
+
* Constructor from \p __half2 data type, relies on \p __NV_SATFINITE
|
| 968 |
+
* behavior for out-of-range values.
|
| 969 |
+
*/
|
| 970 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __half2 f) {
|
| 971 |
+
__x = __nv_cvt_halfraw2_to_fp8x2(static_cast<__half2_raw>(f),
|
| 972 |
+
__NV_SATFINITE, __NV_E5M2);
|
| 973 |
+
}
|
| 974 |
+
/**
|
| 975 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 976 |
+
* Constructor from \p __nv_bfloat162 data type, relies on \p __NV_SATFINITE
|
| 977 |
+
* behavior for out-of-range values.
|
| 978 |
+
*/
|
| 979 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __nv_bfloat162 f) {
|
| 980 |
+
__x = __nv_cvt_bfloat16raw2_to_fp8x2(static_cast<__nv_bfloat162_raw>(f),
|
| 981 |
+
__NV_SATFINITE, __NV_E5M2);
|
| 982 |
+
}
|
| 983 |
+
/**
|
| 984 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 985 |
+
* Constructor from \p float2 data type, relies on \p __NV_SATFINITE
|
| 986 |
+
* behavior for out-of-range values.
|
| 987 |
+
*/
|
| 988 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const float2 f) {
|
| 989 |
+
__x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
|
| 990 |
+
}
|
| 991 |
+
/**
|
| 992 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 993 |
+
* Constructor from \p double2 data type, relies on \p __NV_SATFINITE
|
| 994 |
+
* behavior for out-of-range values.
|
| 995 |
+
*/
|
| 996 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const double2 f) {
|
| 997 |
+
__x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
|
| 998 |
+
}
|
| 999 |
+
|
| 1000 |
+
#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
|
| 1001 |
+
/* Widening converts */
|
| 1002 |
+
/**
|
| 1003 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 1004 |
+
* Conversion operator to \p __half2 data type.
|
| 1005 |
+
*/
|
| 1006 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
|
| 1007 |
+
return static_cast<__half2>(__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2));
|
| 1008 |
+
}
|
| 1009 |
+
/**
|
| 1010 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 1011 |
+
* Conversion operator to \p float2 data type.
|
| 1012 |
+
*/
|
| 1013 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
|
| 1014 |
+
return __internal_halfraw2_to_float2(
|
| 1015 |
+
__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2));
|
| 1016 |
+
}
|
| 1017 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
|
| 1018 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
|
| 1019 |
+
};
|
| 1020 |
+
|
| 1021 |
+
/* Packs two 16-bit values into one 32-bit word: src_lo occupies bits 15..0
 * and src_hi occupies bits 31..16. Device compilation uses a single mov.b32
 * instruction; host compilation uses shift and OR. */
__CUDA_HOSTDEVICE_FP8_DECL__ unsigned int
__internal_pack_u16x2_to_u32(const unsigned short int src_lo,
                             const unsigned short int src_hi) {
#if (defined __CUDACC__) && (defined __CUDA_ARCH__)
    unsigned int packed;
    asm("{ mov.b32 %0, {%1,%2};}\n" : "=r"(packed) : "h"(src_lo), "h"(src_hi));
    return packed;
#else
    const unsigned int upper = static_cast<unsigned int>(src_hi) << 16U;
    const unsigned int lower = static_cast<unsigned int>(src_lo);
    return upper | lower;
#endif
}
|
| 1033 |
+
|
| 1034 |
+
/**
|
| 1035 |
+
* \defgroup CUDA_MATH_FP8X4_E5M2_STRUCT C++ struct for handling vector type of four fp8 values of e5m2 kind.
|
| 1036 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 1037 |
+
*/
|
| 1038 |
+
|
| 1039 |
+
/**
|
| 1040 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 1041 |
+
* \brief __nv_fp8x4_e5m2 datatype
|
| 1042 |
+
*
|
| 1043 |
+
* \details This structure implements the datatype for handling four
|
| 1044 |
+
* \p fp8 floating-point numbers of \p e5m2 kind each:
|
| 1045 |
+
* with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
|
| 1046 |
+
*
|
| 1047 |
+
* The structure implements converting constructors and operators.
|
| 1048 |
+
*/
|
| 1049 |
+
struct __CUDA_ALIGN__(4) __nv_fp8x4_e5m2 {
|
| 1050 |
+
public:
|
| 1051 |
+
/**
|
| 1052 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 1053 |
+
* Storage variable contains the vector of four \p fp8 floating-point data
|
| 1054 |
+
* values.
|
| 1055 |
+
*/
|
| 1056 |
+
__nv_fp8x4_storage_t __x;
|
| 1057 |
+
|
| 1058 |
+
/**
|
| 1059 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 1060 |
+
* Constructor by default.
|
| 1061 |
+
*/
|
| 1062 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 1063 |
+
__nv_fp8x4_e5m2() = default;
|
| 1064 |
+
#else
|
| 1065 |
+
__CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2() {}
|
| 1066 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 1067 |
+
|
| 1068 |
+
#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
|
| 1069 |
+
|
| 1070 |
+
/* Construct from wider types */
|
| 1071 |
+
|
| 1072 |
+
/**
|
| 1073 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 1074 |
+
* Constructor from a pair of \p __half2 data type values,
|
| 1075 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1076 |
+
*/
|
| 1077 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __half2 flo,
|
| 1078 |
+
const __half2 fhi) {
|
| 1079 |
+
const __nv_fp8x2_storage_t rlo = __nv_cvt_halfraw2_to_fp8x2(
|
| 1080 |
+
static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E5M2);
|
| 1081 |
+
const __nv_fp8x2_storage_t rhi = __nv_cvt_halfraw2_to_fp8x2(
|
| 1082 |
+
static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E5M2);
|
| 1083 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1084 |
+
}
|
| 1085 |
+
/**
|
| 1086 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 1087 |
+
* Constructor from a pair of \p __nv_bfloat162 data type values,
|
| 1088 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1089 |
+
*/
|
| 1090 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __nv_bfloat162 flo,
|
| 1091 |
+
const __nv_bfloat162 fhi) {
|
| 1092 |
+
const __nv_fp8x2_storage_t rlo = __nv_cvt_bfloat16raw2_to_fp8x2(
|
| 1093 |
+
static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E5M2);
|
| 1094 |
+
const __nv_fp8x2_storage_t rhi = __nv_cvt_bfloat16raw2_to_fp8x2(
|
| 1095 |
+
static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E5M2);
|
| 1096 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1097 |
+
}
|
| 1098 |
+
/**
|
| 1099 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 1100 |
+
* Constructor from \p float4 vector data type,
|
| 1101 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1102 |
+
*/
|
| 1103 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const float4 f) {
|
| 1104 |
+
const float2 flo = {f.x, f.y};
|
| 1105 |
+
const float2 fhi = {f.z, f.w};
|
| 1106 |
+
const __nv_fp8x2_storage_t rlo =
|
| 1107 |
+
__nv_cvt_float2_to_fp8x2(flo, __NV_SATFINITE, __NV_E5M2);
|
| 1108 |
+
const __nv_fp8x2_storage_t rhi =
|
| 1109 |
+
__nv_cvt_float2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E5M2);
|
| 1110 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1111 |
+
}
|
| 1112 |
+
/**
|
| 1113 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 1114 |
+
* Constructor from \p double4 vector data type,
|
| 1115 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1116 |
+
*/
|
| 1117 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const double4 f) {
|
| 1118 |
+
const double2 flo = {f.x, f.y};
|
| 1119 |
+
const double2 fhi = {f.z, f.w};
|
| 1120 |
+
const __nv_fp8x2_storage_t rlo =
|
| 1121 |
+
__nv_cvt_double2_to_fp8x2(flo, __NV_SATFINITE, __NV_E5M2);
|
| 1122 |
+
const __nv_fp8x2_storage_t rhi =
|
| 1123 |
+
__nv_cvt_double2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E5M2);
|
| 1124 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1125 |
+
}
|
| 1126 |
+
|
| 1127 |
+
#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
|
| 1128 |
+
/* Widening converts */
|
| 1129 |
+
|
| 1130 |
+
/**
|
| 1131 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 1132 |
+
* Conversion operator to \p float4 vector data type.
|
| 1133 |
+
*/
|
| 1134 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
|
| 1135 |
+
const __nv_fp8x2_storage_t slo = static_cast<__nv_fp8x2_storage_t>(__x);
|
| 1136 |
+
const __nv_fp8x2_storage_t shi =
|
| 1137 |
+
static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
|
| 1138 |
+
float2 rlo = __internal_halfraw2_to_float2(
|
| 1139 |
+
__nv_cvt_fp8x2_to_halfraw2(slo, __NV_E5M2));
|
| 1140 |
+
float2 rhi = __internal_halfraw2_to_float2(
|
| 1141 |
+
__nv_cvt_fp8x2_to_halfraw2(shi, __NV_E5M2));
|
| 1142 |
+
float4 res = {rlo.x, rlo.y, rhi.x, rhi.y};
|
| 1143 |
+
return res;
|
| 1144 |
+
}
|
| 1145 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
|
| 1146 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
|
| 1147 |
+
};
|
| 1148 |
+
|
| 1149 |
+
/**
|
| 1150 |
+
* \defgroup CUDA_MATH_FP8_E4M3_STRUCT C++ struct for handling fp8 data type of e4m3 kind.
|
| 1151 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 1152 |
+
*/
|
| 1153 |
+
|
| 1154 |
+
/**
|
| 1155 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1156 |
+
* \brief __nv_fp8_e4m3 datatype
|
| 1157 |
+
*
|
| 1158 |
+
* \details This structure implements the datatype for storing
|
| 1159 |
+
* \p fp8 floating-point numbers of \p e4m3 kind:
|
| 1160 |
+
* with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
|
| 1161 |
+
* The encoding doesn't support Infinity.
|
| 1162 |
+
* NaNs are limited to 0x7F and 0xFF values.
|
| 1163 |
+
*
|
| 1164 |
+
* The structure implements converting constructors and operators.
|
| 1165 |
+
*/
|
| 1166 |
+
struct __CUDA_ALIGN__(1) __nv_fp8_e4m3 {
|
| 1167 |
+
public:
|
| 1168 |
+
/**
|
| 1169 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1170 |
+
* Storage variable contains the \p fp8 floating-point data.
|
| 1171 |
+
*/
|
| 1172 |
+
__nv_fp8_storage_t __x;
|
| 1173 |
+
|
| 1174 |
+
/**
|
| 1175 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1176 |
+
* Constructor by default.
|
| 1177 |
+
*/
|
| 1178 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 1179 |
+
__nv_fp8_e4m3() = default;
|
| 1180 |
+
#else
|
| 1181 |
+
__CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3() {}
|
| 1182 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 1183 |
+
|
| 1184 |
+
#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
|
| 1185 |
+
|
| 1186 |
+
/* Construct from wider FP types */
|
| 1187 |
+
/* Note we do avoid constructor init-list because of special host/device
|
| 1188 |
+
* compilation rules */
|
| 1189 |
+
|
| 1190 |
+
/**
|
| 1191 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1192 |
+
* Constructor from \p __half data type, relies on \p __NV_SATFINITE
|
| 1193 |
+
* behavior for out-of-range values.
|
| 1194 |
+
*/
|
| 1195 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __half f) {
|
| 1196 |
+
__x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
|
| 1197 |
+
__NV_SATFINITE, __NV_E4M3);
|
| 1198 |
+
}
|
| 1199 |
+
/**
|
| 1200 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1201 |
+
* Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
|
| 1202 |
+
* behavior for out-of-range values.
|
| 1203 |
+
*/
|
| 1204 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __nv_bfloat16 f) {
|
| 1205 |
+
__x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
|
| 1206 |
+
__NV_SATFINITE, __NV_E4M3);
|
| 1207 |
+
}
|
| 1208 |
+
/**
|
| 1209 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1210 |
+
* Constructor from \p float data type, relies on \p __NV_SATFINITE behavior
|
| 1211 |
+
* for out-of-range values.
|
| 1212 |
+
*/
|
| 1213 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const float f) {
|
| 1214 |
+
__x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
|
| 1215 |
+
}
|
| 1216 |
+
/**
|
| 1217 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1218 |
+
* Constructor from \p double data type, relies on \p __NV_SATFINITE
|
| 1219 |
+
* behavior for out-of-range values.
|
| 1220 |
+
*/
|
| 1221 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const double f) {
|
| 1222 |
+
__x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
|
| 1223 |
+
}
|
| 1224 |
+
|
| 1225 |
+
/* Converts from integral */
|
| 1226 |
+
|
| 1227 |
+
/**
|
| 1228 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1229 |
+
* Constructor from \p unsigned \p short \p int data type, relies on \p
|
| 1230 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 1231 |
+
*/
|
| 1232 |
+
explicit __CUDA_HOSTDEVICE_FP8__
|
| 1233 |
+
__nv_fp8_e4m3(const unsigned short int val) {
|
| 1234 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1235 |
+
}
|
| 1236 |
+
/**
|
| 1237 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1238 |
+
* Constructor from \p unsigned \p int data type, relies on \p
|
| 1239 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 1240 |
+
*/
|
| 1241 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const unsigned int val) {
|
| 1242 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1243 |
+
}
|
| 1244 |
+
/**
|
| 1245 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1246 |
+
* Constructor from \p unsigned \p long \p int data type, relies on \p
|
| 1247 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 1248 |
+
*/
|
| 1249 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const unsigned long int val) {
|
| 1250 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1251 |
+
}
|
| 1252 |
+
/**
|
| 1253 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1254 |
+
* Constructor from \p unsigned \p long \p long \p int data type, relies on
|
| 1255 |
+
* \p __NV_SATFINITE behavior for out-of-range values.
|
| 1256 |
+
*/
|
| 1257 |
+
explicit __CUDA_HOSTDEVICE_FP8__
|
| 1258 |
+
__nv_fp8_e4m3(const unsigned long long int val) {
|
| 1259 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1260 |
+
}
|
| 1261 |
+
|
| 1262 |
+
/**
|
| 1263 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1264 |
+
* Constructor from \p short \p int data type, relies on \p
|
| 1265 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 1266 |
+
*/
|
| 1267 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const short int val) {
|
| 1268 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1269 |
+
}
|
| 1270 |
+
/**
|
| 1271 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1272 |
+
* Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
|
| 1273 |
+
* for out-of-range values.
|
| 1274 |
+
*/
|
| 1275 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const int val) {
|
| 1276 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1277 |
+
}
|
| 1278 |
+
/**
|
| 1279 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1280 |
+
* Constructor from \p long \p int data type, relies on \p
|
| 1281 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 1282 |
+
*/
|
| 1283 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const long int val) {
|
| 1284 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1285 |
+
}
|
| 1286 |
+
/**
|
| 1287 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1288 |
+
* Constructor from \p long \p long \p int data type, relies on \p
|
| 1289 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 1290 |
+
*/
|
| 1291 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const long long int val) {
|
| 1292 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1293 |
+
}
|
| 1294 |
+
|
| 1295 |
+
#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
|
| 1296 |
+
/* Widening FP converts */
|
| 1297 |
+
/**
|
| 1298 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1299 |
+
* Conversion operator to \p __half data type.
|
| 1300 |
+
*/
|
| 1301 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
|
| 1302 |
+
return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
|
| 1303 |
+
}
|
| 1304 |
+
/**
|
| 1305 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1306 |
+
* Conversion operator to \p float data type.
|
| 1307 |
+
*/
|
| 1308 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
|
| 1309 |
+
return __internal_halfraw_to_float(
|
| 1310 |
+
__nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
|
| 1311 |
+
}
|
| 1312 |
+
/**
|
| 1313 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1314 |
+
* Conversion operator to \p __nv_bfloat16 data type.
|
| 1315 |
+
*/
|
| 1316 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
|
| 1317 |
+
return static_cast<__nv_bfloat16>(
|
| 1318 |
+
__internal_float_to_bf16raw_rz(float(*this)));
|
| 1319 |
+
}
|
| 1320 |
+
/**
|
| 1321 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1322 |
+
* Conversion operator to \p double data type.
|
| 1323 |
+
*/
|
| 1324 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
|
| 1325 |
+
return static_cast<double>(float(*this));
|
| 1326 |
+
}
|
| 1327 |
+
|
| 1328 |
+
/* Convert to integral */
|
| 1329 |
+
|
| 1330 |
+
/**
|
| 1331 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1332 |
+
* Conversion operator to \p unsigned \p char data type.
|
| 1333 |
+
* Clamps negative and too large inputs to the output range.
|
| 1334 |
+
* \p NaN inputs convert to \p zero.
|
| 1335 |
+
*/
|
| 1336 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
|
| 1337 |
+
unsigned char i;
|
| 1338 |
+
const float f = float(*this);
|
| 1339 |
+
const unsigned char max_val = 0xFFU;
|
| 1340 |
+
const unsigned char min_val = 0U;
|
| 1341 |
+
const unsigned char bits = (*this).__x;
|
| 1342 |
+
// saturation fixup
|
| 1343 |
+
if ((bits & 0x7FU) == 0x7FU) {
|
| 1344 |
+
// NaN
|
| 1345 |
+
i = 0;
|
| 1346 |
+
} else if (f > static_cast<float>(max_val)) {
|
| 1347 |
+
// saturate maximum
|
| 1348 |
+
i = max_val;
|
| 1349 |
+
} else if (f < static_cast<float>(min_val)) {
|
| 1350 |
+
// saturate minimum
|
| 1351 |
+
i = min_val;
|
| 1352 |
+
} else {
|
| 1353 |
+
// normal value
|
| 1354 |
+
i = static_cast<unsigned char>(f);
|
| 1355 |
+
}
|
| 1356 |
+
return i;
|
| 1357 |
+
}
|
| 1358 |
+
|
| 1359 |
+
/**
|
| 1360 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1361 |
+
* Conversion operator to \p unsigned \p short \p int data type.
|
| 1362 |
+
* Clamps negative inputs to zero.
|
| 1363 |
+
* \p NaN inputs convert to \p zero.
|
| 1364 |
+
*/
|
| 1365 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
|
| 1366 |
+
return __half2ushort_rz(__half(*this));
|
| 1367 |
+
}
|
| 1368 |
+
/**
|
| 1369 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1370 |
+
* Conversion operator to \p unsigned \p int data type.
|
| 1371 |
+
* Clamps negative inputs to zero.
|
| 1372 |
+
* \p NaN inputs convert to \p zero.
|
| 1373 |
+
*/
|
| 1374 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
|
| 1375 |
+
return __half2uint_rz(__half(*this));
|
| 1376 |
+
}
|
| 1377 |
+
/**
|
| 1378 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1379 |
+
* Conversion operator to \p unsigned \p long \p int data type.
|
| 1380 |
+
* Clamps negative and too large inputs to the output range.
|
| 1381 |
+
* \p NaN inputs convert to \p zero if output type is 32-bit.
|
| 1382 |
+
* \p NaN inputs convert to \p 0x8000000000000000ULL if output type is 64-bit.
|
| 1383 |
+
*/
|
| 1384 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long int() const {
|
| 1385 |
+
unsigned long retval;
|
| 1386 |
+
/* Suppress VS warning: warning C4127: conditional expression is constant */
|
| 1387 |
+
#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
|
| 1388 |
+
#pragma warning (push)
|
| 1389 |
+
#pragma warning (disable: 4127)
|
| 1390 |
+
#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
|
| 1391 |
+
if (sizeof(unsigned long) == sizeof(unsigned long long))
|
| 1392 |
+
#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
|
| 1393 |
+
#pragma warning (pop)
|
| 1394 |
+
#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
|
| 1395 |
+
{
|
| 1396 |
+
retval = static_cast<unsigned long>(__half2ull_rz(__half(*this)));
|
| 1397 |
+
}
|
| 1398 |
+
else
|
| 1399 |
+
{
|
| 1400 |
+
retval = static_cast<unsigned long>(__half2uint_rz(__half(*this)));
|
| 1401 |
+
}
|
| 1402 |
+
return retval;
|
| 1403 |
+
}
|
| 1404 |
+
/**
|
| 1405 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1406 |
+
* Conversion operator to \p unsigned \p long \p long \p int data type.
|
| 1407 |
+
* Clamps negative inputs to zero.
|
| 1408 |
+
* \p NaN inputs convert to \p 0x8000000000000000ULL.
|
| 1409 |
+
*/
|
| 1410 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
|
| 1411 |
+
return __half2ull_rz(__half(*this));
|
| 1412 |
+
}
|
| 1413 |
+
|
| 1414 |
+
/**
|
| 1415 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1416 |
+
* Conversion operator to \p signed \p char data type.
|
| 1417 |
+
* Clamps too large inputs to the output range.
|
| 1418 |
+
* \p NaN inputs convert to \p zero.
|
| 1419 |
+
*/
|
| 1420 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
|
| 1421 |
+
signed char i;
|
| 1422 |
+
const float f = float(*this);
|
| 1423 |
+
const signed char max_val = (signed char)0x7FU;
|
| 1424 |
+
const signed char min_val = (signed char)0x80U;
|
| 1425 |
+
const unsigned char bits = (*this).__x;
|
| 1426 |
+
// saturation fixup
|
| 1427 |
+
if ((bits & 0x7FU) == 0x7FU) {
|
| 1428 |
+
// NaN
|
| 1429 |
+
i = 0;
|
| 1430 |
+
} else if (f > static_cast<float>(max_val)) {
|
| 1431 |
+
// saturate maximum
|
| 1432 |
+
i = max_val;
|
| 1433 |
+
} else if (f < static_cast<float>(min_val)) {
|
| 1434 |
+
// saturate minimum
|
| 1435 |
+
i = min_val;
|
| 1436 |
+
} else {
|
| 1437 |
+
// normal value
|
| 1438 |
+
i = static_cast<signed char>(f);
|
| 1439 |
+
}
|
| 1440 |
+
return i;
|
| 1441 |
+
}
|
| 1442 |
+
|
| 1443 |
+
/**
|
| 1444 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1445 |
+
* Conversion operator to an implementation defined \p char data type.
|
| 1446 |
+
*
|
| 1447 |
+
* Detects signedness of the \p char type and proceeds accordingly, see
|
| 1448 |
+
* further details in signed and unsigned char operators.
|
| 1449 |
+
|
| 1450 |
+
* Clamps inputs to the output range.
|
| 1451 |
+
* \p NaN inputs convert to \p zero.
|
| 1452 |
+
*/
|
| 1453 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator char() const {
|
| 1454 |
+
char value;
|
| 1455 |
+
/* Suppress VS warning: warning C4127: conditional expression is constant */
|
| 1456 |
+
#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
|
| 1457 |
+
#pragma warning (push)
|
| 1458 |
+
#pragma warning (disable: 4127)
|
| 1459 |
+
#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
|
| 1460 |
+
if (((char)-1) < (char)0)
|
| 1461 |
+
#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
|
| 1462 |
+
#pragma warning (pop)
|
| 1463 |
+
#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
|
| 1464 |
+
{
|
| 1465 |
+
value = static_cast<char>(static_cast<signed char>(*this));
|
| 1466 |
+
}
|
| 1467 |
+
else
|
| 1468 |
+
{
|
| 1469 |
+
value = static_cast<char>(static_cast<unsigned char>(*this));
|
| 1470 |
+
}
|
| 1471 |
+
return value;
|
| 1472 |
+
}
|
| 1473 |
+
|
| 1474 |
+
/**
|
| 1475 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1476 |
+
* Conversion operator to \p short \p int data type.
|
| 1477 |
+
* \p NaN inputs convert to \p zero.
|
| 1478 |
+
*/
|
| 1479 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
|
| 1480 |
+
return __half2short_rz(__half(*this));
|
| 1481 |
+
}
|
| 1482 |
+
/**
|
| 1483 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1484 |
+
* Conversion operator to \p int data type.
|
| 1485 |
+
* \p NaN inputs convert to \p zero.
|
| 1486 |
+
*/
|
| 1487 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
|
| 1488 |
+
return __half2int_rz(__half(*this));
|
| 1489 |
+
}
|
| 1490 |
+
/**
|
| 1491 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1492 |
+
* Conversion operator to \p long \p int data type.
|
| 1493 |
+
* Clamps too large inputs to the output range.
|
| 1494 |
+
* \p NaN inputs convert to \p zero if output type is 32-bit.
|
| 1495 |
+
* \p NaN inputs convert to \p 0x8000000000000000ULL if output type is 64-bit.
|
| 1496 |
+
*/
|
| 1497 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator long int() const {
|
| 1498 |
+
long retval;
|
| 1499 |
+
/* Suppress VS warning: warning C4127: conditional expression is constant */
|
| 1500 |
+
#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
|
| 1501 |
+
#pragma warning (push)
|
| 1502 |
+
#pragma warning (disable: 4127)
|
| 1503 |
+
#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
|
| 1504 |
+
if (sizeof(long) == sizeof(long long))
|
| 1505 |
+
#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
|
| 1506 |
+
#pragma warning (pop)
|
| 1507 |
+
#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
|
| 1508 |
+
{
|
| 1509 |
+
retval = static_cast<long>(__half2ll_rz(__half(*this)));
|
| 1510 |
+
}
|
| 1511 |
+
else
|
| 1512 |
+
{
|
| 1513 |
+
retval = static_cast<long>(__half2int_rz(__half(*this)));
|
| 1514 |
+
}
|
| 1515 |
+
return retval;
|
| 1516 |
+
}
|
| 1517 |
+
/**
|
| 1518 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1519 |
+
* Conversion operator to \p long \p long \p int data type.
|
| 1520 |
+
* \p NaN inputs convert to \p 0x8000000000000000LL.
|
| 1521 |
+
*/
|
| 1522 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
|
| 1523 |
+
return __half2ll_rz(__half(*this));
|
| 1524 |
+
}
|
| 1525 |
+
|
| 1526 |
+
/**
|
| 1527 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1528 |
+
* Conversion operator to \p bool data type.
|
| 1529 |
+
* +0 and -0 inputs convert to \p false.
|
| 1530 |
+
* Non-zero inputs convert to \p true.
|
| 1531 |
+
*/
|
| 1532 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
|
| 1533 |
+
return (__x & 0x7FU) != 0U;
|
| 1534 |
+
}
|
| 1535 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
|
| 1536 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
|
| 1537 |
+
};
|
| 1538 |
+
|
| 1539 |
+
/**
|
| 1540 |
+
* \defgroup CUDA_MATH_FP8X2_E4M3_STRUCT C++ struct for handling vector type of two fp8 values of e4m3 kind.
|
| 1541 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 1542 |
+
*/
|
| 1543 |
+
|
| 1544 |
+
/**
|
| 1545 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1546 |
+
* \brief __nv_fp8x2_e4m3 datatype
|
| 1547 |
+
*
|
| 1548 |
+
* \details This structure implements the datatype for storage
|
| 1549 |
+
* and operations on the vector of two \p fp8 values of \p e4m3 kind each:
|
| 1550 |
+
* with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
|
| 1551 |
+
* The encoding doesn't support Infinity.
|
| 1552 |
+
* NaNs are limited to 0x7F and 0xFF values.
|
| 1553 |
+
*/
|
| 1554 |
+
struct __CUDA_ALIGN__(2) __nv_fp8x2_e4m3 {
|
| 1555 |
+
public:
|
| 1556 |
+
/**
|
| 1557 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1558 |
+
* Storage variable contains the vector of two \p fp8 floating-point data
|
| 1559 |
+
* values.
|
| 1560 |
+
*/
|
| 1561 |
+
__nv_fp8x2_storage_t __x;
|
| 1562 |
+
|
| 1563 |
+
/**
|
| 1564 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1565 |
+
* Constructor by default.
|
| 1566 |
+
*/
|
| 1567 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 1568 |
+
__nv_fp8x2_e4m3() = default;
|
| 1569 |
+
#else
|
| 1570 |
+
__CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3() {}
|
| 1571 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 1572 |
+
|
| 1573 |
+
#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
|
| 1574 |
+
|
| 1575 |
+
/* Construct from wider types */
|
| 1576 |
+
|
| 1577 |
+
/**
|
| 1578 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1579 |
+
* Constructor from \p __half2 data type, relies on \p __NV_SATFINITE
|
| 1580 |
+
* behavior for out-of-range values.
|
| 1581 |
+
*/
|
| 1582 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __half2 f) {
|
| 1583 |
+
__x = __nv_cvt_halfraw2_to_fp8x2(static_cast<__half2_raw>(f),
|
| 1584 |
+
__NV_SATFINITE, __NV_E4M3);
|
| 1585 |
+
}
|
| 1586 |
+
/**
|
| 1587 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1588 |
+
* Constructor from \p __nv_bfloat162 data type, relies on \p __NV_SATFINITE
|
| 1589 |
+
* behavior for out-of-range values.
|
| 1590 |
+
*/
|
| 1591 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __nv_bfloat162 f) {
|
| 1592 |
+
__x = __nv_cvt_bfloat16raw2_to_fp8x2(static_cast<__nv_bfloat162_raw>(f),
|
| 1593 |
+
__NV_SATFINITE, __NV_E4M3);
|
| 1594 |
+
}
|
| 1595 |
+
/**
|
| 1596 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1597 |
+
* Constructor from \p float2 data type, relies on \p __NV_SATFINITE
|
| 1598 |
+
* behavior for out-of-range values.
|
| 1599 |
+
*/
|
| 1600 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const float2 f) {
|
| 1601 |
+
__x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
|
| 1602 |
+
}
|
| 1603 |
+
/**
|
| 1604 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1605 |
+
* Constructor from \p double2 data type, relies on \p __NV_SATFINITE
|
| 1606 |
+
* behavior for out-of-range values.
|
| 1607 |
+
*/
|
| 1608 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const double2 f) {
|
| 1609 |
+
__x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
|
| 1610 |
+
}
|
| 1611 |
+
|
| 1612 |
+
#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
|
| 1613 |
+
/* Widening converts */
|
| 1614 |
+
/**
|
| 1615 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1616 |
+
* Conversion operator to \p __half2 data type.
|
| 1617 |
+
*/
|
| 1618 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
|
| 1619 |
+
return static_cast<__half2>(__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3));
|
| 1620 |
+
}
|
| 1621 |
+
/**
|
| 1622 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1623 |
+
* Conversion operator to \p float2 data type.
|
| 1624 |
+
*/
|
| 1625 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
|
| 1626 |
+
return __internal_halfraw2_to_float2(
|
| 1627 |
+
__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3));
|
| 1628 |
+
}
|
| 1629 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
|
| 1630 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
|
| 1631 |
+
};
|
| 1632 |
+
|
| 1633 |
+
/**
|
| 1634 |
+
* \defgroup CUDA_MATH_FP8X4_E4M3_STRUCT C++ struct for handling vector type of four fp8 values of e4m3 kind.
|
| 1635 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 1636 |
+
*/
|
| 1637 |
+
|
| 1638 |
+
/**
|
| 1639 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1640 |
+
* \brief __nv_fp8x4_e4m3 datatype
|
| 1641 |
+
*
|
| 1642 |
+
* \details This structure implements the datatype for storage
|
| 1643 |
+
* and operations on the vector of four \p fp8 values of \p e4m3 kind each:
|
| 1644 |
+
* with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
|
| 1645 |
+
* The encoding doesn't support Infinity.
|
| 1646 |
+
* NaNs are limited to 0x7F and 0xFF values.
|
| 1647 |
+
*/
|
| 1648 |
+
struct __CUDA_ALIGN__(4) __nv_fp8x4_e4m3 {
|
| 1649 |
+
public:
|
| 1650 |
+
/**
|
| 1651 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1652 |
+
* Storage variable contains the vector of four \p fp8 floating-point data
|
| 1653 |
+
* values.
|
| 1654 |
+
*/
|
| 1655 |
+
__nv_fp8x4_storage_t __x;
|
| 1656 |
+
|
| 1657 |
+
/**
|
| 1658 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1659 |
+
* Constructor by default.
|
| 1660 |
+
*/
|
| 1661 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 1662 |
+
__nv_fp8x4_e4m3() = default;
|
| 1663 |
+
#else
|
| 1664 |
+
__CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3() {}
|
| 1665 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 1666 |
+
|
| 1667 |
+
#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
|
| 1668 |
+
|
| 1669 |
+
/* Construct from wider types */
|
| 1670 |
+
|
| 1671 |
+
/**
|
| 1672 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1673 |
+
* Constructor from a pair of \p __half2 data type values,
|
| 1674 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1675 |
+
*/
|
| 1676 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __half2 flo,
|
| 1677 |
+
const __half2 fhi) {
|
| 1678 |
+
const __nv_fp8x2_storage_t rlo = __nv_cvt_halfraw2_to_fp8x2(
|
| 1679 |
+
static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E4M3);
|
| 1680 |
+
const __nv_fp8x2_storage_t rhi = __nv_cvt_halfraw2_to_fp8x2(
|
| 1681 |
+
static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E4M3);
|
| 1682 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1683 |
+
}
|
| 1684 |
+
/**
|
| 1685 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1686 |
+
* Constructor from a pair of \p __nv_bfloat162 data type values,
|
| 1687 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1688 |
+
*/
|
| 1689 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __nv_bfloat162 flo,
|
| 1690 |
+
const __nv_bfloat162 fhi) {
|
| 1691 |
+
const __nv_fp8x2_storage_t rlo = __nv_cvt_bfloat16raw2_to_fp8x2(
|
| 1692 |
+
static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E4M3);
|
| 1693 |
+
const __nv_fp8x2_storage_t rhi = __nv_cvt_bfloat16raw2_to_fp8x2(
|
| 1694 |
+
static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E4M3);
|
| 1695 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1696 |
+
}
|
| 1697 |
+
/**
|
| 1698 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1699 |
+
* Constructor from \p float4 vector data type,
|
| 1700 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1701 |
+
*/
|
| 1702 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const float4 f) {
|
| 1703 |
+
const float2 flo = {f.x, f.y};
|
| 1704 |
+
const float2 fhi = {f.z, f.w};
|
| 1705 |
+
const __nv_fp8x2_storage_t rlo =
|
| 1706 |
+
__nv_cvt_float2_to_fp8x2(flo, __NV_SATFINITE, __NV_E4M3);
|
| 1707 |
+
const __nv_fp8x2_storage_t rhi =
|
| 1708 |
+
__nv_cvt_float2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E4M3);
|
| 1709 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1710 |
+
}
|
| 1711 |
+
/**
|
| 1712 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1713 |
+
* Constructor from \p double4 vector data type,
|
| 1714 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1715 |
+
*/
|
| 1716 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const double4 f) {
|
| 1717 |
+
const double2 flo = {f.x, f.y};
|
| 1718 |
+
const double2 fhi = {f.z, f.w};
|
| 1719 |
+
const __nv_fp8x2_storage_t rlo =
|
| 1720 |
+
__nv_cvt_double2_to_fp8x2(flo, __NV_SATFINITE, __NV_E4M3);
|
| 1721 |
+
const __nv_fp8x2_storage_t rhi =
|
| 1722 |
+
__nv_cvt_double2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E4M3);
|
| 1723 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1724 |
+
}
|
| 1725 |
+
|
| 1726 |
+
#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
|
| 1727 |
+
/* Widening converts */
|
| 1728 |
+
|
| 1729 |
+
/**
|
| 1730 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1731 |
+
* Conversion operator to \p float4 vector data type.
|
| 1732 |
+
*/
|
| 1733 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
|
| 1734 |
+
const __nv_fp8x2_storage_t slo = static_cast<__nv_fp8x2_storage_t>(__x);
|
| 1735 |
+
const __nv_fp8x2_storage_t shi =
|
| 1736 |
+
static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
|
| 1737 |
+
float2 rlo = __internal_halfraw2_to_float2(
|
| 1738 |
+
__nv_cvt_fp8x2_to_halfraw2(slo, __NV_E4M3));
|
| 1739 |
+
float2 rhi = __internal_halfraw2_to_float2(
|
| 1740 |
+
__nv_cvt_fp8x2_to_halfraw2(shi, __NV_E4M3));
|
| 1741 |
+
float4 res = {rlo.x, rlo.y, rhi.x, rhi.y};
|
| 1742 |
+
return res;
|
| 1743 |
+
}
|
| 1744 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
|
| 1745 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
|
| 1746 |
+
};
|
| 1747 |
+
|
| 1748 |
+
#endif /* defined(__cplusplus) */
|
| 1749 |
+
|
| 1750 |
+
#endif /* end of include guard: __CUDA_FP8_HPP__ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_gl_interop.h
ADDED
|
@@ -0,0 +1,514 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_GL_INTEROP_H__)
|
| 51 |
+
#define __CUDA_GL_INTEROP_H__
|
| 52 |
+
|
| 53 |
+
#include "cuda_runtime_api.h"
|
| 54 |
+
|
| 55 |
+
#if defined(__APPLE__)
|
| 56 |
+
|
| 57 |
+
#include <OpenGL/gl.h>
|
| 58 |
+
|
| 59 |
+
#else /* __APPLE__ */
|
| 60 |
+
|
| 61 |
+
#if defined(__arm__) || defined(__aarch64__)
|
| 62 |
+
#ifndef GL_VERSION
|
| 63 |
+
#error Please include the appropriate gl headers before including cuda_gl_interop.h
|
| 64 |
+
#endif
|
| 65 |
+
#else
|
| 66 |
+
#include <GL/gl.h>
|
| 67 |
+
#endif
|
| 68 |
+
|
| 69 |
+
#endif /* __APPLE__ */
|
| 70 |
+
|
| 71 |
+
/** \cond impl_private */
|
| 72 |
+
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
|
| 73 |
+
#define __CUDA_DEPRECATED
|
| 74 |
+
#elif defined(_MSC_VER)
|
| 75 |
+
#define __CUDA_DEPRECATED __declspec(deprecated)
|
| 76 |
+
#elif defined(__GNUC__)
|
| 77 |
+
#define __CUDA_DEPRECATED __attribute__((deprecated))
|
| 78 |
+
#else
|
| 79 |
+
#define __CUDA_DEPRECATED
|
| 80 |
+
#endif
|
| 81 |
+
/** \endcond impl_private */
|
| 82 |
+
|
| 83 |
+
#if defined(__cplusplus)
|
| 84 |
+
extern "C" {
|
| 85 |
+
#endif /* __cplusplus */
|
| 86 |
+
|
| 87 |
+
/**
|
| 88 |
+
* \addtogroup CUDART_OPENGL OpenGL Interoperability
|
| 89 |
+
* This section describes the OpenGL interoperability functions of the CUDA
|
| 90 |
+
* runtime application programming interface. Note that mapping of OpenGL
|
| 91 |
+
* resources is performed with the graphics API agnostic, resource mapping
|
| 92 |
+
* interface described in \ref CUDART_INTEROP "Graphics Interoperability".
|
| 93 |
+
*
|
| 94 |
+
* @{
|
| 95 |
+
*/
|
| 96 |
+
|
| 97 |
+
/**
|
| 98 |
+
* CUDA devices corresponding to the current OpenGL context
|
| 99 |
+
*/
|
| 100 |
+
enum cudaGLDeviceList
|
| 101 |
+
{
|
| 102 |
+
cudaGLDeviceListAll = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */
|
| 103 |
+
cudaGLDeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
|
| 104 |
+
cudaGLDeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
|
| 105 |
+
};
|
| 106 |
+
|
| 107 |
+
/**
|
| 108 |
+
* \brief Gets the CUDA devices associated with the current OpenGL context
|
| 109 |
+
*
|
| 110 |
+
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
|
| 111 |
+
* corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
|
| 112 |
+
* at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
|
| 113 |
+
* the current OpenGL context. If any of the GPUs being used by the current OpenGL
|
| 114 |
+
* context are not CUDA capable then the call will return ::cudaErrorNoDevice.
|
| 115 |
+
*
|
| 116 |
+
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the
|
| 117 |
+
* current OpenGL context
|
| 118 |
+
* \param pCudaDevices - Returned CUDA devices corresponding to the current
|
| 119 |
+
* OpenGL context
|
| 120 |
+
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
|
| 121 |
+
* \param deviceList - The set of devices to return. This set may be
|
| 122 |
+
* ::cudaGLDeviceListAll for all devices,
|
| 123 |
+
* ::cudaGLDeviceListCurrentFrame for the devices used to
|
| 124 |
+
* render the current frame (in SLI), or
|
| 125 |
+
* ::cudaGLDeviceListNextFrame for the devices used to
|
| 126 |
+
* render the next frame (in SLI).
|
| 127 |
+
*
|
| 128 |
+
* \return
|
| 129 |
+
* ::cudaSuccess,
|
| 130 |
+
* ::cudaErrorNoDevice,
|
| 131 |
+
* ::cudaErrorInvalidGraphicsContext,
|
| 132 |
+
* ::cudaErrorOperatingSystem,
|
| 133 |
+
* ::cudaErrorUnknown
|
| 134 |
+
*
|
| 135 |
+
* \note This function is not supported on Mac OS X.
|
| 136 |
+
* \notefnerr
|
| 137 |
+
*
|
| 138 |
+
* \sa
|
| 139 |
+
* ::cudaGraphicsUnregisterResource,
|
| 140 |
+
* ::cudaGraphicsMapResources,
|
| 141 |
+
* ::cudaGraphicsSubResourceGetMappedArray,
|
| 142 |
+
* ::cudaGraphicsResourceGetMappedPointer,
|
| 143 |
+
* ::cuGLGetDevices
|
| 144 |
+
*/
|
| 145 |
+
extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList);
|
| 146 |
+
|
| 147 |
+
/**
|
| 148 |
+
* \brief Register an OpenGL texture or renderbuffer object
|
| 149 |
+
*
|
| 150 |
+
* Registers the texture or renderbuffer object specified by \p image for access by CUDA.
|
| 151 |
+
* A handle to the registered object is returned as \p resource.
|
| 152 |
+
*
|
| 153 |
+
* \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
|
| 154 |
+
* ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
|
| 155 |
+
* or ::GL_RENDERBUFFER.
|
| 156 |
+
*
|
| 157 |
+
* The register flags \p flags specify the intended usage, as follows:
|
| 158 |
+
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
|
| 159 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 160 |
+
* read from and written to by CUDA. This is the default value.
|
| 161 |
+
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
|
| 162 |
+
* will not write to this resource.
|
| 163 |
+
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
|
| 164 |
+
* CUDA will not read from this resource and will write over the
|
| 165 |
+
* entire contents of the resource, so none of the data previously
|
| 166 |
+
* stored in the resource will be preserved.
|
| 167 |
+
* - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
|
| 168 |
+
* bind this resource to a surface reference.
|
| 169 |
+
* - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
|
| 170 |
+
* texture gather operations on this resource.
|
| 171 |
+
*
|
| 172 |
+
* The following image formats are supported. For brevity's sake, the list is abbreviated.
|
| 173 |
+
* For example, {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
|
| 174 |
+
* {GL_R8, GL_R16, GL_RG8, GL_RG16} :
|
| 175 |
+
* - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
|
| 176 |
+
* - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
|
| 177 |
+
* - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
|
| 178 |
+
* {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
|
| 179 |
+
*
|
| 180 |
+
* The following image classes are currently disallowed:
|
| 181 |
+
* - Textures with borders
|
| 182 |
+
* - Multisampled renderbuffers
|
| 183 |
+
*
|
| 184 |
+
* \param resource - Pointer to the returned object handle
|
| 185 |
+
* \param image - name of texture or renderbuffer object to be registered
|
| 186 |
+
* \param target - Identifies the type of object specified by \p image
|
| 187 |
+
* \param flags - Register flags
|
| 188 |
+
*
|
| 189 |
+
* \return
|
| 190 |
+
* ::cudaSuccess,
|
| 191 |
+
* ::cudaErrorInvalidDevice,
|
| 192 |
+
* ::cudaErrorInvalidValue,
|
| 193 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 194 |
+
* ::cudaErrorOperatingSystem,
|
| 195 |
+
* ::cudaErrorUnknown
|
| 196 |
+
* \notefnerr
|
| 197 |
+
*
|
| 198 |
+
* \sa
|
| 199 |
+
* ::cudaGraphicsUnregisterResource,
|
| 200 |
+
* ::cudaGraphicsMapResources,
|
| 201 |
+
* ::cudaGraphicsSubResourceGetMappedArray,
|
| 202 |
+
* ::cuGraphicsGLRegisterImage
|
| 203 |
+
*/
|
| 204 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags);
|
| 205 |
+
|
| 206 |
+
/**
|
| 207 |
+
* \brief Registers an OpenGL buffer object
|
| 208 |
+
*
|
| 209 |
+
* Registers the buffer object specified by \p buffer for access by
|
| 210 |
+
* CUDA. A handle to the registered object is returned as \p
|
| 211 |
+
* resource. The register flags \p flags specify the intended usage,
|
| 212 |
+
* as follows:
|
| 213 |
+
*
|
| 214 |
+
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
|
| 215 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 216 |
+
* read from and written to by CUDA. This is the default value.
|
| 217 |
+
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
|
| 218 |
+
* will not write to this resource.
|
| 219 |
+
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
|
| 220 |
+
* CUDA will not read from this resource and will write over the
|
| 221 |
+
* entire contents of the resource, so none of the data previously
|
| 222 |
+
* stored in the resource will be preserved.
|
| 223 |
+
*
|
| 224 |
+
* \param resource - Pointer to the returned object handle
|
| 225 |
+
* \param buffer - name of buffer object to be registered
|
| 226 |
+
* \param flags - Register flags
|
| 227 |
+
*
|
| 228 |
+
* \return
|
| 229 |
+
* ::cudaSuccess,
|
| 230 |
+
* ::cudaErrorInvalidDevice,
|
| 231 |
+
* ::cudaErrorInvalidValue,
|
| 232 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 233 |
+
* ::cudaErrorOperatingSystem,
|
| 234 |
+
* ::cudaErrorUnknown
|
| 235 |
+
* \notefnerr
|
| 236 |
+
*
|
| 237 |
+
* \sa
|
| 238 |
+
* ::cudaGraphicsUnregisterResource,
|
| 239 |
+
* ::cudaGraphicsMapResources,
|
| 240 |
+
* ::cudaGraphicsResourceGetMappedPointer,
|
| 241 |
+
* ::cuGraphicsGLRegisterBuffer
|
| 242 |
+
*/
|
| 243 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags);
|
| 244 |
+
|
| 245 |
+
#ifdef _WIN32
|
| 246 |
+
#ifndef WGL_NV_gpu_affinity
|
| 247 |
+
typedef void* HGPUNV;
|
| 248 |
+
#endif
|
| 249 |
+
|
| 250 |
+
/**
|
| 251 |
+
* \brief Gets the CUDA device associated with hGpu
|
| 252 |
+
*
|
| 253 |
+
* Returns the CUDA device associated with a hGpu, if applicable.
|
| 254 |
+
*
|
| 255 |
+
* \param device - Returns the device associated with hGpu, or -1 if hGpu is
|
| 256 |
+
* not a compute device.
|
| 257 |
+
* \param hGpu - Handle to a GPU, as queried via WGL_NV_gpu_affinity
|
| 258 |
+
*
|
| 259 |
+
* \return
|
| 260 |
+
* ::cudaSuccess
|
| 261 |
+
* \notefnerr
|
| 262 |
+
*
|
| 263 |
+
* \sa
|
| 264 |
+
* ::WGL_NV_gpu_affinity,
|
| 265 |
+
* ::cuWGLGetDevice
|
| 266 |
+
*/
|
| 267 |
+
extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
|
| 268 |
+
#endif
|
| 269 |
+
|
| 270 |
+
/** @} */ /* END CUDART_OPENGL */
|
| 271 |
+
|
| 272 |
+
/**
|
| 273 |
+
* \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED]
|
| 274 |
+
* This section describes deprecated OpenGL interoperability functionality.
|
| 275 |
+
*
|
| 276 |
+
* @{
|
| 277 |
+
*/
|
| 278 |
+
|
| 279 |
+
/**
|
| 280 |
+
* CUDA GL Map Flags
|
| 281 |
+
*/
|
| 282 |
+
enum cudaGLMapFlags
|
| 283 |
+
{
|
| 284 |
+
cudaGLMapFlagsNone = 0, /**< Default; Assume resource can be read/written */
|
| 285 |
+
cudaGLMapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */
|
| 286 |
+
cudaGLMapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */
|
| 287 |
+
};
|
| 288 |
+
|
| 289 |
+
/**
|
| 290 |
+
* \brief Sets a CUDA device to use OpenGL interoperability
|
| 291 |
+
*
|
| 292 |
+
* \deprecated This function is deprecated as of CUDA 5.0.
|
| 293 |
+
*
|
| 294 |
+
* This function is deprecated and should no longer be used. It is
|
| 295 |
+
* no longer necessary to associate a CUDA device with an OpenGL
|
| 296 |
+
* context in order to achieve maximum interoperability performance.
|
| 297 |
+
*
|
| 298 |
+
* This function will immediately initialize the primary context on
|
| 299 |
+
* \p device if needed.
|
| 300 |
+
*
|
| 301 |
+
* \param device - Device to use for OpenGL interoperability
|
| 302 |
+
*
|
| 303 |
+
* \return
|
| 304 |
+
* ::cudaSuccess,
|
| 305 |
+
* ::cudaErrorInvalidDevice,
|
| 306 |
+
* ::cudaErrorSetOnActiveProcess
|
| 307 |
+
* \notefnerr
|
| 308 |
+
*
|
| 309 |
+
* \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage
|
| 310 |
+
*/
|
| 311 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
|
| 312 |
+
|
| 313 |
+
/**
|
| 314 |
+
* \brief Registers a buffer object for access by CUDA
|
| 315 |
+
*
|
| 316 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 317 |
+
*
|
| 318 |
+
* Registers the buffer object of ID \p bufObj for access by
|
| 319 |
+
* CUDA. This function must be called before CUDA can map the buffer
|
| 320 |
+
* object. The OpenGL context used to create the buffer, or another
|
| 321 |
+
* context from the same share group, must be bound to the current
|
| 322 |
+
* thread when this is called.
|
| 323 |
+
*
|
| 324 |
+
* \param bufObj - Buffer object ID to register
|
| 325 |
+
*
|
| 326 |
+
* \return
|
| 327 |
+
* ::cudaSuccess,
|
| 328 |
+
* ::cudaErrorInitializationError
|
| 329 |
+
* \notefnerr
|
| 330 |
+
*
|
| 331 |
+
* \sa ::cudaGraphicsGLRegisterBuffer
|
| 332 |
+
*/
|
| 333 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
|
| 334 |
+
|
| 335 |
+
/**
|
| 336 |
+
* \brief Maps a buffer object for access by CUDA
|
| 337 |
+
*
|
| 338 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 339 |
+
*
|
| 340 |
+
* Maps the buffer object of ID \p bufObj into the address space of
|
| 341 |
+
* CUDA and returns in \p *devPtr the base pointer of the resulting
|
| 342 |
+
* mapping. The buffer must have previously been registered by
|
| 343 |
+
* calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
|
| 344 |
+
* by CUDA, any OpenGL operation which references the buffer will
|
| 345 |
+
* result in undefined behavior. The OpenGL context used to create
|
| 346 |
+
* the buffer, or another context from the same share group, must be
|
| 347 |
+
* bound to the current thread when this is called.
|
| 348 |
+
*
|
| 349 |
+
* All streams in the current thread are synchronized with the current
|
| 350 |
+
* GL context.
|
| 351 |
+
*
|
| 352 |
+
* \param devPtr - Returned device pointer to CUDA object
|
| 353 |
+
* \param bufObj - Buffer object ID to map
|
| 354 |
+
*
|
| 355 |
+
* \return
|
| 356 |
+
* ::cudaSuccess,
|
| 357 |
+
* ::cudaErrorMapBufferObjectFailed
|
| 358 |
+
* \notefnerr
|
| 359 |
+
*
|
| 360 |
+
* \sa ::cudaGraphicsMapResources
|
| 361 |
+
*/
|
| 362 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
|
| 363 |
+
|
| 364 |
+
/**
|
| 365 |
+
* \brief Unmaps a buffer object for access by CUDA
|
| 366 |
+
*
|
| 367 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 368 |
+
*
|
| 369 |
+
* Unmaps the buffer object of ID \p bufObj for access by CUDA. When
|
| 370 |
+
* a buffer is unmapped, the base address returned by
|
| 371 |
+
* ::cudaGLMapBufferObject() is invalid and subsequent references to
|
| 372 |
+
* the address result in undefined behavior. The OpenGL context used
|
| 373 |
+
* to create the buffer, or another context from the same share group,
|
| 374 |
+
* must be bound to the current thread when this is called.
|
| 375 |
+
*
|
| 376 |
+
* All streams in the current thread are synchronized with the current
|
| 377 |
+
* GL context.
|
| 378 |
+
*
|
| 379 |
+
* \param bufObj - Buffer object to unmap
|
| 380 |
+
*
|
| 381 |
+
* \return
|
| 382 |
+
* ::cudaSuccess,
|
| 383 |
+
* ::cudaErrorUnmapBufferObjectFailed
|
| 384 |
+
* \notefnerr
|
| 385 |
+
*
|
| 386 |
+
* \sa ::cudaGraphicsUnmapResources
|
| 387 |
+
*/
|
| 388 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
|
| 389 |
+
|
| 390 |
+
/**
|
| 391 |
+
* \brief Unregisters a buffer object for access by CUDA
|
| 392 |
+
*
|
| 393 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 394 |
+
*
|
| 395 |
+
* Unregisters the buffer object of ID \p bufObj for access by CUDA
|
| 396 |
+
* and releases any CUDA resources associated with the buffer. Once a
|
| 397 |
+
* buffer is unregistered, it may no longer be mapped by CUDA. The GL
|
| 398 |
+
* context used to create the buffer, or another context from the
|
| 399 |
+
* same share group, must be bound to the current thread when this is
|
| 400 |
+
* called.
|
| 401 |
+
*
|
| 402 |
+
* \param bufObj - Buffer object to unregister
|
| 403 |
+
*
|
| 404 |
+
* \return
|
| 405 |
+
* ::cudaSuccess
|
| 406 |
+
* \notefnerr
|
| 407 |
+
*
|
| 408 |
+
* \sa ::cudaGraphicsUnregisterResource
|
| 409 |
+
*/
|
| 410 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
|
| 411 |
+
|
| 412 |
+
/**
|
| 413 |
+
* \brief Set usage flags for mapping an OpenGL buffer
|
| 414 |
+
*
|
| 415 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 416 |
+
*
|
| 417 |
+
* Set flags for mapping the OpenGL buffer \p bufObj
|
| 418 |
+
*
|
| 419 |
+
* Changes to flags will take effect the next time \p bufObj is mapped.
|
| 420 |
+
* The \p flags argument may be any of the following:
|
| 421 |
+
*
|
| 422 |
+
* - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will
|
| 423 |
+
* be used. It is therefore assumed that this buffer will be read from and
|
| 424 |
+
* written to by CUDA kernels. This is the default value.
|
| 425 |
+
* - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this
|
| 426 |
+
* buffer will not write to the buffer.
|
| 427 |
+
* - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access
|
| 428 |
+
* this buffer will not read from the buffer and will write over the
|
| 429 |
+
* entire contents of the buffer, so none of the data previously stored in
|
| 430 |
+
* the buffer will be preserved.
|
| 431 |
+
*
|
| 432 |
+
* If \p bufObj has not been registered for use with CUDA, then
|
| 433 |
+
* ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently
|
| 434 |
+
* mapped for access by CUDA, then ::cudaErrorUnknown is returned.
|
| 435 |
+
*
|
| 436 |
+
* \param bufObj - Registered buffer object to set flags for
|
| 437 |
+
* \param flags - Parameters for buffer mapping
|
| 438 |
+
*
|
| 439 |
+
* \return
|
| 440 |
+
* ::cudaSuccess,
|
| 441 |
+
* ::cudaErrorInvalidValue,
|
| 442 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 443 |
+
* ::cudaErrorUnknown
|
| 444 |
+
* \notefnerr
|
| 445 |
+
*
|
| 446 |
+
* \sa ::cudaGraphicsResourceSetMapFlags
|
| 447 |
+
*/
|
| 448 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags);
|
| 449 |
+
|
| 450 |
+
/**
|
| 451 |
+
* \brief Maps a buffer object for access by CUDA
|
| 452 |
+
*
|
| 453 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 454 |
+
*
|
| 455 |
+
* Maps the buffer object of ID \p bufObj into the address space of
|
| 456 |
+
* CUDA and returns in \p *devPtr the base pointer of the resulting
|
| 457 |
+
* mapping. The buffer must have previously been registered by
|
| 458 |
+
* calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
|
| 459 |
+
* by CUDA, any OpenGL operation which references the buffer will
|
| 460 |
+
* result in undefined behavior. The OpenGL context used to create
|
| 461 |
+
* the buffer, or another context from the same share group, must be
|
| 462 |
+
* bound to the current thread when this is called.
|
| 463 |
+
*
|
| 464 |
+
* Stream \p stream is synchronized with the current GL context.
|
| 465 |
+
*
|
| 466 |
+
* \param devPtr - Returned device pointer to CUDA object
|
| 467 |
+
* \param bufObj - Buffer object ID to map
|
| 468 |
+
* \param stream - Stream to synchronize
|
| 469 |
+
*
|
| 470 |
+
* \return
|
| 471 |
+
* ::cudaSuccess,
|
| 472 |
+
* ::cudaErrorMapBufferObjectFailed
|
| 473 |
+
* \notefnerr
|
| 474 |
+
*
|
| 475 |
+
* \sa ::cudaGraphicsMapResources
|
| 476 |
+
*/
|
| 477 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream);
|
| 478 |
+
|
| 479 |
+
/**
|
| 480 |
+
* \brief Unmaps a buffer object for access by CUDA
|
| 481 |
+
*
|
| 482 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 483 |
+
*
|
| 484 |
+
* Unmaps the buffer object of ID \p bufObj for access by CUDA. When
|
| 485 |
+
* a buffer is unmapped, the base address returned by
|
| 486 |
+
* ::cudaGLMapBufferObject() is invalid and subsequent references to
|
| 487 |
+
* the address result in undefined behavior. The OpenGL context used
|
| 488 |
+
* to create the buffer, or another context from the same share group,
|
| 489 |
+
* must be bound to the current thread when this is called.
|
| 490 |
+
*
|
| 491 |
+
* Stream \p stream is synchronized with the current GL context.
|
| 492 |
+
*
|
| 493 |
+
* \param bufObj - Buffer object to unmap
|
| 494 |
+
* \param stream - Stream to synchronize
|
| 495 |
+
*
|
| 496 |
+
* \return
|
| 497 |
+
* ::cudaSuccess,
|
| 498 |
+
* ::cudaErrorUnmapBufferObjectFailed
|
| 499 |
+
* \notefnerr
|
| 500 |
+
*
|
| 501 |
+
* \sa ::cudaGraphicsUnmapResources
|
| 502 |
+
*/
|
| 503 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream);
|
| 504 |
+
|
| 505 |
+
/** @} */ /* END CUDART_OPENGL_DEPRECATED */
|
| 506 |
+
|
| 507 |
+
#if defined(__cplusplus)
|
| 508 |
+
}
|
| 509 |
+
#endif /* __cplusplus */
|
| 510 |
+
|
| 511 |
+
#undef __CUDA_DEPRECATED
|
| 512 |
+
|
| 513 |
+
#endif /* __CUDA_GL_INTEROP_H__ */
|
| 514 |
+
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h
ADDED
|
@@ -0,0 +1,1958 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/**
|
| 51 |
+
* CUDA Occupancy Calculator
|
| 52 |
+
*
|
| 53 |
+
* NAME
|
| 54 |
+
*
|
| 55 |
+
* cudaOccMaxActiveBlocksPerMultiprocessor,
|
| 56 |
+
* cudaOccMaxPotentialOccupancyBlockSize,
|
| 57 |
+
* cudaOccMaxPotentialOccupancyBlockSizeVariableSMem
|
| 58 |
+
* cudaOccAvailableDynamicSMemPerBlock
|
| 59 |
+
*
|
| 60 |
+
* DESCRIPTION
|
| 61 |
+
*
|
| 62 |
+
* The CUDA occupancy calculator provides a standalone, programmatic
|
| 63 |
+
* interface to compute the occupancy of a function on a device. It can also
|
| 64 |
+
* provide occupancy-oriented launch configuration suggestions.
|
| 65 |
+
*
|
| 66 |
+
* The function and device are defined by the user through
|
| 67 |
+
* cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState
|
| 68 |
+
* structures. All APIs require all 3 of them.
|
| 69 |
+
*
|
| 70 |
+
* See the structure definition for more details about the device / function
|
| 71 |
+
* descriptors.
|
| 72 |
+
*
|
| 73 |
+
* See each API's prototype for API usage.
|
| 74 |
+
*
|
| 75 |
+
* COMPATIBILITY
|
| 76 |
+
*
|
| 77 |
+
* The occupancy calculator will be updated on each major CUDA toolkit
|
| 78 |
+
* release. It does not provide forward compatibility, i.e. new hardware
|
| 79 |
+
* released after this implementation's release will not be supported.
|
| 80 |
+
*
|
| 81 |
+
* NOTE
|
| 82 |
+
*
|
| 83 |
+
* If there is access to CUDA runtime, and the sole intent is to calculate
|
| 84 |
+
* occupancy related values on one of the accessible CUDA devices, using CUDA
|
| 85 |
+
* runtime's occupancy calculation APIs is recommended.
|
| 86 |
+
*
|
| 87 |
+
*/
|
| 88 |
+
|
| 89 |
+
#ifndef __cuda_occupancy_h__
|
| 90 |
+
#define __cuda_occupancy_h__
|
| 91 |
+
|
| 92 |
+
#include <stddef.h>
|
| 93 |
+
#include <limits.h>
|
| 94 |
+
#include <string.h>
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
// __OCC_INLINE will be undefined at the end of this header
|
| 98 |
+
//
|
| 99 |
+
#ifdef __CUDACC__
|
| 100 |
+
#define __OCC_INLINE inline __host__ __device__
|
| 101 |
+
#elif defined _MSC_VER
|
| 102 |
+
#define __OCC_INLINE __inline
|
| 103 |
+
#else // GNUCC assumed
|
| 104 |
+
#define __OCC_INLINE inline
|
| 105 |
+
#endif
|
| 106 |
+
|
| 107 |
+
/**
 * Status codes returned by the occupancy calculator functions.
 *
 * The last enumerator deliberately has no trailing comma: a trailing comma in
 * an enumerator list is rejected by pedantic C90 / C++03 compilers, and the
 * other enums in this header do not use one either.
 */
enum cudaOccError_enum {
    CUDA_OCC_SUCCESS              = 0,  // no error encountered
    CUDA_OCC_ERROR_INVALID_INPUT  = 1,  // input parameter is invalid
    CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2   // requested device is not supported in
                                        // current implementation or device is
                                        // invalid
};
|
| 114 |
+
typedef enum cudaOccError_enum cudaOccError;
|
| 115 |
+
|
| 116 |
+
typedef struct cudaOccResult cudaOccResult;
|
| 117 |
+
typedef struct cudaOccDeviceProp cudaOccDeviceProp;
|
| 118 |
+
typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
|
| 119 |
+
typedef struct cudaOccDeviceState cudaOccDeviceState;
|
| 120 |
+
|
| 121 |
+
/**
|
| 122 |
+
* The CUDA occupancy calculator computes the occupancy of the function
|
| 123 |
+
* described by attributes with the given block size (blockSize), static device
|
| 124 |
+
* properties (properties), dynamic device state (state) and per-block dynamic
|
| 125 |
+
* shared memory allocation (dynamicSmemSize) in bytes, and outputs it through
|
| 126 |
+
* result along with other useful information. The occupancy is computed in
|
| 127 |
+
* terms of the maximum number of active blocks per multiprocessor. The user can
|
| 128 |
+
* then convert it to other metrics, such as number of active warps.
|
| 129 |
+
*
|
| 130 |
+
* RETURN VALUE
|
| 131 |
+
*
|
| 132 |
+
* The occupancy and related information is returned through result.
|
| 133 |
+
*
|
| 134 |
+
* If result->activeBlocksPerMultiprocessor is 0, then the given parameter
|
| 135 |
+
* combination cannot run on the device.
|
| 136 |
+
*
|
| 137 |
+
* ERRORS
|
| 138 |
+
*
|
| 139 |
+
* CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
|
| 140 |
+
* CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
|
| 141 |
+
* current implementation or device is invalid
|
| 142 |
+
*/
|
| 143 |
+
static __OCC_INLINE
|
| 144 |
+
cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
|
| 145 |
+
cudaOccResult *result, // out
|
| 146 |
+
const cudaOccDeviceProp *properties, // in
|
| 147 |
+
const cudaOccFuncAttributes *attributes, // in
|
| 148 |
+
const cudaOccDeviceState *state, // in
|
| 149 |
+
int blockSize, // in
|
| 150 |
+
size_t dynamicSmemSize); // in
|
| 151 |
+
|
| 152 |
+
/**
|
| 153 |
+
* The CUDA launch configurator C API suggests a grid / block size pair (in
|
| 154 |
+
* minGridSize and blockSize) that achieves the best potential occupancy
|
| 155 |
+
* (i.e. maximum number of active warps with the smallest number of blocks) for
|
| 156 |
+
* the given function described by attributes, on a device described by
|
| 157 |
+
* properties with settings in state.
|
| 158 |
+
*
|
| 159 |
+
* If per-block dynamic shared memory allocation is not needed, the user should
|
| 160 |
+
* leave both blockSizeToDynamicSMemSize and dynamicSMemSize as 0.
|
| 161 |
+
*
|
| 162 |
+
* If per-block dynamic shared memory allocation is needed, then if the dynamic
|
| 163 |
+
* shared memory size is constant regardless of block size, the size should be
|
| 164 |
+
* passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be
|
| 165 |
+
* NULL.
|
| 166 |
+
*
|
| 167 |
+
* Otherwise, if the per-block dynamic shared memory size varies with different
|
| 168 |
+
* block sizes, the user needs to provide a pointer to a unary function through
|
| 169 |
+
* blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by
|
| 170 |
+
* a block of the function for any given block size. dynamicSMemSize is
|
| 171 |
+
* ignored. An example signature is:
|
| 172 |
+
*
|
| 173 |
+
* // Take block size, returns dynamic shared memory needed
|
| 174 |
+
* size_t blockToSmem(int blockSize);
|
| 175 |
+
*
|
| 176 |
+
* RETURN VALUE
|
| 177 |
+
*
|
| 178 |
+
* The suggested block size and the minimum number of blocks needed to achieve
|
| 179 |
+
* the maximum occupancy are returned through blockSize and minGridSize.
|
| 180 |
+
*
|
| 181 |
+
* If *blockSize is 0, then the given combination cannot run on the device.
|
| 182 |
+
*
|
| 183 |
+
* ERRORS
|
| 184 |
+
*
|
| 185 |
+
* CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
|
| 186 |
+
* CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
|
| 187 |
+
* current implementation or device is invalid
|
| 188 |
+
*
|
| 189 |
+
*/
|
| 190 |
+
static __OCC_INLINE
|
| 191 |
+
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
|
| 192 |
+
int *minGridSize, // out
|
| 193 |
+
int *blockSize, // out
|
| 194 |
+
const cudaOccDeviceProp *properties, // in
|
| 195 |
+
const cudaOccFuncAttributes *attributes, // in
|
| 196 |
+
const cudaOccDeviceState *state, // in
|
| 197 |
+
size_t (*blockSizeToDynamicSMemSize)(int), // in
|
| 198 |
+
size_t dynamicSMemSize); // in
|
| 199 |
+
|
| 200 |
+
/**
|
| 201 |
+
* The CUDA launch configurator C++ API suggests a grid / block size pair (in
|
| 202 |
+
* minGridSize and blockSize) that achieves the best potential occupancy
|
| 203 |
+
* (i.e. the maximum number of active warps with the smallest number of blocks)
|
| 204 |
+
* for the given function described by attributes, on a device described by
|
| 205 |
+
* properties with settings in state.
|
| 206 |
+
*
|
| 207 |
+
* If per-block dynamic shared memory allocation is 0 or constant regardless of
|
| 208 |
+
* block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to
|
| 209 |
+
* configure the launch. A constant dynamic shared memory allocation size in
|
| 210 |
+
* bytes can be passed through dynamicSMemSize.
|
| 211 |
+
*
|
| 212 |
+
* Otherwise, if the per-block dynamic shared memory size varies with different
|
| 213 |
+
* block sizes, the user needs to use
|
| 214 |
+
* cudaOccMaxPotentialOccupancyBlockSizeVariableSMem instead, and provide a
|
| 215 |
+
* functor / pointer to a unary function (blockSizeToDynamicSMemSize) that
|
| 216 |
+
* computes the dynamic shared memory needed by func for any given block
|
| 217 |
+
* size. An example signature is:
|
| 218 |
+
*
|
| 219 |
+
* // Take block size, returns per-block dynamic shared memory needed
|
| 220 |
+
* size_t blockToSmem(int blockSize);
|
| 221 |
+
*
|
| 222 |
+
* RETURN VALUE
|
| 223 |
+
*
|
| 224 |
+
* The suggested block size and the minimum number of blocks needed to achieve
|
| 225 |
+
* the maximum occupancy are returned through blockSize and minGridSize.
|
| 226 |
+
*
|
| 227 |
+
* If *blockSize is 0, then the given combination cannot run on the device.
|
| 228 |
+
*
|
| 229 |
+
* ERRORS
|
| 230 |
+
*
|
| 231 |
+
* CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
|
| 232 |
+
* CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
|
| 233 |
+
* current implementation or device is invalid
|
| 234 |
+
*
|
| 235 |
+
*/
|
| 236 |
+
|
| 237 |
+
#if defined(__cplusplus)
|
| 238 |
+
namespace {
|
| 239 |
+
|
| 240 |
+
__OCC_INLINE
|
| 241 |
+
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
|
| 242 |
+
int *minGridSize, // out
|
| 243 |
+
int *blockSize, // out
|
| 244 |
+
const cudaOccDeviceProp *properties, // in
|
| 245 |
+
const cudaOccFuncAttributes *attributes, // in
|
| 246 |
+
const cudaOccDeviceState *state, // in
|
| 247 |
+
size_t dynamicSMemSize = 0); // in
|
| 248 |
+
|
| 249 |
+
template <typename UnaryFunction>
|
| 250 |
+
__OCC_INLINE
|
| 251 |
+
cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
|
| 252 |
+
int *minGridSize, // out
|
| 253 |
+
int *blockSize, // out
|
| 254 |
+
const cudaOccDeviceProp *properties, // in
|
| 255 |
+
const cudaOccFuncAttributes *attributes, // in
|
| 256 |
+
const cudaOccDeviceState *state, // in
|
| 257 |
+
UnaryFunction blockSizeToDynamicSMemSize); // in
|
| 258 |
+
|
| 259 |
+
} // namespace anonymous
|
| 260 |
+
#endif // defined(__cplusplus)
|
| 261 |
+
|
| 262 |
+
/**
|
| 263 |
+
*
|
| 264 |
+
* The CUDA dynamic shared memory calculator computes the maximum size of
|
| 265 |
+
* per-block dynamic shared memory if we want to place numBlocks blocks
|
| 266 |
+
* on an SM.
|
| 267 |
+
*
|
| 268 |
+
* RETURN VALUE
|
| 269 |
+
*
|
| 270 |
+
* Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow
|
| 271 |
+
* numBlocks blocks per SM.
|
| 272 |
+
*
|
| 273 |
+
* ERRORS
|
| 274 |
+
*
|
| 275 |
+
* CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
|
| 276 |
+
* CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
|
| 277 |
+
* current implementation or device is invalid
|
| 278 |
+
*
|
| 279 |
+
*/
|
| 280 |
+
static __OCC_INLINE
|
| 281 |
+
cudaOccError cudaOccAvailableDynamicSMemPerBlock(
|
| 282 |
+
size_t *dynamicSmemSize,
|
| 283 |
+
const cudaOccDeviceProp *properties,
|
| 284 |
+
const cudaOccFuncAttributes *attributes,
|
| 285 |
+
const cudaOccDeviceState *state,
|
| 286 |
+
int numBlocks,
|
| 287 |
+
int blockSize);
|
| 288 |
+
|
| 289 |
+
/**
|
| 290 |
+
* Data structures
|
| 291 |
+
*
|
| 292 |
+
* These structures are subject to change for future architecture and CUDA
|
| 293 |
+
* releases. C users should initialize the structure as {0}.
|
| 294 |
+
*
|
| 295 |
+
*/
|
| 296 |
+
|
| 297 |
+
/**
|
| 298 |
+
* Device descriptor
|
| 299 |
+
*
|
| 300 |
+
* This structure describes a device.
|
| 301 |
+
*/
|
| 302 |
+
struct cudaOccDeviceProp {
|
| 303 |
+
int computeMajor; // Compute capability major version
|
| 304 |
+
int computeMinor; // Compute capability minor
|
| 305 |
+
// version. None supported minor version
|
| 306 |
+
// may cause error
|
| 307 |
+
int maxThreadsPerBlock; // Maximum number of threads per block
|
| 308 |
+
int maxThreadsPerMultiprocessor; // Maximum number of threads per SM
|
| 309 |
+
// i.e. (Max. number of warps) x (warp
|
| 310 |
+
// size)
|
| 311 |
+
int regsPerBlock; // Maximum number of registers per block
|
| 312 |
+
int regsPerMultiprocessor; // Maximum number of registers per SM
|
| 313 |
+
int warpSize; // Warp size
|
| 314 |
+
size_t sharedMemPerBlock; // Maximum shared memory size per block
|
| 315 |
+
size_t sharedMemPerMultiprocessor; // Maximum shared memory size per SM
|
| 316 |
+
int numSms; // Number of SMs available
|
| 317 |
+
size_t sharedMemPerBlockOptin; // Maximum optin shared memory size per block
|
| 318 |
+
size_t reservedSharedMemPerBlock; // Shared memory per block reserved by driver
|
| 319 |
+
|
| 320 |
+
#ifdef __cplusplus
|
| 321 |
+
// This structure can be converted from a cudaDeviceProp structure for users
|
| 322 |
+
// that use this header in their CUDA applications.
|
| 323 |
+
//
|
| 324 |
+
// If the application have access to the CUDA Runtime API, the application
|
| 325 |
+
// can obtain the device properties of a CUDA device through
|
| 326 |
+
// cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the
|
| 327 |
+
// cudaDeviceProp structure.
|
| 328 |
+
//
|
| 329 |
+
// Example:
|
| 330 |
+
/*
|
| 331 |
+
{
|
| 332 |
+
cudaDeviceProp prop;
|
| 333 |
+
|
| 334 |
+
cudaGetDeviceProperties(&prop, ...);
|
| 335 |
+
|
| 336 |
+
cudaOccDeviceProp occProp = prop;
|
| 337 |
+
|
| 338 |
+
...
|
| 339 |
+
|
| 340 |
+
cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...);
|
| 341 |
+
}
|
| 342 |
+
*/
|
| 343 |
+
//
|
| 344 |
+
template<typename DeviceProp>
|
| 345 |
+
__OCC_INLINE
|
| 346 |
+
cudaOccDeviceProp(const DeviceProp &props)
|
| 347 |
+
: computeMajor (props.major),
|
| 348 |
+
computeMinor (props.minor),
|
| 349 |
+
maxThreadsPerBlock (props.maxThreadsPerBlock),
|
| 350 |
+
maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
|
| 351 |
+
regsPerBlock (props.regsPerBlock),
|
| 352 |
+
regsPerMultiprocessor (props.regsPerMultiprocessor),
|
| 353 |
+
warpSize (props.warpSize),
|
| 354 |
+
sharedMemPerBlock (props.sharedMemPerBlock),
|
| 355 |
+
sharedMemPerMultiprocessor (props.sharedMemPerMultiprocessor),
|
| 356 |
+
numSms (props.multiProcessorCount),
|
| 357 |
+
sharedMemPerBlockOptin (props.sharedMemPerBlockOptin),
|
| 358 |
+
reservedSharedMemPerBlock (props.reservedSharedMemPerBlock)
|
| 359 |
+
{}
|
| 360 |
+
|
| 361 |
+
__OCC_INLINE
|
| 362 |
+
cudaOccDeviceProp()
|
| 363 |
+
: computeMajor (0),
|
| 364 |
+
computeMinor (0),
|
| 365 |
+
maxThreadsPerBlock (0),
|
| 366 |
+
maxThreadsPerMultiprocessor (0),
|
| 367 |
+
regsPerBlock (0),
|
| 368 |
+
regsPerMultiprocessor (0),
|
| 369 |
+
warpSize (0),
|
| 370 |
+
sharedMemPerBlock (0),
|
| 371 |
+
sharedMemPerMultiprocessor (0),
|
| 372 |
+
numSms (0),
|
| 373 |
+
sharedMemPerBlockOptin (0),
|
| 374 |
+
reservedSharedMemPerBlock (0)
|
| 375 |
+
{}
|
| 376 |
+
#endif // __cplusplus
|
| 377 |
+
};
|
| 378 |
+
|
| 379 |
+
/**
 * Partitioned global caching preference of a function.
 */
typedef enum cudaOccPartitionedGCConfig_enum {
    // Disable partitioned global caching
    PARTITIONED_GC_OFF       = 0,
    // Prefer partitioned global caching
    PARTITIONED_GC_ON        = 1,
    // Force partitioned global caching
    PARTITIONED_GC_ON_STRICT = 2
} cudaOccPartitionedGCConfig;
|
| 387 |
+
|
| 388 |
+
/**
 * Per-function opt-in for the maximum dynamic shared memory limit.
 *
 * The last enumerator has no trailing comma so that the header also builds
 * with pedantic C90 / C++03 compilers, matching the other enums in this file.
 */
typedef enum cudaOccFuncShmemConfig_enum {
    FUNC_SHMEM_LIMIT_DEFAULT,   // Default shmem limit
    FUNC_SHMEM_LIMIT_OPTIN      // Use the optin shmem limit
} cudaOccFuncShmemConfig;
|
| 395 |
+
|
| 396 |
+
/**
|
| 397 |
+
* Function descriptor
|
| 398 |
+
*
|
| 399 |
+
* This structure describes a CUDA function.
|
| 400 |
+
*/
|
| 401 |
+
struct cudaOccFuncAttributes {
|
| 402 |
+
int maxThreadsPerBlock; // Maximum block size the function can work with. If
|
| 403 |
+
// unlimited, use INT_MAX or any value greater than
|
| 404 |
+
// or equal to maxThreadsPerBlock of the device
|
| 405 |
+
int numRegs; // Number of registers used. When the function is
|
| 406 |
+
// launched on device, the register count may change
|
| 407 |
+
// due to internal tools requirements.
|
| 408 |
+
size_t sharedSizeBytes; // Number of static shared memory used
|
| 409 |
+
|
| 410 |
+
cudaOccPartitionedGCConfig partitionedGCConfig;
|
| 411 |
+
// Partitioned global caching is required to enable
|
| 412 |
+
// caching on certain chips, such as sm_52
|
| 413 |
+
// devices. Partitioned global caching can be
|
| 414 |
+
// automatically disabled if the occupancy
|
| 415 |
+
// requirement of the launch cannot support caching.
|
| 416 |
+
//
|
| 417 |
+
// To override this behavior with caching on and
|
| 418 |
+
// calculate occupancy strictly according to the
|
| 419 |
+
// preference, set partitionedGCConfig to
|
| 420 |
+
// PARTITIONED_GC_ON_STRICT. This is especially
|
| 421 |
+
// useful for experimenting and finding launch
|
| 422 |
+
// configurations (MaxPotentialOccupancyBlockSize)
|
| 423 |
+
// that allow global caching to take effect.
|
| 424 |
+
//
|
| 425 |
+
// This flag only affects the occupancy calculation.
|
| 426 |
+
|
| 427 |
+
cudaOccFuncShmemConfig shmemLimitConfig;
|
| 428 |
+
// Certain chips like sm_70 allow a user to opt into
|
| 429 |
+
// a higher per block limit of dynamic shared memory
|
| 430 |
+
// This optin is performed on a per function basis
|
| 431 |
+
// using the cuFuncSetAttribute function
|
| 432 |
+
|
| 433 |
+
size_t maxDynamicSharedSizeBytes;
|
| 434 |
+
// User set limit on maximum dynamic shared memory
|
| 435 |
+
// usable by the kernel
|
| 436 |
+
// This limit is set using the cuFuncSetAttribute
|
| 437 |
+
// function.
|
| 438 |
+
|
| 439 |
+
int numBlockBarriers; // Number of block barriers used (default to 1)
|
| 440 |
+
#ifdef __cplusplus
|
| 441 |
+
// This structure can be converted from a cudaFuncAttributes structure for
|
| 442 |
+
// users that use this header in their CUDA applications.
|
| 443 |
+
//
|
| 444 |
+
// If the application have access to the CUDA Runtime API, the application
|
| 445 |
+
// can obtain the function attributes of a CUDA kernel function through
|
| 446 |
+
// cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the
|
| 447 |
+
// cudaFuncAttributes structure.
|
| 448 |
+
//
|
| 449 |
+
// Example:
|
| 450 |
+
/*
|
| 451 |
+
__global__ void foo() {...}
|
| 452 |
+
|
| 453 |
+
...
|
| 454 |
+
|
| 455 |
+
{
|
| 456 |
+
cudaFuncAttributes attr;
|
| 457 |
+
|
| 458 |
+
cudaFuncGetAttributes(&attr, foo);
|
| 459 |
+
|
| 460 |
+
cudaOccFuncAttributes occAttr = attr;
|
| 461 |
+
|
| 462 |
+
...
|
| 463 |
+
|
| 464 |
+
cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...);
|
| 465 |
+
}
|
| 466 |
+
*/
|
| 467 |
+
//
|
| 468 |
+
template<typename FuncAttributes>
|
| 469 |
+
__OCC_INLINE
|
| 470 |
+
cudaOccFuncAttributes(const FuncAttributes &attr)
|
| 471 |
+
: maxThreadsPerBlock (attr.maxThreadsPerBlock),
|
| 472 |
+
numRegs (attr.numRegs),
|
| 473 |
+
sharedSizeBytes (attr.sharedSizeBytes),
|
| 474 |
+
partitionedGCConfig (PARTITIONED_GC_OFF),
|
| 475 |
+
shmemLimitConfig (FUNC_SHMEM_LIMIT_OPTIN),
|
| 476 |
+
maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
|
| 477 |
+
numBlockBarriers (1)
|
| 478 |
+
{}
|
| 479 |
+
|
| 480 |
+
__OCC_INLINE
|
| 481 |
+
cudaOccFuncAttributes()
|
| 482 |
+
: maxThreadsPerBlock (0),
|
| 483 |
+
numRegs (0),
|
| 484 |
+
sharedSizeBytes (0),
|
| 485 |
+
partitionedGCConfig (PARTITIONED_GC_OFF),
|
| 486 |
+
shmemLimitConfig (FUNC_SHMEM_LIMIT_DEFAULT),
|
| 487 |
+
maxDynamicSharedSizeBytes (0),
|
| 488 |
+
numBlockBarriers (0)
|
| 489 |
+
{}
|
| 490 |
+
#endif
|
| 491 |
+
};
|
| 492 |
+
|
| 493 |
+
/**
 * Cache / shared memory split preference.
 */
typedef enum cudaOccCacheConfig_enum {
    // no preference for shared memory or L1 (default)
    CACHE_PREFER_NONE   = 0,
    // prefer larger shared memory and smaller L1 cache
    CACHE_PREFER_SHARED = 1,
    // prefer larger L1 cache and smaller shared memory
    CACHE_PREFER_L1     = 2,
    // prefer equal sized L1 cache and shared memory
    CACHE_PREFER_EQUAL  = 3
} cudaOccCacheConfig;
|
| 499 |
+
|
| 500 |
+
/**
 * Named shared memory carveout preferences.
 */
typedef enum cudaOccCarveoutConfig_enum {
    // no preference for shared memory or L1 (default)
    SHAREDMEM_CARVEOUT_DEFAULT    = -1,
    // prefer maximum available shared memory, minimum L1 cache
    SHAREDMEM_CARVEOUT_MAX_SHARED = 100,
    // prefer maximum available L1 cache, minimum shared memory
    SHAREDMEM_CARVEOUT_MAX_L1     = 0,
    // prefer half of maximum available shared memory, with the rest as L1 cache
    SHAREDMEM_CARVEOUT_HALF       = 50
} cudaOccCarveoutConfig;
|
| 506 |
+
|
| 507 |
+
/**
|
| 508 |
+
* Device state descriptor
|
| 509 |
+
*
|
| 510 |
+
* This structure describes device settings that affect occupancy calculation.
|
| 511 |
+
*/
|
| 512 |
+
struct cudaOccDeviceState
|
| 513 |
+
{
|
| 514 |
+
// Cache / shared memory split preference. Deprecated on Volta
|
| 515 |
+
cudaOccCacheConfig cacheConfig;
|
| 516 |
+
// Shared memory / L1 split preference. Supported on only Volta
|
| 517 |
+
int carveoutConfig;
|
| 518 |
+
|
| 519 |
+
#ifdef __cplusplus
|
| 520 |
+
__OCC_INLINE
|
| 521 |
+
cudaOccDeviceState()
|
| 522 |
+
: cacheConfig (CACHE_PREFER_NONE),
|
| 523 |
+
carveoutConfig (SHAREDMEM_CARVEOUT_DEFAULT)
|
| 524 |
+
{}
|
| 525 |
+
#endif
|
| 526 |
+
};
|
| 527 |
+
|
| 528 |
+
/**
 * Reasons occupancy was limited. Each enumerator is a distinct bit so that
 * several of them can be combined in cudaOccResult::limitingFactors.
 */
typedef enum cudaOccLimitingFactor_enum {
                                        // Occupancy limited due to:
    OCC_LIMIT_WARPS         = 1 << 0,   // - warps available
    OCC_LIMIT_REGISTERS     = 1 << 1,   // - registers available
    OCC_LIMIT_SHARED_MEMORY = 1 << 2,   // - shared memory available
    OCC_LIMIT_BLOCKS        = 1 << 3,   // - blocks available
    OCC_LIMIT_BARRIERS      = 1 << 4    // - barrier available
} cudaOccLimitingFactor;
|
| 536 |
+
|
| 537 |
+
/**
|
| 538 |
+
* Occupancy output
|
| 539 |
+
*
|
| 540 |
+
* This structure contains occupancy calculator's output.
|
| 541 |
+
*/
|
| 542 |
+
struct cudaOccResult {
|
| 543 |
+
int activeBlocksPerMultiprocessor; // Occupancy
|
| 544 |
+
unsigned int limitingFactors; // Factors that limited occupancy. A bit
|
| 545 |
+
// field that counts the limiting
|
| 546 |
+
// factors, see cudaOccLimitingFactor
|
| 547 |
+
int blockLimitRegs; // Occupancy due to register
|
| 548 |
+
// usage, INT_MAX if the kernel does not
|
| 549 |
+
// use any register.
|
| 550 |
+
int blockLimitSharedMem; // Occupancy due to shared memory
|
| 551 |
+
// usage, INT_MAX if the kernel does not
|
| 552 |
+
// use shared memory.
|
| 553 |
+
int blockLimitWarps; // Occupancy due to block size limit
|
| 554 |
+
int blockLimitBlocks; // Occupancy due to maximum number of blocks
|
| 555 |
+
// managable per SM
|
| 556 |
+
int blockLimitBarriers; // Occupancy due to block barrier usage
|
| 557 |
+
int allocatedRegistersPerBlock; // Actual number of registers allocated per
|
| 558 |
+
// block
|
| 559 |
+
size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated
|
| 560 |
+
// per block
|
| 561 |
+
cudaOccPartitionedGCConfig partitionedGCConfig;
|
| 562 |
+
// Report if partitioned global caching
|
| 563 |
+
// is actually enabled.
|
| 564 |
+
};
|
| 565 |
+
|
| 566 |
+
/**
 * Partitioned global caching support
 *
 * See cudaOccPartitionedGlobalCachingModeSupport
 *
 * The last enumerator has no trailing comma so that the header also builds
 * with pedantic C90 / C++03 compilers, matching the other enums in this file.
 */
typedef enum cudaOccPartitionedGCSupport_enum {
    PARTITIONED_GC_NOT_SUPPORTED,   // Partitioned global caching is not supported
    PARTITIONED_GC_SUPPORTED        // Partitioned global caching is supported
} cudaOccPartitionedGCSupport;
|
| 575 |
+
|
| 576 |
+
/**
|
| 577 |
+
* Implementation
|
| 578 |
+
*/
|
| 579 |
+
|
| 580 |
+
/**
|
| 581 |
+
* Max compute capability supported
|
| 582 |
+
*/
|
| 583 |
+
#define __CUDA_OCC_MAJOR__ 9
|
| 584 |
+
#define __CUDA_OCC_MINOR__ 0
|
| 585 |
+
|
| 586 |
+
//////////////////////////////////////////
|
| 587 |
+
// Mathematical Helper Functions //
|
| 588 |
+
//////////////////////////////////////////
|
| 589 |
+
|
| 590 |
+
/**
 * Returns the smaller of two ints; lhs is returned when they compare equal.
 */
static __OCC_INLINE int __occMin(int lhs, int rhs)
{
    if (rhs < lhs) {
        return rhs;
    }
    return lhs;
}
|
| 594 |
+
|
| 595 |
+
/**
 * Integer division of x by y with the quotient rounded up, computed as
 * (x + (y - 1)) / y. No check is made for y == 0.
 */
static __OCC_INLINE int __occDivideRoundUp(int x, int y)
{
    // Bias the numerator first so the truncating division rounds upwards.
    // The grouping x + (y - 1) is kept as-is to preserve overflow behavior.
    const int biased = x + (y - 1);

    return biased / y;
}
|
| 599 |
+
|
| 600 |
+
/**
 * Rounds x up to a multiple of y, i.e. y * __occDivideRoundUp(x, y).
 */
static __OCC_INLINE int __occRoundUp(int x, int y)
{
    const int chunks = __occDivideRoundUp(x, y);

    return y * chunks;
}
|
| 604 |
+
|
| 605 |
+
//////////////////////////////////////////
|
| 606 |
+
// Architectural Properties //
|
| 607 |
+
//////////////////////////////////////////
|
| 608 |
+
|
| 609 |
+
/**
|
| 610 |
+
* Granularity of shared memory allocation
|
| 611 |
+
*/
|
| 612 |
+
static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
|
| 613 |
+
{
|
| 614 |
+
int value;
|
| 615 |
+
|
| 616 |
+
switch(properties->computeMajor) {
|
| 617 |
+
case 3:
|
| 618 |
+
case 5:
|
| 619 |
+
case 6:
|
| 620 |
+
case 7:
|
| 621 |
+
value = 256;
|
| 622 |
+
break;
|
| 623 |
+
case 8:
|
| 624 |
+
case 9:
|
| 625 |
+
value = 128;
|
| 626 |
+
break;
|
| 627 |
+
default:
|
| 628 |
+
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
|
| 629 |
+
}
|
| 630 |
+
|
| 631 |
+
*limit = value;
|
| 632 |
+
|
| 633 |
+
return CUDA_OCC_SUCCESS;
|
| 634 |
+
}
|
| 635 |
+
|
| 636 |
+
/**
|
| 637 |
+
* Maximum number of registers per thread
|
| 638 |
+
*/
|
| 639 |
+
static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties)
|
| 640 |
+
{
|
| 641 |
+
int value;
|
| 642 |
+
|
| 643 |
+
switch(properties->computeMajor) {
|
| 644 |
+
case 3:
|
| 645 |
+
case 5:
|
| 646 |
+
case 6:
|
| 647 |
+
value = 255;
|
| 648 |
+
break;
|
| 649 |
+
case 7:
|
| 650 |
+
case 8:
|
| 651 |
+
case 9:
|
| 652 |
+
value = 256;
|
| 653 |
+
break;
|
| 654 |
+
default:
|
| 655 |
+
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
|
| 656 |
+
}
|
| 657 |
+
|
| 658 |
+
*limit = value;
|
| 659 |
+
|
| 660 |
+
return CUDA_OCC_SUCCESS;
|
| 661 |
+
}
|
| 662 |
+
|
| 663 |
+
/**
|
| 664 |
+
* Granularity of register allocation
|
| 665 |
+
*/
|
| 666 |
+
static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
|
| 667 |
+
{
|
| 668 |
+
int value;
|
| 669 |
+
|
| 670 |
+
switch(properties->computeMajor) {
|
| 671 |
+
case 3:
|
| 672 |
+
case 5:
|
| 673 |
+
case 6:
|
| 674 |
+
case 7:
|
| 675 |
+
case 8:
|
| 676 |
+
case 9:
|
| 677 |
+
value = 256;
|
| 678 |
+
break;
|
| 679 |
+
default:
|
| 680 |
+
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
|
| 681 |
+
}
|
| 682 |
+
|
| 683 |
+
*limit = value;
|
| 684 |
+
|
| 685 |
+
return CUDA_OCC_SUCCESS;
|
| 686 |
+
}
|
| 687 |
+
|
| 688 |
+
/**
|
| 689 |
+
* Number of sub-partitions
|
| 690 |
+
*/
|
| 691 |
+
static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties)
|
| 692 |
+
{
|
| 693 |
+
int value;
|
| 694 |
+
|
| 695 |
+
switch(properties->computeMajor) {
|
| 696 |
+
case 3:
|
| 697 |
+
case 5:
|
| 698 |
+
case 7:
|
| 699 |
+
case 8:
|
| 700 |
+
case 9:
|
| 701 |
+
value = 4;
|
| 702 |
+
break;
|
| 703 |
+
case 6:
|
| 704 |
+
value = properties->computeMinor ? 4 : 2;
|
| 705 |
+
break;
|
| 706 |
+
default:
|
| 707 |
+
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
|
| 708 |
+
}
|
| 709 |
+
|
| 710 |
+
*limit = value;
|
| 711 |
+
|
| 712 |
+
return CUDA_OCC_SUCCESS;
|
| 713 |
+
}
|
| 714 |
+
|
| 715 |
+
|
| 716 |
+
/**
|
| 717 |
+
* Maximum number of blocks that can run simultaneously on a multiprocessor
|
| 718 |
+
*/
|
| 719 |
+
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties)
|
| 720 |
+
{
|
| 721 |
+
int value;
|
| 722 |
+
|
| 723 |
+
switch(properties->computeMajor) {
|
| 724 |
+
case 3:
|
| 725 |
+
value = 16;
|
| 726 |
+
break;
|
| 727 |
+
case 5:
|
| 728 |
+
case 6:
|
| 729 |
+
value = 32;
|
| 730 |
+
break;
|
| 731 |
+
case 7: {
|
| 732 |
+
int isTuring = properties->computeMinor == 5;
|
| 733 |
+
value = (isTuring) ? 16 : 32;
|
| 734 |
+
break;
|
| 735 |
+
}
|
| 736 |
+
case 8:
|
| 737 |
+
if (properties->computeMinor == 0) {
|
| 738 |
+
value = 32;
|
| 739 |
+
}
|
| 740 |
+
else if (properties->computeMinor == 9) {
|
| 741 |
+
value = 24;
|
| 742 |
+
}
|
| 743 |
+
else {
|
| 744 |
+
value = 16;
|
| 745 |
+
}
|
| 746 |
+
break;
|
| 747 |
+
case 9:
|
| 748 |
+
value = 32;
|
| 749 |
+
break;
|
| 750 |
+
default:
|
| 751 |
+
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
|
| 752 |
+
}
|
| 753 |
+
|
| 754 |
+
*limit = value;
|
| 755 |
+
|
| 756 |
+
return CUDA_OCC_SUCCESS;
|
| 757 |
+
}
|
| 758 |
+
|
| 759 |
+
/**
|
| 760 |
+
* Align up shared memory based on compute major configurations
|
| 761 |
+
*/
|
| 762 |
+
static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties)
|
| 763 |
+
{
|
| 764 |
+
// Volta and Turing have shared L1 cache / shared memory, and support cache
|
| 765 |
+
// configuration to trade one for the other. These values are needed to
|
| 766 |
+
// map carveout config ratio to the next available architecture size
|
| 767 |
+
size_t size = *shMemSize;
|
| 768 |
+
|
| 769 |
+
switch (properties->computeMajor) {
|
| 770 |
+
case 7: {
|
| 771 |
+
// Turing supports 32KB and 64KB shared mem.
|
| 772 |
+
int isTuring = properties->computeMinor == 5;
|
| 773 |
+
if (isTuring) {
|
| 774 |
+
if (size <= 32 * 1024) {
|
| 775 |
+
*shMemSize = 32 * 1024;
|
| 776 |
+
}
|
| 777 |
+
else if (size <= 64 * 1024) {
|
| 778 |
+
*shMemSize = 64 * 1024;
|
| 779 |
+
}
|
| 780 |
+
else {
|
| 781 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 782 |
+
}
|
| 783 |
+
}
|
| 784 |
+
// Volta supports 0KB, 8KB, 16KB, 32KB, 64KB, and 96KB shared mem.
|
| 785 |
+
else {
|
| 786 |
+
if (size == 0) {
|
| 787 |
+
*shMemSize = 0;
|
| 788 |
+
}
|
| 789 |
+
else if (size <= 8 * 1024) {
|
| 790 |
+
*shMemSize = 8 * 1024;
|
| 791 |
+
}
|
| 792 |
+
else if (size <= 16 * 1024) {
|
| 793 |
+
*shMemSize = 16 * 1024;
|
| 794 |
+
}
|
| 795 |
+
else if (size <= 32 * 1024) {
|
| 796 |
+
*shMemSize = 32 * 1024;
|
| 797 |
+
}
|
| 798 |
+
else if (size <= 64 * 1024) {
|
| 799 |
+
*shMemSize = 64 * 1024;
|
| 800 |
+
}
|
| 801 |
+
else if (size <= 96 * 1024) {
|
| 802 |
+
*shMemSize = 96 * 1024;
|
| 803 |
+
}
|
| 804 |
+
else {
|
| 805 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 806 |
+
}
|
| 807 |
+
}
|
| 808 |
+
break;
|
| 809 |
+
}
|
| 810 |
+
case 8:
|
| 811 |
+
if (properties->computeMinor == 0 || properties->computeMinor == 7) {
|
| 812 |
+
if (size == 0) {
|
| 813 |
+
*shMemSize = 0;
|
| 814 |
+
}
|
| 815 |
+
else if (size <= 8 * 1024) {
|
| 816 |
+
*shMemSize = 8 * 1024;
|
| 817 |
+
}
|
| 818 |
+
else if (size <= 16 * 1024) {
|
| 819 |
+
*shMemSize = 16 * 1024;
|
| 820 |
+
}
|
| 821 |
+
else if (size <= 32 * 1024) {
|
| 822 |
+
*shMemSize = 32 * 1024;
|
| 823 |
+
}
|
| 824 |
+
else if (size <= 64 * 1024) {
|
| 825 |
+
*shMemSize = 64 * 1024;
|
| 826 |
+
}
|
| 827 |
+
else if (size <= 100 * 1024) {
|
| 828 |
+
*shMemSize = 100 * 1024;
|
| 829 |
+
}
|
| 830 |
+
else if (size <= 132 * 1024) {
|
| 831 |
+
*shMemSize = 132 * 1024;
|
| 832 |
+
}
|
| 833 |
+
else if (size <= 164 * 1024) {
|
| 834 |
+
*shMemSize = 164 * 1024;
|
| 835 |
+
}
|
| 836 |
+
else {
|
| 837 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 838 |
+
}
|
| 839 |
+
}
|
| 840 |
+
else {
|
| 841 |
+
if (size == 0) {
|
| 842 |
+
*shMemSize = 0;
|
| 843 |
+
}
|
| 844 |
+
else if (size <= 8 * 1024) {
|
| 845 |
+
*shMemSize = 8 * 1024;
|
| 846 |
+
}
|
| 847 |
+
else if (size <= 16 * 1024) {
|
| 848 |
+
*shMemSize = 16 * 1024;
|
| 849 |
+
}
|
| 850 |
+
else if (size <= 32 * 1024) {
|
| 851 |
+
*shMemSize = 32 * 1024;
|
| 852 |
+
}
|
| 853 |
+
else if (size <= 64 * 1024) {
|
| 854 |
+
*shMemSize = 64 * 1024;
|
| 855 |
+
}
|
| 856 |
+
else if (size <= 100 * 1024) {
|
| 857 |
+
*shMemSize = 100 * 1024;
|
| 858 |
+
}
|
| 859 |
+
else {
|
| 860 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 861 |
+
}
|
| 862 |
+
}
|
| 863 |
+
break;
|
| 864 |
+
case 9: {
|
| 865 |
+
if (size == 0) {
|
| 866 |
+
*shMemSize = 0;
|
| 867 |
+
}
|
| 868 |
+
else if (size <= 8 * 1024) {
|
| 869 |
+
*shMemSize = 8 * 1024;
|
| 870 |
+
}
|
| 871 |
+
else if (size <= 16 * 1024) {
|
| 872 |
+
*shMemSize = 16 * 1024;
|
| 873 |
+
}
|
| 874 |
+
else if (size <= 32 * 1024) {
|
| 875 |
+
*shMemSize = 32 * 1024;
|
| 876 |
+
}
|
| 877 |
+
else if (size <= 64 * 1024) {
|
| 878 |
+
*shMemSize = 64 * 1024;
|
| 879 |
+
}
|
| 880 |
+
else if (size <= 100 * 1024) {
|
| 881 |
+
*shMemSize = 100 * 1024;
|
| 882 |
+
}
|
| 883 |
+
else if (size <= 132 * 1024) {
|
| 884 |
+
*shMemSize = 132 * 1024;
|
| 885 |
+
}
|
| 886 |
+
else if (size <= 164 * 1024) {
|
| 887 |
+
*shMemSize = 164 * 1024;
|
| 888 |
+
}
|
| 889 |
+
else if (size <= 196 * 1024) {
|
| 890 |
+
*shMemSize = 196 * 1024;
|
| 891 |
+
}
|
| 892 |
+
else if (size <= 228 * 1024) {
|
| 893 |
+
*shMemSize = 228 * 1024;
|
| 894 |
+
}
|
| 895 |
+
else {
|
| 896 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 897 |
+
}
|
| 898 |
+
break;
|
| 899 |
+
}
|
| 900 |
+
default:
|
| 901 |
+
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
|
| 902 |
+
}
|
| 903 |
+
|
| 904 |
+
return CUDA_OCC_SUCCESS;
|
| 905 |
+
}
|
| 906 |
+
|
| 907 |
+
/**
|
| 908 |
+
* Shared memory based on the new carveoutConfig API introduced with Volta
|
| 909 |
+
*/
|
| 910 |
+
static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
|
| 911 |
+
{
|
| 912 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 913 |
+
size_t preferenceShmemSize;
|
| 914 |
+
|
| 915 |
+
// CUDA 9.0 introduces a new API to set shared memory - L1 configuration on supported
|
| 916 |
+
// devices. This preference will take precedence over the older cacheConfig setting.
|
| 917 |
+
// Map cacheConfig to its effective preference value.
|
| 918 |
+
int effectivePreference = state->carveoutConfig;
|
| 919 |
+
if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) {
|
| 920 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 921 |
+
}
|
| 922 |
+
|
| 923 |
+
if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
|
| 924 |
+
switch (state->cacheConfig)
|
| 925 |
+
{
|
| 926 |
+
case CACHE_PREFER_L1:
|
| 927 |
+
effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1;
|
| 928 |
+
break;
|
| 929 |
+
case CACHE_PREFER_SHARED:
|
| 930 |
+
effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED;
|
| 931 |
+
break;
|
| 932 |
+
case CACHE_PREFER_EQUAL:
|
| 933 |
+
effectivePreference = SHAREDMEM_CARVEOUT_HALF;
|
| 934 |
+
break;
|
| 935 |
+
default:
|
| 936 |
+
effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT;
|
| 937 |
+
break;
|
| 938 |
+
}
|
| 939 |
+
}
|
| 940 |
+
|
| 941 |
+
if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
|
| 942 |
+
preferenceShmemSize = properties->sharedMemPerMultiprocessor;
|
| 943 |
+
}
|
| 944 |
+
else {
|
| 945 |
+
preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100;
|
| 946 |
+
}
|
| 947 |
+
|
| 948 |
+
status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties);
|
| 949 |
+
*limit = preferenceShmemSize;
|
| 950 |
+
return status;
|
| 951 |
+
}
|
| 952 |
+
|
| 953 |
+
/**
|
| 954 |
+
* Shared memory based on the cacheConfig
|
| 955 |
+
*/
|
| 956 |
+
static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
|
| 957 |
+
{
|
| 958 |
+
size_t bytes = 0;
|
| 959 |
+
size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor;
|
| 960 |
+
cudaOccCacheConfig cacheConfig = state->cacheConfig;
|
| 961 |
+
|
| 962 |
+
// Kepler has shared L1 cache / shared memory, and support cache
|
| 963 |
+
// configuration to trade one for the other. These values are needed to
|
| 964 |
+
// calculate the correct shared memory size for user requested cache
|
| 965 |
+
// configuration.
|
| 966 |
+
//
|
| 967 |
+
size_t minCacheSize = 16384;
|
| 968 |
+
size_t maxCacheSize = 49152;
|
| 969 |
+
size_t cacheAndSharedTotal = sharedMemPerMultiprocessorHigh + minCacheSize;
|
| 970 |
+
size_t sharedMemPerMultiprocessorLow = cacheAndSharedTotal - maxCacheSize;
|
| 971 |
+
|
| 972 |
+
switch (properties->computeMajor) {
|
| 973 |
+
case 3:
|
| 974 |
+
// Kepler supports 16KB, 32KB, or 48KB partitions for L1. The rest
|
| 975 |
+
// is shared memory.
|
| 976 |
+
//
|
| 977 |
+
switch (cacheConfig) {
|
| 978 |
+
default :
|
| 979 |
+
case CACHE_PREFER_NONE:
|
| 980 |
+
case CACHE_PREFER_SHARED:
|
| 981 |
+
bytes = sharedMemPerMultiprocessorHigh;
|
| 982 |
+
break;
|
| 983 |
+
case CACHE_PREFER_L1:
|
| 984 |
+
bytes = sharedMemPerMultiprocessorLow;
|
| 985 |
+
break;
|
| 986 |
+
case CACHE_PREFER_EQUAL:
|
| 987 |
+
// Equal is the mid-point between high and low. It should be
|
| 988 |
+
// equivalent to low + 16KB.
|
| 989 |
+
//
|
| 990 |
+
bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2;
|
| 991 |
+
break;
|
| 992 |
+
}
|
| 993 |
+
break;
|
| 994 |
+
case 5:
|
| 995 |
+
case 6:
|
| 996 |
+
// Maxwell and Pascal have dedicated shared memory.
|
| 997 |
+
//
|
| 998 |
+
bytes = sharedMemPerMultiprocessorHigh;
|
| 999 |
+
break;
|
| 1000 |
+
default:
|
| 1001 |
+
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
|
| 1002 |
+
}
|
| 1003 |
+
|
| 1004 |
+
*limit = bytes;
|
| 1005 |
+
|
| 1006 |
+
return CUDA_OCC_SUCCESS;
|
| 1007 |
+
}
|
| 1008 |
+
|
| 1009 |
+
/**
|
| 1010 |
+
* Shared memory based on config requested by User
|
| 1011 |
+
*/
|
| 1012 |
+
static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
|
| 1013 |
+
{
|
| 1014 |
+
// Volta introduces a new API that allows for shared memory carveout preference. Because it is a shared memory preference,
|
| 1015 |
+
// it is handled separately from the cache config preference.
|
| 1016 |
+
if (properties->computeMajor >= 7) {
|
| 1017 |
+
return cudaOccSMemPreferenceVoltaPlus(limit, properties, state);
|
| 1018 |
+
}
|
| 1019 |
+
return cudaOccSMemPreference(limit, properties, state);
|
| 1020 |
+
}
|
| 1021 |
+
|
| 1022 |
+
/**
|
| 1023 |
+
* Return the per block shared memory limit based on function config
|
| 1024 |
+
*/
|
| 1025 |
+
static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta)
|
| 1026 |
+
{
|
| 1027 |
+
switch (properties->computeMajor) {
|
| 1028 |
+
case 2:
|
| 1029 |
+
case 3:
|
| 1030 |
+
case 4:
|
| 1031 |
+
case 5:
|
| 1032 |
+
case 6:
|
| 1033 |
+
*limit = properties->sharedMemPerBlock;
|
| 1034 |
+
break;
|
| 1035 |
+
case 7:
|
| 1036 |
+
case 8:
|
| 1037 |
+
case 9:
|
| 1038 |
+
switch (shmemLimitConfig) {
|
| 1039 |
+
default:
|
| 1040 |
+
case FUNC_SHMEM_LIMIT_DEFAULT:
|
| 1041 |
+
*limit = properties->sharedMemPerBlock;
|
| 1042 |
+
break;
|
| 1043 |
+
case FUNC_SHMEM_LIMIT_OPTIN:
|
| 1044 |
+
if (smemPerCta > properties->sharedMemPerBlock) {
|
| 1045 |
+
*limit = properties->sharedMemPerBlockOptin;
|
| 1046 |
+
}
|
| 1047 |
+
else {
|
| 1048 |
+
*limit = properties->sharedMemPerBlock;
|
| 1049 |
+
}
|
| 1050 |
+
break;
|
| 1051 |
+
}
|
| 1052 |
+
break;
|
| 1053 |
+
default:
|
| 1054 |
+
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
|
| 1055 |
+
}
|
| 1056 |
+
|
| 1057 |
+
// Starting Ampere, CUDA driver reserves additional shared memory per block
|
| 1058 |
+
if (properties->computeMajor >= 8) {
|
| 1059 |
+
*limit += properties->reservedSharedMemPerBlock;
|
| 1060 |
+
}
|
| 1061 |
+
|
| 1062 |
+
return CUDA_OCC_SUCCESS;
|
| 1063 |
+
}
|
| 1064 |
+
|
| 1065 |
+
/**
|
| 1066 |
+
* Partitioned global caching mode support
|
| 1067 |
+
*/
|
| 1068 |
+
static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties)
|
| 1069 |
+
{
|
| 1070 |
+
*limit = PARTITIONED_GC_NOT_SUPPORTED;
|
| 1071 |
+
|
| 1072 |
+
if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) ||
|
| 1073 |
+
properties->computeMajor == 6) {
|
| 1074 |
+
*limit = PARTITIONED_GC_SUPPORTED;
|
| 1075 |
+
}
|
| 1076 |
+
|
| 1077 |
+
if (properties->computeMajor == 6 && properties->computeMinor == 0) {
|
| 1078 |
+
*limit = PARTITIONED_GC_NOT_SUPPORTED;
|
| 1079 |
+
}
|
| 1080 |
+
|
| 1081 |
+
return CUDA_OCC_SUCCESS;
|
| 1082 |
+
}
|
| 1083 |
+
|
| 1084 |
+
///////////////////////////////////////////////
|
| 1085 |
+
// User Input Sanity //
|
| 1086 |
+
///////////////////////////////////////////////
|
| 1087 |
+
|
| 1088 |
+
/**
 * Verify device properties
 *
 * Each of the checked limits must be a positive number; if any of them is
 * not, CUDA_OCC_ERROR_INVALID_INPUT is returned. Compute capability is not
 * validated here, it is checked during the occupancy calculation.
 */
static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties)
{
    const int allLimitsPositive =
        properties->maxThreadsPerBlock          > 0 &&
        properties->maxThreadsPerMultiprocessor > 0 &&
        properties->regsPerBlock                > 0 &&
        properties->regsPerMultiprocessor       > 0 &&
        properties->warpSize                    > 0 &&
        properties->sharedMemPerBlock           > 0 &&
        properties->sharedMemPerMultiprocessor  > 0 &&
        properties->numSms                      > 0;

    return allLimitsPositive ? CUDA_OCC_SUCCESS : CUDA_OCC_ERROR_INVALID_INPUT;
}
|
| 1109 |
+
|
| 1110 |
+
/**
 * Verify function attributes
 *
 * Returns CUDA_OCC_ERROR_INVALID_INPUT when maxThreadsPerBlock is not
 * positive or numRegs is negative, CUDA_OCC_SUCCESS otherwise.
 */
static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes)
{
    if (attributes->maxThreadsPerBlock <= 0) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    // A register count of zero is legal: the compiler may choose not to use
    // any register (empty kernels, etc.). Only negative counts are rejected.
    //
    if (attributes->numRegs < 0) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    return CUDA_OCC_SUCCESS;
}
|
| 1123 |
+
|
| 1124 |
+
/**
 * Verify device state
 *
 * Placeholder: no field of the state is inspected, so this check always
 * succeeds.
 */
static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state)
{
    // The parameter is intentionally unused; the cast keeps compilers from
    // warning about it.
    //
    (void)state;

    return CUDA_OCC_SUCCESS;
}
|
| 1132 |
+
|
| 1133 |
+
/**
 * Run all user input sanity checks.
 *
 * Validates, in order, the device properties, the function attributes and
 * the device state. The status of the first failing check is returned and
 * the remaining checks are not run; CUDA_OCC_SUCCESS means all three passed.
 */
static __OCC_INLINE cudaOccError cudaOccInputCheck(
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state)
{
    cudaOccError status = cudaOccDevicePropCheck(properties);

    if (status == CUDA_OCC_SUCCESS) {
        status = cudaOccFuncAttributesCheck(attributes);
    }

    if (status == CUDA_OCC_SUCCESS) {
        status = cudaOccDeviceStateCheck(state);
    }

    return status;
}
|
| 1157 |
+
|
| 1158 |
+
///////////////////////////////////////////////
|
| 1159 |
+
// Occupancy calculation Functions //
|
| 1160 |
+
///////////////////////////////////////////////
|
| 1161 |
+
|
| 1162 |
+
/**
 * Partitioned global caching configuration to start the calculation with.
 *
 * Returns the configuration requested in the function attributes, unless the
 * device does not support partitioned global caching, in which case
 * PARTITIONED_GC_OFF is returned regardless of the request.
 */
static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes)
{
    cudaOccPartitionedGCSupport deviceSupport;

    cudaOccPartitionedGlobalCachingModeSupport(&deviceSupport, properties);

    return (deviceSupport == PARTITIONED_GC_NOT_SUPPORTED)
        ? PARTITIONED_GC_OFF
        : attributes->partitionedGCConfig;
}
|
| 1179 |
+
|
| 1180 |
+
// Warp limit
|
| 1181 |
+
//
|
| 1182 |
+
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit(
|
| 1183 |
+
int *limit,
|
| 1184 |
+
cudaOccPartitionedGCConfig gcConfig,
|
| 1185 |
+
const cudaOccDeviceProp *properties,
|
| 1186 |
+
const cudaOccFuncAttributes *attributes,
|
| 1187 |
+
int blockSize)
|
| 1188 |
+
{
|
| 1189 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1190 |
+
int maxWarpsPerSm;
|
| 1191 |
+
int warpsAllocatedPerCTA;
|
| 1192 |
+
int maxBlocks;
|
| 1193 |
+
(void)attributes; // silence unused-variable warning
|
| 1194 |
+
|
| 1195 |
+
if (blockSize > properties->maxThreadsPerBlock) {
|
| 1196 |
+
maxBlocks = 0;
|
| 1197 |
+
}
|
| 1198 |
+
else {
|
| 1199 |
+
maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize;
|
| 1200 |
+
warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
|
| 1201 |
+
maxBlocks = 0;
|
| 1202 |
+
|
| 1203 |
+
if (gcConfig != PARTITIONED_GC_OFF) {
|
| 1204 |
+
int maxBlocksPerSmPartition;
|
| 1205 |
+
int maxWarpsPerSmPartition;
|
| 1206 |
+
|
| 1207 |
+
// If partitioned global caching is on, then a CTA can only use a SM
|
| 1208 |
+
// partition (a half SM), and thus a half of the warp slots
|
| 1209 |
+
// available per SM
|
| 1210 |
+
//
|
| 1211 |
+
maxWarpsPerSmPartition = maxWarpsPerSm / 2;
|
| 1212 |
+
maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA;
|
| 1213 |
+
maxBlocks = maxBlocksPerSmPartition * 2;
|
| 1214 |
+
}
|
| 1215 |
+
// On hardware that supports partitioned global caching, each half SM is
|
| 1216 |
+
// guaranteed to support at least 32 warps (maximum number of warps of a
|
| 1217 |
+
// CTA), so caching will not cause 0 occupancy due to insufficient warp
|
| 1218 |
+
// allocation slots.
|
| 1219 |
+
//
|
| 1220 |
+
else {
|
| 1221 |
+
maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA;
|
| 1222 |
+
}
|
| 1223 |
+
}
|
| 1224 |
+
|
| 1225 |
+
*limit = maxBlocks;
|
| 1226 |
+
|
| 1227 |
+
return status;
|
| 1228 |
+
}
|
| 1229 |
+
|
| 1230 |
+
// Shared memory limit
|
| 1231 |
+
//
|
| 1232 |
+
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
|
| 1233 |
+
int *limit,
|
| 1234 |
+
cudaOccResult *result,
|
| 1235 |
+
const cudaOccDeviceProp *properties,
|
| 1236 |
+
const cudaOccFuncAttributes *attributes,
|
| 1237 |
+
const cudaOccDeviceState *state,
|
| 1238 |
+
int blockSize,
|
| 1239 |
+
size_t dynamicSmemSize)
|
| 1240 |
+
{
|
| 1241 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1242 |
+
int allocationGranularity;
|
| 1243 |
+
size_t userSmemPreference = 0;
|
| 1244 |
+
size_t totalSmemUsagePerCTA;
|
| 1245 |
+
size_t maxSmemUsagePerCTA;
|
| 1246 |
+
size_t smemAllocatedPerCTA;
|
| 1247 |
+
size_t staticSmemSize;
|
| 1248 |
+
size_t sharedMemPerMultiprocessor;
|
| 1249 |
+
size_t smemLimitPerCTA;
|
| 1250 |
+
int maxBlocks;
|
| 1251 |
+
int dynamicSmemSizeExceeded = 0;
|
| 1252 |
+
int totalSmemSizeExceeded = 0;
|
| 1253 |
+
(void)blockSize; // silence unused-variable warning
|
| 1254 |
+
|
| 1255 |
+
status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
|
| 1256 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1257 |
+
return status;
|
| 1258 |
+
}
|
| 1259 |
+
|
| 1260 |
+
// Obtain the user preferred shared memory size. This setting is ignored if
|
| 1261 |
+
// user requests more shared memory than preferred.
|
| 1262 |
+
//
|
| 1263 |
+
status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
|
| 1264 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1265 |
+
return status;
|
| 1266 |
+
}
|
| 1267 |
+
|
| 1268 |
+
staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;
|
| 1269 |
+
totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
|
| 1270 |
+
smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);
|
| 1271 |
+
|
| 1272 |
+
maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;
|
| 1273 |
+
|
| 1274 |
+
dynamicSmemSizeExceeded = 0;
|
| 1275 |
+
totalSmemSizeExceeded = 0;
|
| 1276 |
+
|
| 1277 |
+
// Obtain the user set maximum dynamic size if it exists
|
| 1278 |
+
// If so, the current launch dynamic shared memory must not
|
| 1279 |
+
// exceed the set limit
|
| 1280 |
+
if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
|
| 1281 |
+
dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
|
| 1282 |
+
dynamicSmemSizeExceeded = 1;
|
| 1283 |
+
}
|
| 1284 |
+
|
| 1285 |
+
status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
|
| 1286 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1287 |
+
return status;
|
| 1288 |
+
}
|
| 1289 |
+
|
| 1290 |
+
if (smemAllocatedPerCTA > smemLimitPerCTA) {
|
| 1291 |
+
totalSmemSizeExceeded = 1;
|
| 1292 |
+
}
|
| 1293 |
+
|
| 1294 |
+
if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
|
| 1295 |
+
maxBlocks = 0;
|
| 1296 |
+
}
|
| 1297 |
+
else {
|
| 1298 |
+
// User requested shared memory limit is used as long as it is greater
|
| 1299 |
+
// than the total shared memory used per CTA, i.e. as long as at least
|
| 1300 |
+
// one CTA can be launched.
|
| 1301 |
+
if (userSmemPreference >= smemAllocatedPerCTA) {
|
| 1302 |
+
sharedMemPerMultiprocessor = userSmemPreference;
|
| 1303 |
+
}
|
| 1304 |
+
else {
|
| 1305 |
+
// On Volta+, user requested shared memory will limit occupancy
|
| 1306 |
+
// if it's less than shared memory per CTA. Otherwise, the
|
| 1307 |
+
// maximum shared memory limit is used.
|
| 1308 |
+
if (properties->computeMajor >= 7) {
|
| 1309 |
+
sharedMemPerMultiprocessor = smemAllocatedPerCTA;
|
| 1310 |
+
status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);
|
| 1311 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1312 |
+
return status;
|
| 1313 |
+
}
|
| 1314 |
+
}
|
| 1315 |
+
else {
|
| 1316 |
+
sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
|
| 1317 |
+
}
|
| 1318 |
+
}
|
| 1319 |
+
|
| 1320 |
+
if (smemAllocatedPerCTA > 0) {
|
| 1321 |
+
maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
|
| 1322 |
+
}
|
| 1323 |
+
else {
|
| 1324 |
+
maxBlocks = INT_MAX;
|
| 1325 |
+
}
|
| 1326 |
+
}
|
| 1327 |
+
|
| 1328 |
+
result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;
|
| 1329 |
+
|
| 1330 |
+
*limit = maxBlocks;
|
| 1331 |
+
|
| 1332 |
+
return status;
|
| 1333 |
+
}
|
| 1334 |
+
|
| 1335 |
+
static __OCC_INLINE
|
| 1336 |
+
cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
|
| 1337 |
+
int *limit,
|
| 1338 |
+
cudaOccPartitionedGCConfig *gcConfig,
|
| 1339 |
+
cudaOccResult *result,
|
| 1340 |
+
const cudaOccDeviceProp *properties,
|
| 1341 |
+
const cudaOccFuncAttributes *attributes,
|
| 1342 |
+
int blockSize)
|
| 1343 |
+
{
|
| 1344 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1345 |
+
int allocationGranularity;
|
| 1346 |
+
int warpsAllocatedPerCTA;
|
| 1347 |
+
int regsAllocatedPerCTA;
|
| 1348 |
+
int regsAssumedPerCTA;
|
| 1349 |
+
int regsPerWarp;
|
| 1350 |
+
int regsAllocatedPerWarp;
|
| 1351 |
+
int numSubPartitions;
|
| 1352 |
+
int numRegsPerSubPartition;
|
| 1353 |
+
int numWarpsPerSubPartition;
|
| 1354 |
+
int numWarpsPerSM;
|
| 1355 |
+
int maxBlocks;
|
| 1356 |
+
int maxRegsPerThread;
|
| 1357 |
+
|
| 1358 |
+
status = cudaOccRegAllocationGranularity(
|
| 1359 |
+
&allocationGranularity,
|
| 1360 |
+
properties);
|
| 1361 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1362 |
+
return status;
|
| 1363 |
+
}
|
| 1364 |
+
|
| 1365 |
+
status = cudaOccRegAllocationMaxPerThread(
|
| 1366 |
+
&maxRegsPerThread,
|
| 1367 |
+
properties);
|
| 1368 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1369 |
+
return status;
|
| 1370 |
+
}
|
| 1371 |
+
|
| 1372 |
+
status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
|
| 1373 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1374 |
+
return status;
|
| 1375 |
+
}
|
| 1376 |
+
|
| 1377 |
+
warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
|
| 1378 |
+
|
| 1379 |
+
// GPUs of compute capability 2.x and higher allocate registers to warps
|
| 1380 |
+
//
|
| 1381 |
+
// Number of regs per warp is regs per thread x warp size, rounded up to
|
| 1382 |
+
// register allocation granularity
|
| 1383 |
+
//
|
| 1384 |
+
regsPerWarp = attributes->numRegs * properties->warpSize;
|
| 1385 |
+
regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
|
| 1386 |
+
regsAllocatedPerCTA = regsAllocatedPerWarp * warpsAllocatedPerCTA;
|
| 1387 |
+
|
| 1388 |
+
// Hardware verifies if a launch fits the per-CTA register limit. For
|
| 1389 |
+
// historical reasons, the verification logic assumes register
|
| 1390 |
+
// allocations are made to all partitions simultaneously. Therefore, to
|
| 1391 |
+
// simulate the hardware check, the warp allocation needs to be rounded
|
| 1392 |
+
// up to the number of partitions.
|
| 1393 |
+
//
|
| 1394 |
+
regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);
|
| 1395 |
+
|
| 1396 |
+
if (properties->regsPerBlock < regsAssumedPerCTA || // Hardware check
|
| 1397 |
+
properties->regsPerBlock < regsAllocatedPerCTA || // Software check
|
| 1398 |
+
attributes->numRegs > maxRegsPerThread) { // Per thread limit check
|
| 1399 |
+
maxBlocks = 0;
|
| 1400 |
+
}
|
| 1401 |
+
else {
|
| 1402 |
+
if (regsAllocatedPerWarp > 0) {
|
| 1403 |
+
// Registers are allocated in each sub-partition. The max number
|
| 1404 |
+
// of warps that can fit on an SM is equal to the max number of
|
| 1405 |
+
// warps per sub-partition x number of sub-partitions.
|
| 1406 |
+
//
|
| 1407 |
+
numRegsPerSubPartition = properties->regsPerMultiprocessor / numSubPartitions;
|
| 1408 |
+
numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;
|
| 1409 |
+
|
| 1410 |
+
maxBlocks = 0;
|
| 1411 |
+
|
| 1412 |
+
if (*gcConfig != PARTITIONED_GC_OFF) {
|
| 1413 |
+
int numSubPartitionsPerSmPartition;
|
| 1414 |
+
int numWarpsPerSmPartition;
|
| 1415 |
+
int maxBlocksPerSmPartition;
|
| 1416 |
+
|
| 1417 |
+
// If partitioned global caching is on, then a CTA can only
|
| 1418 |
+
// use a half SM, and thus a half of the registers available
|
| 1419 |
+
// per SM
|
| 1420 |
+
//
|
| 1421 |
+
numSubPartitionsPerSmPartition = numSubPartitions / 2;
|
| 1422 |
+
numWarpsPerSmPartition = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
|
| 1423 |
+
maxBlocksPerSmPartition = numWarpsPerSmPartition / warpsAllocatedPerCTA;
|
| 1424 |
+
maxBlocks = maxBlocksPerSmPartition * 2;
|
| 1425 |
+
}
|
| 1426 |
+
|
| 1427 |
+
// Try again if partitioned global caching is not enabled, or if
|
| 1428 |
+
// the CTA cannot fit on the SM with caching on (maxBlocks == 0). In the latter
|
| 1429 |
+
// case, the device will automatically turn off caching, except
|
| 1430 |
+
// if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate
|
| 1431 |
+
// occupancy and launch configuration.
|
| 1432 |
+
//
|
| 1433 |
+
if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
|
| 1434 |
+
// In case *gcConfig was PARTITIONED_GC_ON flip it OFF since
|
| 1435 |
+
// this is what it will be if we spread CTA across partitions.
|
| 1436 |
+
//
|
| 1437 |
+
*gcConfig = PARTITIONED_GC_OFF;
|
| 1438 |
+
numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
|
| 1439 |
+
maxBlocks = numWarpsPerSM / warpsAllocatedPerCTA;
|
| 1440 |
+
}
|
| 1441 |
+
}
|
| 1442 |
+
else {
|
| 1443 |
+
maxBlocks = INT_MAX;
|
| 1444 |
+
}
|
| 1445 |
+
}
|
| 1446 |
+
|
| 1447 |
+
|
| 1448 |
+
result->allocatedRegistersPerBlock = regsAllocatedPerCTA;
|
| 1449 |
+
|
| 1450 |
+
*limit = maxBlocks;
|
| 1451 |
+
|
| 1452 |
+
return status;
|
| 1453 |
+
}
|
| 1454 |
+
|
| 1455 |
+
// Barrier limit
|
| 1456 |
+
//
|
| 1457 |
+
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit(
|
| 1458 |
+
int *limit,
|
| 1459 |
+
int ctaLimitBlocks,
|
| 1460 |
+
const cudaOccFuncAttributes *attributes)
|
| 1461 |
+
{
|
| 1462 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1463 |
+
int numBarriersAvailable = ctaLimitBlocks * 2;
|
| 1464 |
+
int numBarriersUsed = attributes->numBlockBarriers;
|
| 1465 |
+
int maxBlocks = INT_MAX;
|
| 1466 |
+
|
| 1467 |
+
if (numBarriersUsed) {
|
| 1468 |
+
maxBlocks = numBarriersAvailable / numBarriersUsed;
|
| 1469 |
+
}
|
| 1470 |
+
|
| 1471 |
+
*limit = maxBlocks;
|
| 1472 |
+
|
| 1473 |
+
return status;
|
| 1474 |
+
}
|
| 1475 |
+
|
| 1476 |
+
///////////////////////////////////
|
| 1477 |
+
// API Implementations //
|
| 1478 |
+
///////////////////////////////////
|
| 1479 |
+
|
| 1480 |
+
static __OCC_INLINE
|
| 1481 |
+
cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
|
| 1482 |
+
cudaOccResult *result,
|
| 1483 |
+
const cudaOccDeviceProp *properties,
|
| 1484 |
+
const cudaOccFuncAttributes *attributes,
|
| 1485 |
+
const cudaOccDeviceState *state,
|
| 1486 |
+
int blockSize,
|
| 1487 |
+
size_t dynamicSmemSize)
|
| 1488 |
+
{
|
| 1489 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1490 |
+
int ctaLimitWarps = 0;
|
| 1491 |
+
int ctaLimitBlocks = 0;
|
| 1492 |
+
int ctaLimitSMem = 0;
|
| 1493 |
+
int ctaLimitRegs = 0;
|
| 1494 |
+
int ctaLimitBars = 0;
|
| 1495 |
+
int ctaLimit = 0;
|
| 1496 |
+
unsigned int limitingFactors = 0;
|
| 1497 |
+
|
| 1498 |
+
cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;
|
| 1499 |
+
|
| 1500 |
+
if (!result || !properties || !attributes || !state || blockSize <= 0) {
|
| 1501 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 1502 |
+
}
|
| 1503 |
+
|
| 1504 |
+
///////////////////////////
|
| 1505 |
+
// Check user input
|
| 1506 |
+
///////////////////////////
|
| 1507 |
+
|
| 1508 |
+
status = cudaOccInputCheck(properties, attributes, state);
|
| 1509 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1510 |
+
return status;
|
| 1511 |
+
}
|
| 1512 |
+
|
| 1513 |
+
///////////////////////////
|
| 1514 |
+
// Initialization
|
| 1515 |
+
///////////////////////////
|
| 1516 |
+
|
| 1517 |
+
gcConfig = cudaOccPartitionedGCExpected(properties, attributes);
|
| 1518 |
+
|
| 1519 |
+
///////////////////////////
|
| 1520 |
+
// Compute occupancy
|
| 1521 |
+
///////////////////////////
|
| 1522 |
+
|
| 1523 |
+
// Limits due to registers/SM
|
| 1524 |
+
// Also compute if partitioned global caching has to be turned off
|
| 1525 |
+
//
|
| 1526 |
+
status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);
|
| 1527 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1528 |
+
return status;
|
| 1529 |
+
}
|
| 1530 |
+
|
| 1531 |
+
// SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4.
|
| 1532 |
+
// As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x.
|
| 1533 |
+
// For forward compatibility within Pascal family, if a function cannot run on GP10x (maxBlock == 0),
|
| 1534 |
+
// we do not let it run on any Pascal processor, even though it may be able to run on GP100.
|
| 1535 |
+
// Therefore, we check the occupancy on GP10x when it can run on GP100
|
| 1536 |
+
//
|
| 1537 |
+
if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
|
| 1538 |
+
cudaOccDeviceProp propertiesGP10x;
|
| 1539 |
+
cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;
|
| 1540 |
+
int ctaLimitRegsGP10x = 0;
|
| 1541 |
+
|
| 1542 |
+
// Set up properties for GP10x
|
| 1543 |
+
memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
|
| 1544 |
+
propertiesGP10x.computeMinor = 1;
|
| 1545 |
+
|
| 1546 |
+
status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);
|
| 1547 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1548 |
+
return status;
|
| 1549 |
+
}
|
| 1550 |
+
|
| 1551 |
+
if (ctaLimitRegsGP10x == 0) {
|
| 1552 |
+
ctaLimitRegs = 0;
|
| 1553 |
+
}
|
| 1554 |
+
}
|
| 1555 |
+
|
| 1556 |
+
// Limits due to warps/SM
|
| 1557 |
+
//
|
| 1558 |
+
status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
|
| 1559 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1560 |
+
return status;
|
| 1561 |
+
}
|
| 1562 |
+
|
| 1563 |
+
// Limits due to blocks/SM
|
| 1564 |
+
//
|
| 1565 |
+
status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
|
| 1566 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1567 |
+
return status;
|
| 1568 |
+
}
|
| 1569 |
+
|
| 1570 |
+
// Limits due to shared memory/SM
|
| 1571 |
+
//
|
| 1572 |
+
status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
|
| 1573 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1574 |
+
return status;
|
| 1575 |
+
}
|
| 1576 |
+
|
| 1577 |
+
///////////////////////////
|
| 1578 |
+
// Overall occupancy
|
| 1579 |
+
///////////////////////////
|
| 1580 |
+
|
| 1581 |
+
// Overall limit is min() of limits due to above reasons
|
| 1582 |
+
//
|
| 1583 |
+
ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));
|
| 1584 |
+
|
| 1585 |
+
// Determine occupancy limiting factors
|
| 1586 |
+
//
|
| 1587 |
+
if (ctaLimit == ctaLimitWarps) {
|
| 1588 |
+
limitingFactors |= OCC_LIMIT_WARPS;
|
| 1589 |
+
}
|
| 1590 |
+
if (ctaLimit == ctaLimitRegs) {
|
| 1591 |
+
limitingFactors |= OCC_LIMIT_REGISTERS;
|
| 1592 |
+
}
|
| 1593 |
+
if (ctaLimit == ctaLimitSMem) {
|
| 1594 |
+
limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
|
| 1595 |
+
}
|
| 1596 |
+
if (ctaLimit == ctaLimitBlocks) {
|
| 1597 |
+
limitingFactors |= OCC_LIMIT_BLOCKS;
|
| 1598 |
+
}
|
| 1599 |
+
|
| 1600 |
+
// For Hopper onwards compute the limits to occupancy based on block barrier count
|
| 1601 |
+
//
|
| 1602 |
+
if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
|
| 1603 |
+
// Limits due to barrier/SM
|
| 1604 |
+
//
|
| 1605 |
+
status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, attributes);
|
| 1606 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1607 |
+
return status;
|
| 1608 |
+
}
|
| 1609 |
+
|
| 1610 |
+
// Recompute overall limit based on barrier/SM
|
| 1611 |
+
//
|
| 1612 |
+
ctaLimit = __occMin(ctaLimitBars, ctaLimit);
|
| 1613 |
+
|
| 1614 |
+
// Determine if this is occupancy limiting factor
|
| 1615 |
+
//
|
| 1616 |
+
if (ctaLimit == ctaLimitBars) {
|
| 1617 |
+
limitingFactors |= OCC_LIMIT_BARRIERS;
|
| 1618 |
+
}
|
| 1619 |
+
}
|
| 1620 |
+
else {
|
| 1621 |
+
ctaLimitBars = INT_MAX;
|
| 1622 |
+
}
|
| 1623 |
+
|
| 1624 |
+
// Fill in the return values
|
| 1625 |
+
//
|
| 1626 |
+
result->limitingFactors = limitingFactors;
|
| 1627 |
+
|
| 1628 |
+
result->blockLimitRegs = ctaLimitRegs;
|
| 1629 |
+
result->blockLimitSharedMem = ctaLimitSMem;
|
| 1630 |
+
result->blockLimitWarps = ctaLimitWarps;
|
| 1631 |
+
result->blockLimitBlocks = ctaLimitBlocks;
|
| 1632 |
+
result->blockLimitBarriers = ctaLimitBars;
|
| 1633 |
+
result->partitionedGCConfig = gcConfig;
|
| 1634 |
+
|
| 1635 |
+
// Final occupancy
|
| 1636 |
+
result->activeBlocksPerMultiprocessor = ctaLimit;
|
| 1637 |
+
|
| 1638 |
+
return CUDA_OCC_SUCCESS;
|
| 1639 |
+
}
|
| 1640 |
+
|
| 1641 |
+
static __OCC_INLINE
|
| 1642 |
+
cudaOccError cudaOccAvailableDynamicSMemPerBlock(
|
| 1643 |
+
size_t *bytesAvailable,
|
| 1644 |
+
const cudaOccDeviceProp *properties,
|
| 1645 |
+
const cudaOccFuncAttributes *attributes,
|
| 1646 |
+
const cudaOccDeviceState *state,
|
| 1647 |
+
int numBlocks,
|
| 1648 |
+
int blockSize)
|
| 1649 |
+
{
|
| 1650 |
+
int allocationGranularity;
|
| 1651 |
+
size_t smemLimitPerBlock;
|
| 1652 |
+
size_t smemAvailableForDynamic;
|
| 1653 |
+
size_t userSmemPreference = 0;
|
| 1654 |
+
size_t sharedMemPerMultiprocessor;
|
| 1655 |
+
cudaOccResult result;
|
| 1656 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1657 |
+
|
| 1658 |
+
if (numBlocks <= 0)
|
| 1659 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 1660 |
+
|
| 1661 |
+
// First compute occupancy of potential kernel launch.
|
| 1662 |
+
//
|
| 1663 |
+
status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0);
|
| 1664 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1665 |
+
return status;
|
| 1666 |
+
}
|
| 1667 |
+
// Check if occupancy is achievable given user requested number of blocks.
|
| 1668 |
+
//
|
| 1669 |
+
if (result.activeBlocksPerMultiprocessor < numBlocks) {
|
| 1670 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 1671 |
+
}
|
| 1672 |
+
|
| 1673 |
+
status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
|
| 1674 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1675 |
+
return status;
|
| 1676 |
+
}
|
| 1677 |
+
|
| 1678 |
+
// Return the per block shared memory limit based on function config.
|
| 1679 |
+
//
|
| 1680 |
+
status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor);
|
| 1681 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1682 |
+
return status;
|
| 1683 |
+
}
|
| 1684 |
+
|
| 1685 |
+
// If there is only a single block needed per SM, then the user preference can be ignored and the fully SW
|
| 1686 |
+
// limit is allowed to be used as shared memory otherwise if more than one block is needed, then the user
|
| 1687 |
+
// preference sets the total limit of available shared memory.
|
| 1688 |
+
//
|
| 1689 |
+
cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
|
| 1690 |
+
if (numBlocks == 1) {
|
| 1691 |
+
sharedMemPerMultiprocessor = smemLimitPerBlock;
|
| 1692 |
+
}
|
| 1693 |
+
else {
|
| 1694 |
+
if (!userSmemPreference) {
|
| 1695 |
+
userSmemPreference = 1 ;
|
| 1696 |
+
status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties);
|
| 1697 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1698 |
+
return status;
|
| 1699 |
+
}
|
| 1700 |
+
}
|
| 1701 |
+
sharedMemPerMultiprocessor = userSmemPreference;
|
| 1702 |
+
}
|
| 1703 |
+
|
| 1704 |
+
// Compute total shared memory available per SM
|
| 1705 |
+
//
|
| 1706 |
+
smemAvailableForDynamic = sharedMemPerMultiprocessor / numBlocks;
|
| 1707 |
+
smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity;
|
| 1708 |
+
|
| 1709 |
+
// Cap shared memory
|
| 1710 |
+
//
|
| 1711 |
+
if (smemAvailableForDynamic > smemLimitPerBlock) {
|
| 1712 |
+
smemAvailableForDynamic = smemLimitPerBlock;
|
| 1713 |
+
}
|
| 1714 |
+
|
| 1715 |
+
// Now compute dynamic shared memory size
|
| 1716 |
+
smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes;
|
| 1717 |
+
|
| 1718 |
+
// Cap computed dynamic SM by user requested limit specified via cuFuncSetAttribute()
|
| 1719 |
+
//
|
| 1720 |
+
if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes)
|
| 1721 |
+
smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes;
|
| 1722 |
+
|
| 1723 |
+
*bytesAvailable = smemAvailableForDynamic;
|
| 1724 |
+
return CUDA_OCC_SUCCESS;
|
| 1725 |
+
}
|
| 1726 |
+
|
| 1727 |
+
static __OCC_INLINE
|
| 1728 |
+
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
|
| 1729 |
+
int *minGridSize,
|
| 1730 |
+
int *blockSize,
|
| 1731 |
+
const cudaOccDeviceProp *properties,
|
| 1732 |
+
const cudaOccFuncAttributes *attributes,
|
| 1733 |
+
const cudaOccDeviceState *state,
|
| 1734 |
+
size_t (*blockSizeToDynamicSMemSize)(int),
|
| 1735 |
+
size_t dynamicSMemSize)
|
| 1736 |
+
{
|
| 1737 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1738 |
+
cudaOccResult result;
|
| 1739 |
+
|
| 1740 |
+
// Limits
|
| 1741 |
+
int occupancyLimit;
|
| 1742 |
+
int granularity;
|
| 1743 |
+
int blockSizeLimit;
|
| 1744 |
+
|
| 1745 |
+
// Recorded maximum
|
| 1746 |
+
int maxBlockSize = 0;
|
| 1747 |
+
int numBlocks = 0;
|
| 1748 |
+
int maxOccupancy = 0;
|
| 1749 |
+
|
| 1750 |
+
// Temporary
|
| 1751 |
+
int blockSizeToTryAligned;
|
| 1752 |
+
int blockSizeToTry;
|
| 1753 |
+
int blockSizeLimitAligned;
|
| 1754 |
+
int occupancyInBlocks;
|
| 1755 |
+
int occupancyInThreads;
|
| 1756 |
+
|
| 1757 |
+
///////////////////////////
|
| 1758 |
+
// Check user input
|
| 1759 |
+
///////////////////////////
|
| 1760 |
+
|
| 1761 |
+
if (!minGridSize || !blockSize || !properties || !attributes || !state) {
|
| 1762 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 1763 |
+
}
|
| 1764 |
+
|
| 1765 |
+
status = cudaOccInputCheck(properties, attributes, state);
|
| 1766 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1767 |
+
return status;
|
| 1768 |
+
}
|
| 1769 |
+
|
| 1770 |
+
/////////////////////////////////////////////////////////////////////////////////
|
| 1771 |
+
// Try each block size, and pick the block size with maximum occupancy
|
| 1772 |
+
/////////////////////////////////////////////////////////////////////////////////
|
| 1773 |
+
|
| 1774 |
+
occupancyLimit = properties->maxThreadsPerMultiprocessor;
|
| 1775 |
+
granularity = properties->warpSize;
|
| 1776 |
+
|
| 1777 |
+
blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
|
| 1778 |
+
blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
|
| 1779 |
+
|
| 1780 |
+
for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
|
| 1781 |
+
blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
|
| 1782 |
+
|
| 1783 |
+
// Ignore dynamicSMemSize if the user provides a mapping
|
| 1784 |
+
//
|
| 1785 |
+
if (blockSizeToDynamicSMemSize) {
|
| 1786 |
+
dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry);
|
| 1787 |
+
}
|
| 1788 |
+
|
| 1789 |
+
status = cudaOccMaxActiveBlocksPerMultiprocessor(
|
| 1790 |
+
&result,
|
| 1791 |
+
properties,
|
| 1792 |
+
attributes,
|
| 1793 |
+
state,
|
| 1794 |
+
blockSizeToTry,
|
| 1795 |
+
dynamicSMemSize);
|
| 1796 |
+
|
| 1797 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1798 |
+
return status;
|
| 1799 |
+
}
|
| 1800 |
+
|
| 1801 |
+
occupancyInBlocks = result.activeBlocksPerMultiprocessor;
|
| 1802 |
+
occupancyInThreads = blockSizeToTry * occupancyInBlocks;
|
| 1803 |
+
|
| 1804 |
+
if (occupancyInThreads > maxOccupancy) {
|
| 1805 |
+
maxBlockSize = blockSizeToTry;
|
| 1806 |
+
numBlocks = occupancyInBlocks;
|
| 1807 |
+
maxOccupancy = occupancyInThreads;
|
| 1808 |
+
}
|
| 1809 |
+
|
| 1810 |
+
// Early out if we have reached the maximum
|
| 1811 |
+
//
|
| 1812 |
+
if (occupancyLimit == maxOccupancy) {
|
| 1813 |
+
break;
|
| 1814 |
+
}
|
| 1815 |
+
}
|
| 1816 |
+
|
| 1817 |
+
///////////////////////////
|
| 1818 |
+
// Return best available
|
| 1819 |
+
///////////////////////////
|
| 1820 |
+
|
| 1821 |
+
// Suggested min grid size to achieve a full machine launch
|
| 1822 |
+
//
|
| 1823 |
+
*minGridSize = numBlocks * properties->numSms;
|
| 1824 |
+
*blockSize = maxBlockSize;
|
| 1825 |
+
|
| 1826 |
+
return status;
|
| 1827 |
+
}
|
| 1828 |
+
|
| 1829 |
+
|
| 1830 |
+
#if defined(__cplusplus)
|
| 1831 |
+
|
| 1832 |
+
namespace {
|
| 1833 |
+
|
| 1834 |
+
// C++ convenience overload: same search as the seven-argument version, using a
// fixed dynamic shared memory size for every candidate block size (no
// block-size-to-shared-memory mapping is supplied).
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int                         *minGridSize,
    int                         *blockSize,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state,
    size_t                       dynamicSMemSize)
{
    // A null mapping tells the callee to use dynamicSMemSize as-is.
    size_t (*const noSmemMapping)(int) = NULL;

    return cudaOccMaxPotentialOccupancyBlockSize(
        minGridSize, blockSize,
        properties, attributes, state,
        noSmemMapping,
        dynamicSMemSize);
}
|
| 1852 |
+
|
| 1853 |
+
template <typename UnaryFunction>
|
| 1854 |
+
__OCC_INLINE
|
| 1855 |
+
cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
|
| 1856 |
+
int *minGridSize,
|
| 1857 |
+
int *blockSize,
|
| 1858 |
+
const cudaOccDeviceProp *properties,
|
| 1859 |
+
const cudaOccFuncAttributes *attributes,
|
| 1860 |
+
const cudaOccDeviceState *state,
|
| 1861 |
+
UnaryFunction blockSizeToDynamicSMemSize)
|
| 1862 |
+
{
|
| 1863 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1864 |
+
cudaOccResult result;
|
| 1865 |
+
|
| 1866 |
+
// Limits
|
| 1867 |
+
int occupancyLimit;
|
| 1868 |
+
int granularity;
|
| 1869 |
+
int blockSizeLimit;
|
| 1870 |
+
|
| 1871 |
+
// Recorded maximum
|
| 1872 |
+
int maxBlockSize = 0;
|
| 1873 |
+
int numBlocks = 0;
|
| 1874 |
+
int maxOccupancy = 0;
|
| 1875 |
+
|
| 1876 |
+
// Temporary
|
| 1877 |
+
int blockSizeToTryAligned;
|
| 1878 |
+
int blockSizeToTry;
|
| 1879 |
+
int blockSizeLimitAligned;
|
| 1880 |
+
int occupancyInBlocks;
|
| 1881 |
+
int occupancyInThreads;
|
| 1882 |
+
size_t dynamicSMemSize;
|
| 1883 |
+
|
| 1884 |
+
///////////////////////////
|
| 1885 |
+
// Check user input
|
| 1886 |
+
///////////////////////////
|
| 1887 |
+
|
| 1888 |
+
if (!minGridSize || !blockSize || !properties || !attributes || !state) {
|
| 1889 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 1890 |
+
}
|
| 1891 |
+
|
| 1892 |
+
status = cudaOccInputCheck(properties, attributes, state);
|
| 1893 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1894 |
+
return status;
|
| 1895 |
+
}
|
| 1896 |
+
|
| 1897 |
+
/////////////////////////////////////////////////////////////////////////////////
|
| 1898 |
+
// Try each block size, and pick the block size with maximum occupancy
|
| 1899 |
+
/////////////////////////////////////////////////////////////////////////////////
|
| 1900 |
+
|
| 1901 |
+
occupancyLimit = properties->maxThreadsPerMultiprocessor;
|
| 1902 |
+
granularity = properties->warpSize;
|
| 1903 |
+
blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
|
| 1904 |
+
blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
|
| 1905 |
+
|
| 1906 |
+
for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
|
| 1907 |
+
blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
|
| 1908 |
+
|
| 1909 |
+
dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
|
| 1910 |
+
|
| 1911 |
+
status = cudaOccMaxActiveBlocksPerMultiprocessor(
|
| 1912 |
+
&result,
|
| 1913 |
+
properties,
|
| 1914 |
+
attributes,
|
| 1915 |
+
state,
|
| 1916 |
+
blockSizeToTry,
|
| 1917 |
+
dynamicSMemSize);
|
| 1918 |
+
|
| 1919 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1920 |
+
return status;
|
| 1921 |
+
}
|
| 1922 |
+
|
| 1923 |
+
occupancyInBlocks = result.activeBlocksPerMultiprocessor;
|
| 1924 |
+
|
| 1925 |
+
occupancyInThreads = blockSizeToTry * occupancyInBlocks;
|
| 1926 |
+
|
| 1927 |
+
if (occupancyInThreads > maxOccupancy) {
|
| 1928 |
+
maxBlockSize = blockSizeToTry;
|
| 1929 |
+
numBlocks = occupancyInBlocks;
|
| 1930 |
+
maxOccupancy = occupancyInThreads;
|
| 1931 |
+
}
|
| 1932 |
+
|
| 1933 |
+
// Early out if we have reached the maximum
|
| 1934 |
+
//
|
| 1935 |
+
if (occupancyLimit == maxOccupancy) {
|
| 1936 |
+
break;
|
| 1937 |
+
}
|
| 1938 |
+
}
|
| 1939 |
+
|
| 1940 |
+
///////////////////////////
|
| 1941 |
+
// Return best available
|
| 1942 |
+
///////////////////////////
|
| 1943 |
+
|
| 1944 |
+
// Suggested min grid size to achieve a full machine launch
|
| 1945 |
+
//
|
| 1946 |
+
*minGridSize = numBlocks * properties->numSms;
|
| 1947 |
+
*blockSize = maxBlockSize;
|
| 1948 |
+
|
| 1949 |
+
return status;
|
| 1950 |
+
}
|
| 1951 |
+
|
| 1952 |
+
} // namespace anonymous
|
| 1953 |
+
|
| 1954 |
+
#endif /*__cplusplus */
|
| 1955 |
+
|
| 1956 |
+
#undef __OCC_INLINE
|
| 1957 |
+
|
| 1958 |
+
#endif /*__cuda_occupancy_h__*/
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDA_PIPELINE_H_
|
| 51 |
+
# define _CUDA_PIPELINE_H_
|
| 52 |
+
|
| 53 |
+
# include "cuda_pipeline_primitives.h"
|
| 54 |
+
|
| 55 |
+
# if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
|
| 56 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 57 |
+
-std=c++11 compiler option.
|
| 58 |
+
# endif
|
| 59 |
+
|
| 60 |
+
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
|
| 61 |
+
# include "cuda_awbarrier.h"
|
| 62 |
+
# endif
|
| 63 |
+
|
| 64 |
+
// Integration with libcu++'s cuda::barrier<cuda::thread_scope_block>.
|
| 65 |
+
|
| 66 |
+
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
|
| 67 |
+
# if defined(_LIBCUDACXX_CUDA_ABI_VERSION)
|
| 68 |
+
# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION
|
| 69 |
+
# else
|
| 70 |
+
# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4
|
| 71 |
+
# endif
|
| 72 |
+
|
| 73 |
+
# define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y
|
| 74 |
+
# define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y)
|
| 75 |
+
# define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION)
|
| 76 |
+
|
| 77 |
+
namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE {
|
| 78 |
+
struct __block_scope_barrier_base;
|
| 79 |
+
}}
|
| 80 |
+
|
| 81 |
+
# endif
|
| 82 |
+
|
| 83 |
+
_CUDA_PIPELINE_BEGIN_NAMESPACE
|
| 84 |
+
|
| 85 |
+
template<size_t N, typename T>
|
| 86 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 87 |
+
auto segment(T* ptr) -> T(*)[N];
|
| 88 |
+
|
| 89 |
+
class pipeline {
|
| 90 |
+
public:
|
| 91 |
+
pipeline(const pipeline&) = delete;
|
| 92 |
+
pipeline(pipeline&&) = delete;
|
| 93 |
+
pipeline& operator=(const pipeline&) = delete;
|
| 94 |
+
pipeline& operator=(pipeline&&) = delete;
|
| 95 |
+
|
| 96 |
+
_CUDA_PIPELINE_QUALIFIER pipeline();
|
| 97 |
+
_CUDA_PIPELINE_QUALIFIER size_t commit();
|
| 98 |
+
_CUDA_PIPELINE_QUALIFIER void commit_and_wait();
|
| 99 |
+
_CUDA_PIPELINE_QUALIFIER void wait(size_t batch);
|
| 100 |
+
template<unsigned N>
|
| 101 |
+
_CUDA_PIPELINE_QUALIFIER void wait_prior();
|
| 102 |
+
|
| 103 |
+
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
|
| 104 |
+
_CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier);
|
| 105 |
+
_CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier);
|
| 106 |
+
# endif
|
| 107 |
+
|
| 108 |
+
private:
|
| 109 |
+
size_t current_batch;
|
| 110 |
+
};
|
| 111 |
+
|
| 112 |
+
template<class T>
|
| 113 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 114 |
+
void memcpy_async(T& dst, const T& src, pipeline& pipe);
|
| 115 |
+
|
| 116 |
+
template<class T, size_t DstN, size_t SrcN>
|
| 117 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 118 |
+
void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe);
|
| 119 |
+
|
| 120 |
+
template<size_t N, typename T>
|
| 121 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 122 |
+
auto segment(T* ptr) -> T(*)[N]
|
| 123 |
+
{
|
| 124 |
+
return (T(*)[N])ptr;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
// Constructs a pipeline with no batches committed yet.
_CUDA_PIPELINE_QUALIFIER
pipeline::pipeline()
{
    this->current_batch = 0;
}
|
| 132 |
+
|
| 133 |
+
// Commits the current batch through the internal pipeline_commit() primitive,
// advances the batch counter and returns the index of the batch that was just
// committed (the counter value before the increment).
_CUDA_PIPELINE_QUALIFIER
size_t pipeline::commit()
{
    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();

    const size_t committed_index = this->current_batch;
    this->current_batch = committed_index + 1;
    return committed_index;
}
|
| 139 |
+
|
| 140 |
+
// Commits the current batch and then calls wait_prior<0>().
// The batch index returned by commit() is not needed here.
_CUDA_PIPELINE_QUALIFIER
void pipeline::commit_and_wait()
{
    static_cast<void>(this->commit());
    this->wait_prior<0>();
}
|
| 146 |
+
|
| 147 |
+
// Waits according to how far the batch counter has advanced past `batch`.
//
// The distance (current_batch - batch, or 0 when batch is not behind the
// counter) selects which pipeline_wait_prior<K>() instantiation is called:
// distances 0..7 map to K = 0..7 and anything larger maps to K = 8. A switch is
// needed because K must be a compile-time constant.
//
// NOTE(review): commit() returns the counter value *before* incrementing it, so
// wait(b) issued right after commit() returned b computes a distance of 1, not
// 0. If pipeline_wait_prior<K>() leaves the K most recent batches pending, that
// call would not wait for batch b itself - confirm the intended semantics.
_CUDA_PIPELINE_QUALIFIER
void pipeline::wait(size_t batch)
{
    namespace impl = _CUDA_PIPELINE_INTERNAL_NAMESPACE;

    size_t distance = 0;
    if (this->current_batch > batch) {
        distance = this->current_batch - batch;
    }

    switch (distance) {
    case 0:
        impl::pipeline_wait_prior<0>();
        break;
    case 1:
        impl::pipeline_wait_prior<1>();
        break;
    case 2:
        impl::pipeline_wait_prior<2>();
        break;
    case 3:
        impl::pipeline_wait_prior<3>();
        break;
    case 4:
        impl::pipeline_wait_prior<4>();
        break;
    case 5:
        impl::pipeline_wait_prior<5>();
        break;
    case 6:
        impl::pipeline_wait_prior<6>();
        break;
    case 7:
        impl::pipeline_wait_prior<7>();
        break;
    default:
        // Every distance of 8 or more shares the same instantiation.
        impl::pipeline_wait_prior<8>();
        break;
    }
}
|
| 164 |
+
|
| 165 |
+
template<unsigned N>
|
| 166 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 167 |
+
void pipeline::wait_prior()
|
| 168 |
+
{
|
| 169 |
+
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<N>();
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
|
| 173 |
+
// Passes the address of the awbarrier's `barrier` member to the internal
// pipeline_arrive_on() primitive.
_CUDA_PIPELINE_QUALIFIER
void pipeline::arrive_on(awbarrier& barrier)
{
    auto* const raw_barrier = &barrier.barrier;
    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(raw_barrier);
}
|
| 178 |
+
|
| 179 |
+
// Overload for libcu++ block-scope barriers: the barrier object's address is
// reinterpreted as a uint64_t pointer and handed to the internal
// pipeline_arrive_on() primitive.
// NOTE(review): this relies on the barrier object starting with its 64-bit
// barrier word - only a forward declaration of the type is visible here.
_CUDA_PIPELINE_QUALIFIER
void pipeline::arrive_on(cuda::__block_scope_barrier_base & barrier)
{
    uint64_t* const raw_barrier = reinterpret_cast<uint64_t *>(&barrier);
    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(raw_barrier);
}
|
| 184 |
+
# endif
|
| 185 |
+
|
| 186 |
+
template<class T>
|
| 187 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 188 |
+
void memcpy_async(T& dst, const T& src, pipeline& pipe)
|
| 189 |
+
{
|
| 190 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&src) & (alignof(T) - 1)));
|
| 191 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&dst) & (alignof(T) - 1)));
|
| 192 |
+
|
| 193 |
+
if (__is_trivially_copyable(T)) {
|
| 194 |
+
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed<sizeof(T), alignof(T)>(
|
| 195 |
+
reinterpret_cast<void*>(&dst), reinterpret_cast<const void*>(&src));
|
| 196 |
+
} else {
|
| 197 |
+
dst = src;
|
| 198 |
+
}
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
template<class T, size_t DstN, size_t SrcN>
|
| 202 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 203 |
+
void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe)
|
| 204 |
+
{
|
| 205 |
+
constexpr size_t dst_size = sizeof(*dst);
|
| 206 |
+
constexpr size_t src_size = sizeof(*src);
|
| 207 |
+
static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size.");
|
| 208 |
+
static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size.");
|
| 209 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (dst_size - 1)));
|
| 210 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (dst_size - 1)));
|
| 211 |
+
|
| 212 |
+
if (__is_trivially_copyable(T)) {
|
| 213 |
+
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict<sizeof(*dst), sizeof(*src)>(
|
| 214 |
+
reinterpret_cast<void*>(*dst), reinterpret_cast<const void*>(*src));
|
| 215 |
+
} else {
|
| 216 |
+
for (size_t i = 0; i < DstN; ++i) {
|
| 217 |
+
(*dst)[i] = (i < SrcN) ? (*src)[i] : T();
|
| 218 |
+
}
|
| 219 |
+
}
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
_CUDA_PIPELINE_END_NAMESPACE
|
| 223 |
+
|
| 224 |
+
#endif /* !_CUDA_PIPELINE_H_ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_helpers.h
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDA_PIPELINE_HELPERS_H_
# define _CUDA_PIPELINE_HELPERS_H_

/*
 * Namespace plumbing. Public names live in nvcuda::experimental; the helpers
 * defined in this header live in the nested __pipeline_internal namespace.
 */
# define _CUDA_PIPELINE_NAMESPACE       nvcuda::experimental
# define _CUDA_PIPELINE_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
# define _CUDA_PIPELINE_END_NAMESPACE   } }

# define _CUDA_PIPELINE_INTERNAL_NAMESPACE       _CUDA_PIPELINE_NAMESPACE::__pipeline_internal
# define _CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE _CUDA_PIPELINE_BEGIN_NAMESPACE namespace __pipeline_internal {
# define _CUDA_PIPELINE_END_INTERNAL_NAMESPACE   } _CUDA_PIPELINE_END_NAMESPACE

/*
 * Function qualifiers. Both can be pre-defined by the includer; the defaults
 * make every helper an inline device function.
 */
# ifndef _CUDA_PIPELINE_QUALIFIER
#  define _CUDA_PIPELINE_QUALIFIER inline __device__
# endif
# ifndef _CUDA_PIPELINE_STATIC_QUALIFIER
#  define _CUDA_PIPELINE_STATIC_QUALIFIER static inline __device__
# endif

/*
 * Defined for NVHPC CUDA builds, for passes where __CUDA_ARCH__ is not
 * defined, and for device passes targeting __CUDA_ARCH__ >= 700.
 */
# if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
#  define _CUDA_PIPELINE_ARCH_700_OR_LATER
# endif

/*
 * 1 when compiling device code for __CUDA_ARCH__ >= 800, 0 otherwise
 * (including passes where __CUDA_ARCH__ is undefined and evaluates to 0).
 * Used as the ImplementationChooser template argument further down.
 */
# if (__CUDA_ARCH__ >= 800)
#  define _CUDA_PIPELINE_HAS_ASYNC_COPY 1
# else
#  define _CUDA_PIPELINE_HAS_ASYNC_COPY 0
# endif

/* Upper bound applied to the operand of cp.async.wait_group; overridable. */
# ifndef _CUDA_PIPELINE_MAX_STAGES
#  define _CUDA_PIPELINE_MAX_STAGES 8
# endif

/* C++11 detection (MSVC 1900+ is accepted regardless of __cplusplus). */
# if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
#  define _CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER
# endif

/* Debug mode defaults to on exactly when __CUDACC_DEBUG__ is defined. */
# ifndef _CUDA_PIPELINE_DEBUG
#  if defined(__CUDACC_DEBUG__)
#   define _CUDA_PIPELINE_DEBUG 1
#  else
#   define _CUDA_PIPELINE_DEBUG 0
#  endif
# endif

/*
 * Runtime checks. With debugging enabled and NDEBUG undefined both macros map
 * to assert(); otherwise _CUDA_PIPELINE_ASSERT expands to nothing and
 * _CUDA_PIPELINE_ABORT to __trap().
 * NOTE: the expansions already end in ';'.
 */
# if defined(_CUDA_PIPELINE_DEBUG) && (_CUDA_PIPELINE_DEBUG == 1) && !defined(NDEBUG)
#  if !defined(__CUDACC_RTC__)
#   include <cassert>
#  endif
#  define _CUDA_PIPELINE_ASSERT(x) assert((x));
#  define _CUDA_PIPELINE_ABORT() assert(0);
# else
#  define _CUDA_PIPELINE_ASSERT(x)
#  define _CUDA_PIPELINE_ABORT() __trap();
# endif

/* Compile-time checks; silently dropped when C++11 is not detected. */
# if defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
#  define _CUDA_PIPELINE_STATIC_ASSERT(c, m) static_assert(c, m)
# else
#  define _CUDA_PIPELINE_STATIC_ASSERT(c, m)
# endif

/*
 * Inline-asm constraint letter used for generic pointers: "r" on 32-bit MSVC
 * and on __arm__, "l" everywhere else.
 */
# if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
#  define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "r"
# else
#  define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "l"
# endif

/*
 * Fixed-width integer types. When __CUDACC_RTC__ is defined <stdint.h> is not
 * included, so the types needed by this header are declared directly.
 */
# if defined(__CUDACC_RTC__)
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef uint64_t uintptr_t;
# else
#  include <stdint.h>
# endif

_CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE

/* The copy routines below move data through these types and rely on their sizes. */
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(short) == 2, "Size mismatch for type 'short'");
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int) == 4, "Size mismatch for type 'int'");
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int2) == 8, "Size mismatch for type 'int2'");
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int4) == 16, "Size mismatch for type 'int4'");

/*
 * Declared here and defined outside this header. Its 32-bit result is what the
 * inline PTX below passes as the shared-memory address operand.
 */
extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
|
| 133 |
+
|
| 134 |
+
/*
 * Synchronous counterpart of the async copy: writes CopySize bytes to dst,
 * taking the first SourceSize of them from src and filling the rest with
 * zeros. With SourceSize == 0 nothing is read from src.
 *
 * Template parameters:
 *   CopySize   - bytes written to dst; must be 4, 8 or 16.
 *   SourceSize - bytes read from src; must be <= CopySize.
 *
 * Both pointers are expected to be aligned to CopySize (checked only when
 * _CUDA_PIPELINE_ASSERT is active).
 */
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_QUALIFIER
void pipeline_memcpy_sync(void* __restrict__ dst, const void* __restrict__ src)
{
    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));

    // Width of each individual store: the whole destination when there is
    // nothing to read, the source size when it maps onto a native type, and a
    // single byte for every other source size.
    const bool source_is_native_width =
        (SourceSize == 2) || (SourceSize == 4) || (SourceSize == 8) || (SourceSize == 16);
    const size_t chunk = (SourceSize == 0) ? CopySize
                       : (source_is_native_width ? SourceSize : 1);

    char* const out = static_cast<char*>(dst);
    const char* const in = static_cast<const char*>(src);

    for (size_t offset = 0; offset < CopySize; offset += chunk) {
        // Chunks that start at or beyond SourceSize are zero-filled and never
        // touch the source buffer.
        const bool from_source = (SourceSize != 0) && (offset < SourceSize);

        switch (chunk) {
        case 1: {
            char value = char();
            if (from_source) {
                value = in[offset];
            }
            out[offset] = value;
            break;
        }
        case 2: {
            short value = short();
            if (from_source) {
                value = *reinterpret_cast<const short*>(in + offset);
            }
            *reinterpret_cast<short*>(out + offset) = value;
            break;
        }
        case 4: {
            int value = int();
            if (from_source) {
                value = *reinterpret_cast<const int*>(in + offset);
            }
            *reinterpret_cast<int*>(out + offset) = value;
            break;
        }
        case 8: {
            int2 value = int2();
            if (from_source) {
                value = *reinterpret_cast<const int2*>(in + offset);
            }
            *reinterpret_cast<int2*>(out + offset) = value;
            break;
        }
        case 16: {
            int4 value = int4();
            if (from_source) {
                value = *reinterpret_cast<const int4*>(in + offset);
            }
            *reinterpret_cast<int4*>(out + offset) = value;
            break;
        }
        }
    }
}
|
| 177 |
+
|
| 178 |
+
/*
 * Compile-time switch between the hardware cp.async implementation (true) and
 * the synchronous fallback (false). Only the two explicit specializations are
 * defined.
 */
template<bool UseHwAsyncCopy>
struct ImplementationChooser;

/*
 * Hardware path: every operation is a single inline-PTX cp.async instruction.
 */
template<>
struct ImplementationChooser<true> {
    /*
     * Emits the cp.async instruction for one copy. The generic form uses the
     * `.ca` cache operator; the 16-byte partial specialization below uses
     * `.cg` instead. SourceSize is passed as the instruction's source-size
     * operand.
     */
    template<size_t CopySize, size_t SourceSize>
    struct CpAsyncChooser {
        _CUDA_PIPELINE_STATIC_QUALIFIER
        void cp_async(void* __restrict__ dst, const void* __restrict__ src)
        {
            asm volatile ("cp.async.ca.shared.global [%0], [%1], %2, %3;"
                :
                : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(CopySize),
                  "n"(SourceSize)
                : "memory");
        }
    };

    /* 16-byte copies: same operands, `.cg` cache operator. */
    template<size_t SourceSize>
    struct CpAsyncChooser<16, SourceSize> {
        _CUDA_PIPELINE_STATIC_QUALIFIER
        void cp_async(void* __restrict__ dst, const void* __restrict__ src)
        {
            asm volatile ("cp.async.cg.shared.global [%0], [%1], %2, %3;"
                :
                : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(16), "n"(SourceSize)
                : "memory");
        }
    };

    /*
     * Starts an asynchronous copy of CopySize bytes (4, 8 or 16) into dst,
     * with SourceSize (<= CopySize) bytes taken from src.
     * dst must point to shared memory, src to global memory, and both must be
     * aligned to CopySize (checked only when _CUDA_PIPELINE_ASSERT is active).
     */
    template<size_t CopySize, size_t SourceSize>
    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
    {
        _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
        _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
        _CUDA_PIPELINE_ASSERT(__isShared(dst));
        _CUDA_PIPELINE_ASSERT(__isGlobal(src));
        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));

        CpAsyncChooser<CopySize, SourceSize>::cp_async(dst, src);
    }

    /* Issues cp.async.commit_group. */
    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_commit()
    {
        asm volatile ("cp.async.commit_group;");
    }

    /*
     * Issues cp.async.wait_group with operand min(N, _CUDA_PIPELINE_MAX_STAGES).
     *
     * The "memory" clobber tells the compiler that memory contents may differ
     * after this statement, so ordinary loads from the copy destinations that
     * follow the wait in program order cannot be hoisted above it.
     */
    template<unsigned N>
    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_wait_prior()
    {
        asm volatile ("cp.async.wait_group %0;"
            :
            : "n"(N < _CUDA_PIPELINE_MAX_STAGES ? N : _CUDA_PIPELINE_MAX_STAGES)
            : "memory");
    }

    /*
     * Issues cp.async.mbarrier.arrive on the given barrier object, which must
     * reside in shared memory (checked only when _CUDA_PIPELINE_ASSERT is
     * active).
     */
    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_arrive_on(uint64_t* barrier)
    {
        _CUDA_PIPELINE_ASSERT(__isShared(barrier));

        asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
            :
            : "r"(__nvvm_get_smem_pointer(barrier)));
    }
};
|
| 247 |
+
|
| 248 |
+
/*
 * Fallback path used when _CUDA_PIPELINE_HAS_ASYNC_COPY is 0: the "async"
 * copy is carried out immediately by pipeline_memcpy_sync, and the remaining
 * pipeline operations do nothing.
 */
template<>
struct ImplementationChooser<false> {
    /*
     * Same contract as the hardware variant (CopySize of 4, 8 or 16,
     * SourceSize <= CopySize, shared dst, global src, CopySize alignment), but
     * the data has already been written when the call returns.
     */
    template<size_t CopySize, size_t SourceSize>
    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
    {
        _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
        _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");

        // Address-space checks first, then alignment of both ends.
        _CUDA_PIPELINE_ASSERT(__isShared(dst));
        _CUDA_PIPELINE_ASSERT(__isGlobal(src));
        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));

        pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
    }

    /* Nothing to commit: copies have already completed. */
    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_commit()
    {
        // intentionally empty
    }

    /* Nothing to wait for: copies have already completed. */
    template<unsigned N>
    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_wait_prior()
    {
        // intentionally empty
    }

    /* No-op; the barrier argument is ignored. */
    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_arrive_on(uint64_t* barrier)
    {
        (void)barrier;
    }
};
|
| 280 |
+
|
| 281 |
+
/*
 * Copies CopySize bytes (4, 8 or 16) to shared-memory dst, reading SourceSize
 * (<= CopySize) bytes from global-memory src, through whichever
 * ImplementationChooser specialization _CUDA_PIPELINE_HAS_ASYNC_COPY selects.
 * Both pointers must be aligned to CopySize. The runtime preconditions are
 * only checked when _CUDA_PIPELINE_ASSERT is active.
 */
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_QUALIFIER
void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
{
    typedef ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY> Backend;

    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
    _CUDA_PIPELINE_ASSERT(__isShared(dst));
    _CUDA_PIPELINE_ASSERT(__isGlobal(src));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));

    Backend::pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
}
|
| 294 |
+
|
| 295 |
+
/*
 * Forwards to pipeline_commit() of the ImplementationChooser specialization
 * selected by _CUDA_PIPELINE_HAS_ASYNC_COPY.
 */
_CUDA_PIPELINE_QUALIFIER
void pipeline_commit()
{
    typedef ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY> Backend;
    Backend::pipeline_commit();
}
|
| 300 |
+
|
| 301 |
+
/*
 * Forwards to pipeline_wait_prior<N>() of the ImplementationChooser
 * specialization selected by _CUDA_PIPELINE_HAS_ASYNC_COPY.
 */
template<unsigned N>
_CUDA_PIPELINE_QUALIFIER
void pipeline_wait_prior()
{
    typedef ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY> Backend;
    Backend::pipeline_wait_prior<N>();
}
|
| 307 |
+
|
| 308 |
+
/*
 * Forwards `barrier` to pipeline_arrive_on() of the ImplementationChooser
 * specialization selected by _CUDA_PIPELINE_HAS_ASYNC_COPY.
 */
_CUDA_PIPELINE_QUALIFIER
void pipeline_arrive_on(uint64_t* barrier)
{
    typedef ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY> Backend;
    Backend::pipeline_arrive_on(barrier);
}
|
| 313 |
+
|
| 314 |
+
/*
 * Copies CopySize bytes (4, 8 or 16) from src to dst, reading SourceSize
 * (<= CopySize) bytes from the source. The address spaces are inspected at
 * run time: a global-to-shared copy goes through pipeline_memcpy_async, every
 * other combination through pipeline_memcpy_sync.
 * Both pointers must be aligned to CopySize (checked only when
 * _CUDA_PIPELINE_ASSERT is active).
 */
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_QUALIFIER
void pipeline_copy_strict(void* __restrict__ dst, const void* __restrict__ src)
{
    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));

    const bool global_to_shared = __isGlobal(src) && __isShared(dst);
    if (!global_to_shared) {
        pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
        return;
    }

    pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
}
|
| 329 |
+
|
| 330 |
+
/*
 * Copies CopySize bytes from src to dst, where both pointers are only known
 * to be aligned to Align bytes (checked only when _CUDA_PIPELINE_ASSERT is
 * active).
 *
 * The range is walked front to back in the widest chunks that Align and the
 * remaining byte count allow: 16-, 8- and 4-byte chunks are handed to
 * pipeline_copy_strict, 2-byte and 1-byte remainders are copied with plain
 * loads and stores.
 *
 * Template parameters:
 *   CopySize - total number of bytes to copy.
 *   Align    - alignment guaranteed for both dst and src.
 */
template<size_t CopySize, size_t Align>
_CUDA_PIPELINE_QUALIFIER
void pipeline_copy_relaxed(void* __restrict__ dst, const void* __restrict__ src)
{
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (Align - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (Align - 1)));

    // Moving cursors into the source and destination ranges.
    const char* s = reinterpret_cast<const char*>(src);
    char* d = reinterpret_cast<char*>(dst);
    size_t remaining = CopySize;

    while (remaining) {
        // Every chunk must be copied at the current cursors (d, s). Passing
        // the base pointers (dst, src) would rewrite the first chunk on each
        // iteration and leave the rest of the destination untouched.
        if ((Align >= 16) && (remaining >= 16)) {
            pipeline_copy_strict<16, 16>(d, s);
            d += 16;
            s += 16;
            remaining -= 16;
        } else if ((Align >= 8) && (remaining >= 8)) {
            pipeline_copy_strict<8, 8>(d, s);
            d += 8;
            s += 8;
            remaining -= 8;
        } else if ((Align >= 4) && (remaining >= 4)) {
            pipeline_copy_strict<4, 4>(d, s);
            d += 4;
            s += 4;
            remaining -= 4;
        } else if ((Align >= 2) && (remaining >= 2)) {
            // Below 4 bytes pipeline_copy_strict does not apply (it accepts
            // only 4, 8 or 16), so the tail is copied directly.
            *reinterpret_cast<short*>(d) = *reinterpret_cast<const short*>(s);
            d += 2;
            s += 2;
            remaining -= 2;
        } else {
            *d = *s;
            d += 1;
            s += 1;
            remaining -= 1;
        }
    }
}
|
| 370 |
+
|
| 371 |
+
_CUDA_PIPELINE_END_INTERNAL_NAMESPACE
|
| 372 |
+
|
| 373 |
+
#endif /* !_CUDA_PIPELINE_HELPERS_H_ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_primitives.h
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDA_PIPELINE_PRIMITIVES_H_
|
| 51 |
+
# define _CUDA_PIPELINE_PRIMITIVES_H_
|
| 52 |
+
|
| 53 |
+
# include "cuda_pipeline_helpers.h"
|
| 54 |
+
|
| 55 |
+
/*
 * Runtime-parameter front end for the internal pipeline_memcpy_async template.
 *
 * dst_shared     - destination; must point to shared memory.
 * src_global     - source; must point to global memory.
 * size_and_align - number of bytes written to the destination and required
 *                  alignment of both pointers; must be 4, 8 or 16.
 * zfill          - number of trailing destination bytes that are not read from
 *                  the source (the internal template is instantiated with
 *                  SourceSize = size_and_align - zfill); must be
 *                  <= size_and_align. Defaults to 0.
 *
 * The pointer and range preconditions are only checked when
 * _CUDA_PIPELINE_ASSERT is active. An unsupported size_and_align / zfill
 * combination ends in _CUDA_PIPELINE_ABORT().
 */
_CUDA_PIPELINE_STATIC_QUALIFIER
void __pipeline_memcpy_async(void* __restrict__ dst_shared, const void* __restrict__ src_global, size_t size_and_align,
                             size_t zfill = 0)
{
    _CUDA_PIPELINE_ASSERT(size_and_align == 4 || size_and_align == 8 || size_and_align == 16);
    _CUDA_PIPELINE_ASSERT(zfill <= size_and_align);
    _CUDA_PIPELINE_ASSERT(__isShared(dst_shared));
    _CUDA_PIPELINE_ASSERT(__isGlobal(src_global));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst_shared) & (size_and_align - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src_global) & (size_and_align - 1)));

    // The internal copy takes its sizes as template arguments, so each valid
    // (size, zfill) pair needs its own call site. The helper macro expands to
    // one `case` label that forwards to the <SIZE, SIZE - ZFILL> instantiation
    // and returns.
# define _CUDA_PIPELINE_ZFILL_CASE(SIZE, ZFILL) \
    case ZFILL: \
        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<SIZE, SIZE - ZFILL>(dst_shared, src_global); \
        return

    if (size_and_align == 16) {
        switch (zfill) {
            _CUDA_PIPELINE_ZFILL_CASE(16, 0);
            _CUDA_PIPELINE_ZFILL_CASE(16, 1);
            _CUDA_PIPELINE_ZFILL_CASE(16, 2);
            _CUDA_PIPELINE_ZFILL_CASE(16, 3);
            _CUDA_PIPELINE_ZFILL_CASE(16, 4);
            _CUDA_PIPELINE_ZFILL_CASE(16, 5);
            _CUDA_PIPELINE_ZFILL_CASE(16, 6);
            _CUDA_PIPELINE_ZFILL_CASE(16, 7);
            _CUDA_PIPELINE_ZFILL_CASE(16, 8);
            _CUDA_PIPELINE_ZFILL_CASE(16, 9);
            _CUDA_PIPELINE_ZFILL_CASE(16, 10);
            _CUDA_PIPELINE_ZFILL_CASE(16, 11);
            _CUDA_PIPELINE_ZFILL_CASE(16, 12);
            _CUDA_PIPELINE_ZFILL_CASE(16, 13);
            _CUDA_PIPELINE_ZFILL_CASE(16, 14);
            _CUDA_PIPELINE_ZFILL_CASE(16, 15);
            _CUDA_PIPELINE_ZFILL_CASE(16, 16);
            default: _CUDA_PIPELINE_ABORT(); return;
        }
    } else if (size_and_align == 8) {
        switch (zfill) {
            _CUDA_PIPELINE_ZFILL_CASE(8, 0);
            _CUDA_PIPELINE_ZFILL_CASE(8, 1);
            _CUDA_PIPELINE_ZFILL_CASE(8, 2);
            _CUDA_PIPELINE_ZFILL_CASE(8, 3);
            _CUDA_PIPELINE_ZFILL_CASE(8, 4);
            _CUDA_PIPELINE_ZFILL_CASE(8, 5);
            _CUDA_PIPELINE_ZFILL_CASE(8, 6);
            _CUDA_PIPELINE_ZFILL_CASE(8, 7);
            _CUDA_PIPELINE_ZFILL_CASE(8, 8);
            default: _CUDA_PIPELINE_ABORT(); return;
        }
    } else if (size_and_align == 4) {
        switch (zfill) {
            _CUDA_PIPELINE_ZFILL_CASE(4, 0);
            _CUDA_PIPELINE_ZFILL_CASE(4, 1);
            _CUDA_PIPELINE_ZFILL_CASE(4, 2);
            _CUDA_PIPELINE_ZFILL_CASE(4, 3);
            _CUDA_PIPELINE_ZFILL_CASE(4, 4);
            default: _CUDA_PIPELINE_ABORT(); return;
        }
    } else {
        _CUDA_PIPELINE_ABORT();
        return;
    }

# undef _CUDA_PIPELINE_ZFILL_CASE
}
|
| 115 |
+
|
| 116 |
+
/*
 * Public entry point that forwards to the internal pipeline_commit().
 */
_CUDA_PIPELINE_STATIC_QUALIFIER
void __pipeline_commit()
{
    namespace pipeline_impl = _CUDA_PIPELINE_INTERNAL_NAMESPACE;
    pipeline_impl::pipeline_commit();
}
|
| 121 |
+
|
| 122 |
+
/*
 * Runtime-parameter front end for the internal pipeline_wait_prior<N>()
 * template. Values 0 through 7 select the matching instantiation; every
 * larger value is served by the <8> instantiation.
 */
_CUDA_PIPELINE_STATIC_QUALIFIER
void __pipeline_wait_prior(size_t prior)
{
    namespace pipeline_impl = _CUDA_PIPELINE_INTERNAL_NAMESPACE;

    // Everything above 7 shares the <8> instantiation.
    if (prior > 7) {
        pipeline_impl::pipeline_wait_prior<8>();
        return;
    }

    switch (prior) {
    case 0:
        pipeline_impl::pipeline_wait_prior<0>();
        break;
    case 1:
        pipeline_impl::pipeline_wait_prior<1>();
        break;
    case 2:
        pipeline_impl::pipeline_wait_prior<2>();
        break;
    case 3:
        pipeline_impl::pipeline_wait_prior<3>();
        break;
    case 4:
        pipeline_impl::pipeline_wait_prior<4>();
        break;
    case 5:
        pipeline_impl::pipeline_wait_prior<5>();
        break;
    case 6:
        pipeline_impl::pipeline_wait_prior<6>();
        break;
    case 7:
        pipeline_impl::pipeline_wait_prior<7>();
        break;
    }
}
|
| 137 |
+
|
| 138 |
+
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
|
| 139 |
+
# include "cuda_awbarrier_primitives.h"
|
| 140 |
+
|
| 141 |
+
/*
 * Public entry point that forwards `barrier` to the internal
 * pipeline_arrive_on(). Only declared when _CUDA_PIPELINE_ARCH_700_OR_LATER
 * is defined.
 */
_CUDA_PIPELINE_STATIC_QUALIFIER
void __pipeline_arrive_on(__mbarrier_t* barrier)
{
    namespace pipeline_impl = _CUDA_PIPELINE_INTERNAL_NAMESPACE;
    pipeline_impl::pipeline_arrive_on(barrier);
}
|
| 146 |
+
# endif
|
| 147 |
+
|
| 148 |
+
#endif /* !_CUDA_PIPELINE_PRIMITIVES_H_ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h
ADDED
|
@@ -0,0 +1,2374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2023 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_RUNTIME_H__)
|
| 51 |
+
#define __CUDA_RUNTIME_H__
|
| 52 |
+
|
| 53 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 54 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 55 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__
|
| 56 |
+
#endif
|
| 57 |
+
|
| 58 |
+
#define EXCLUDE_FROM_RTC
|
| 59 |
+
#if defined(__GNUC__)
|
| 60 |
+
#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
|
| 61 |
+
#pragma GCC diagnostic push
|
| 62 |
+
#endif
|
| 63 |
+
#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
|
| 64 |
+
#pragma GCC diagnostic ignored "-Wunused-function"
|
| 65 |
+
#endif
|
| 66 |
+
#elif defined(_MSC_VER)
|
| 67 |
+
#pragma warning(push)
|
| 68 |
+
#pragma warning(disable: 4820)
|
| 69 |
+
#endif
|
| 70 |
+
#ifdef __QNX__
|
| 71 |
+
#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)
|
| 72 |
+
typedef unsigned size_t;
|
| 73 |
+
#endif
|
| 74 |
+
#endif
|
| 75 |
+
#undef EXCLUDE_FROM_RTC
|
| 76 |
+
/*******************************************************************************
|
| 77 |
+
* *
|
| 78 |
+
* *
|
| 79 |
+
* *
|
| 80 |
+
*******************************************************************************/
|
| 81 |
+
|
| 82 |
+
#include "crt/host_config.h"
|
| 83 |
+
|
| 84 |
+
/*******************************************************************************
|
| 85 |
+
* *
|
| 86 |
+
* *
|
| 87 |
+
* *
|
| 88 |
+
*******************************************************************************/
|
| 89 |
+
|
| 90 |
+
#include "builtin_types.h"
|
| 91 |
+
#include "library_types.h"
|
| 92 |
+
#if !defined(__CUDACC_RTC__)
|
| 93 |
+
#define EXCLUDE_FROM_RTC
|
| 94 |
+
#include "channel_descriptor.h"
|
| 95 |
+
#include "cuda_runtime_api.h"
|
| 96 |
+
#include "driver_functions.h"
|
| 97 |
+
#undef EXCLUDE_FROM_RTC
|
| 98 |
+
#endif /* !__CUDACC_RTC__ */
|
| 99 |
+
#include "crt/host_defines.h"
|
| 100 |
+
#ifdef __CUDACC_RTC__
|
| 101 |
+
#include "target"
|
| 102 |
+
#endif /* defined(__CUDACC_RTC__) */
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
#include "vector_functions.h"
|
| 106 |
+
|
| 107 |
+
#if defined(__CUDACC__)
|
| 108 |
+
|
| 109 |
+
#if defined(__CUDACC_RTC__)
|
| 110 |
+
#include "nvrtc_device_runtime.h"
|
| 111 |
+
#include "crt/device_functions.h"
|
| 112 |
+
#include "crt/common_functions.h"
|
| 113 |
+
#include "device_launch_parameters.h"
|
| 114 |
+
|
| 115 |
+
#else /* !__CUDACC_RTC__ */
|
| 116 |
+
#define EXCLUDE_FROM_RTC
|
| 117 |
+
#include "crt/common_functions.h"
|
| 118 |
+
#include "crt/device_functions.h"
|
| 119 |
+
#include "device_launch_parameters.h"
|
| 120 |
+
|
| 121 |
+
#if defined(__CUDACC_EXTENDED_LAMBDA__)
|
| 122 |
+
#include <functional>
|
| 123 |
+
#include <utility>
|
| 124 |
+
struct __device_builtin__ __nv_lambda_preheader_injection { };
|
| 125 |
+
#endif /* defined(__CUDACC_EXTENDED_LAMBDA__) */
|
| 126 |
+
|
| 127 |
+
#undef EXCLUDE_FROM_RTC
|
| 128 |
+
#endif /* __CUDACC_RTC__ */
|
| 129 |
+
|
| 130 |
+
#endif /* __CUDACC__ */
|
| 131 |
+
|
| 132 |
+
/** \cond impl_private */
|
| 133 |
+
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
|
| 134 |
+
#define __CUDA_DEPRECATED
|
| 135 |
+
#elif defined(_MSC_VER)
|
| 136 |
+
#define __CUDA_DEPRECATED __declspec(deprecated)
|
| 137 |
+
#elif defined(__GNUC__)
|
| 138 |
+
#define __CUDA_DEPRECATED __attribute__((deprecated))
|
| 139 |
+
#else
|
| 140 |
+
#define __CUDA_DEPRECATED
|
| 141 |
+
#endif
|
| 142 |
+
/** \endcond impl_private */
|
| 143 |
+
|
| 144 |
+
#define EXCLUDE_FROM_RTC
|
| 145 |
+
#if defined(__cplusplus) && !defined(__CUDACC_RTC__)
|
| 146 |
+
|
| 147 |
+
#if __cplusplus >= 201103L || (defined(_MSC_VER) && (_MSC_VER >= 1900))
|
| 148 |
+
#include <utility>
|
| 149 |
+
#endif
|
| 150 |
+
|
| 151 |
+
/*******************************************************************************
|
| 152 |
+
* *
|
| 153 |
+
* *
|
| 154 |
+
* *
|
| 155 |
+
*******************************************************************************/
|
| 156 |
+
|
| 157 |
+
/**
|
| 158 |
+
* \addtogroup CUDART_HIGHLEVEL
|
| 159 |
+
* @{
|
| 160 |
+
*/
|
| 161 |
+
|
| 162 |
+
/**
|
| 163 |
+
*\brief Launches a device function
|
| 164 |
+
*
|
| 165 |
+
* The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y
|
| 166 |
+
* × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x ×
|
| 167 |
+
* \p blockDim.y × \p blockDim.z) threads.
|
| 168 |
+
*
|
| 169 |
+
* If the kernel has N parameters, \p args should point to an array of N pointers.
|
| 170 |
+
* Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points to the region
|
| 171 |
+
* of memory from which the actual parameter will be copied.
|
| 172 |
+
*
|
| 173 |
+
* \p sharedMem sets the amount of dynamic shared memory that will be available to
|
| 174 |
+
* each thread block.
|
| 175 |
+
*
|
| 176 |
+
* \p stream specifies a stream the invocation is associated to.
|
| 177 |
+
*
|
| 178 |
+
* \param func - Device function symbol
|
| 179 |
+
* \param gridDim - Grid dimensions
|
| 180 |
+
* \param blockDim - Block dimensions
|
| 181 |
+
* \param args - Arguments
|
| 182 |
+
* \param sharedMem - Shared memory (defaults to 0)
|
| 183 |
+
* \param stream - Stream identifier (defaults to NULL)
|
| 184 |
+
*
|
| 185 |
+
* \return
|
| 186 |
+
* ::cudaSuccess,
|
| 187 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 188 |
+
* ::cudaErrorInvalidConfiguration,
|
| 189 |
+
* ::cudaErrorLaunchFailure,
|
| 190 |
+
* ::cudaErrorLaunchTimeout,
|
| 191 |
+
* ::cudaErrorLaunchOutOfResources,
|
| 192 |
+
* ::cudaErrorSharedObjectInitFailed,
|
| 193 |
+
* ::cudaErrorInvalidPtx,
|
| 194 |
+
* ::cudaErrorUnsupportedPtxVersion,
|
| 195 |
+
* ::cudaErrorNoKernelImageForDevice,
|
| 196 |
+
* ::cudaErrorJitCompilerNotFound,
|
| 197 |
+
* ::cudaErrorJitCompilationDisabled
|
| 198 |
+
* \notefnerr
|
| 199 |
+
* \note_async
|
| 200 |
+
* \note_null_stream
|
| 201 |
+
* \note_init_rt
|
| 202 |
+
* \note_callback
|
| 203 |
+
*
|
| 204 |
+
* \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)"
|
| 205 |
+
*/
|
| 206 |
+
template<class T>
|
| 207 |
+
static __inline__ __host__ cudaError_t cudaLaunchKernel(
|
| 208 |
+
const T *func,
|
| 209 |
+
dim3 gridDim,
|
| 210 |
+
dim3 blockDim,
|
| 211 |
+
void **args,
|
| 212 |
+
size_t sharedMem = 0,
|
| 213 |
+
cudaStream_t stream = 0
|
| 214 |
+
)
|
| 215 |
+
{
|
| 216 |
+
/* Typed convenience overload: the kernel pointer is cast to const void * and
 * every other argument is forwarded unchanged to the C API ::cudaLaunchKernel,
 * whose cudaError_t result is returned as-is. */
return ::cudaLaunchKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream);
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
#if __cplusplus >= 201103L || (defined(_MSC_VER) && (_MSC_VER >= 1900)) || defined(__DOXYGEN_ONLY__)
|
| 221 |
+
/**
|
| 222 |
+
* \brief Launches a CUDA function with launch-time configuration
|
| 223 |
+
*
|
| 224 |
+
* Invokes the kernel \p func on \p config->gridDim (\p config->gridDim.x
|
| 225 |
+
* × \p config->gridDim.y × \p config->gridDim.z) grid of blocks.
|
| 226 |
+
* Each block contains \p config->blockDim (\p config->blockDim.x ×
|
| 227 |
+
* \p config->blockDim.y × \p config->blockDim.z) threads.
|
| 228 |
+
*
|
| 229 |
+
* \p config->dynamicSmemBytes sets the amount of dynamic shared memory that
|
| 230 |
+
* will be available to each thread block.
|
| 231 |
+
*
|
| 232 |
+
* \p config->stream specifies a stream the invocation is associated to.
|
| 233 |
+
*
|
| 234 |
+
* Configuration beyond grid and block dimensions, dynamic shared memory size,
|
| 235 |
+
* and stream can be provided with the following two fields of \p config:
|
| 236 |
+
*
|
| 237 |
+
* \p config->attrs is an array of \p config->numAttrs contiguous
|
| 238 |
+
* ::cudaLaunchAttribute elements. The value of this pointer is not considered
|
| 239 |
+
* if \p config->numAttrs is zero. However, in that case, it is recommended to
|
| 240 |
+
* set the pointer to NULL.
|
| 241 |
+
* \p config->numAttrs is the number of attributes populating the first
|
| 242 |
+
* \p config->numAttrs positions of the \p config->attrs array.
|
| 243 |
+
*
|
| 244 |
+
* The kernel arguments should be passed as arguments to this function via the
|
| 245 |
+
* \p args parameter pack.
|
| 246 |
+
*
|
| 247 |
+
* The C API version of this function, \p cudaLaunchKernelExC, is also available
|
| 248 |
+
* for pre-C++11 compilers and for use cases where the ability to pass kernel
|
| 249 |
+
* parameters via void* array is preferable.
|
| 250 |
+
*
|
| 251 |
+
* \param config - Launch configuration
|
| 252 |
+
* \param func - Kernel to launch
|
| 253 |
+
* \param args - Parameter pack of kernel parameters
|
| 254 |
+
*
|
| 255 |
+
* \return
|
| 256 |
+
* ::cudaSuccess,
|
| 257 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 258 |
+
* ::cudaErrorInvalidConfiguration,
|
| 259 |
+
* ::cudaErrorLaunchFailure,
|
| 260 |
+
* ::cudaErrorLaunchTimeout,
|
| 261 |
+
* ::cudaErrorLaunchOutOfResources,
|
| 262 |
+
* ::cudaErrorSharedObjectInitFailed,
|
| 263 |
+
* ::cudaErrorInvalidPtx,
|
| 264 |
+
* ::cudaErrorUnsupportedPtxVersion,
|
| 265 |
+
* ::cudaErrorNoKernelImageForDevice,
|
| 266 |
+
* ::cudaErrorJitCompilerNotFound,
|
| 267 |
+
* ::cudaErrorJitCompilationDisabled
|
| 268 |
+
* \note_null_stream
|
| 269 |
+
* \notefnerr
|
| 270 |
+
* \note_init_rt
|
| 271 |
+
* \note_callback
|
| 272 |
+
*
|
| 273 |
+
* \sa
|
| 274 |
+
* \ref ::cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args) "cudaLaunchKernelEx (C API)",
|
| 275 |
+
* ::cuLaunchKernelEx
|
| 276 |
+
*/
|
| 277 |
+
template<typename... ExpTypes, typename... ActTypes>
|
| 278 |
+
static __inline__ __host__ cudaError_t cudaLaunchKernelEx(
|
| 279 |
+
const cudaLaunchConfig_t *config,
|
| 280 |
+
/* ExpTypes... is deduced from the kernel's own parameter list. */
void (*kernel)(ExpTypes...),
|
| 281 |
+
/* ActTypes... is deduced from what the caller actually passes. */
ActTypes &&... args
|
| 282 |
+
)
|
| 283 |
+
{
|
| 284 |
+
/* Immediately-invoked lambda whose parameters are declared as ExpTypes...:
 * calling it with the forwarded arguments converts each one to the kernel's
 * declared parameter type, by value, before its address is taken. */
return [&](ExpTypes... coercedArgs){
|
| 285 |
+
/* One pointer per kernel parameter, each addressing the lambda's own
 * converted copy of that argument.
 * NOTE(review): for a kernel with no parameters the pack expands to an
 * empty initializer list -- confirm the targeted host compilers accept it. */
void *pArgs[] = { &coercedArgs... };
|
| 286 |
+
/* Hand the type-erased kernel pointer and the pointer array to the C API. */
return ::cudaLaunchKernelExC(config, (const void *)kernel, pArgs);
|
| 287 |
+
}(std::forward<ActTypes>(args)...);
|
| 288 |
+
}
|
| 289 |
+
#endif
|
| 290 |
+
|
| 291 |
+
/**
|
| 292 |
+
*\brief Launches a device function
|
| 293 |
+
*
|
| 294 |
+
* The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y
|
| 295 |
+
* × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x ×
|
| 296 |
+
* \p blockDim.y × \p blockDim.z) threads.
|
| 297 |
+
*
|
| 298 |
+
* The device on which this kernel is invoked must have a non-zero value for
|
| 299 |
+
* the device attribute ::cudaDevAttrCooperativeLaunch.
|
| 300 |
+
*
|
| 301 |
+
* The total number of blocks launched cannot exceed the maximum number of blocks per
|
| 302 |
+
* multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
|
| 303 |
+
* ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
|
| 304 |
+
* as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
|
| 305 |
+
*
|
| 306 |
+
* The kernel cannot make use of CUDA dynamic parallelism.
|
| 307 |
+
*
|
| 308 |
+
* If the kernel has N parameters, \p args should point to an array of N pointers.
|
| 309 |
+
* Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points to the region
|
| 310 |
+
* of memory from which the actual parameter will be copied.
|
| 311 |
+
*
|
| 312 |
+
* \p sharedMem sets the amount of dynamic shared memory that will be available to
|
| 313 |
+
* each thread block.
|
| 314 |
+
*
|
| 315 |
+
* \p stream specifies a stream the invocation is associated to.
|
| 316 |
+
*
|
| 317 |
+
* \param func - Device function symbol
|
| 318 |
+
* \param gridDim - Grid dimensions
|
| 319 |
+
* \param blockDim - Block dimensions
|
| 320 |
+
* \param args - Arguments
|
| 321 |
+
* \param sharedMem - Shared memory (defaults to 0)
|
| 322 |
+
* \param stream - Stream identifier (defaults to NULL)
|
| 323 |
+
*
|
| 324 |
+
* \return
|
| 325 |
+
* ::cudaSuccess,
|
| 326 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 327 |
+
* ::cudaErrorInvalidConfiguration,
|
| 328 |
+
* ::cudaErrorLaunchFailure,
|
| 329 |
+
* ::cudaErrorLaunchTimeout,
|
| 330 |
+
* ::cudaErrorLaunchOutOfResources,
|
| 331 |
+
* ::cudaErrorSharedObjectInitFailed
|
| 332 |
+
* \notefnerr
|
| 333 |
+
* \note_async
|
| 334 |
+
* \note_null_stream
|
| 335 |
+
* \note_init_rt
|
| 336 |
+
* \note_callback
|
| 337 |
+
*
|
| 338 |
+
* \ref ::cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C API)"
|
| 339 |
+
*/
|
| 340 |
+
template<class T>
|
| 341 |
+
static __inline__ __host__ cudaError_t cudaLaunchCooperativeKernel(
|
| 342 |
+
const T *func,
|
| 343 |
+
dim3 gridDim,
|
| 344 |
+
dim3 blockDim,
|
| 345 |
+
void **args,
|
| 346 |
+
size_t sharedMem = 0,
|
| 347 |
+
cudaStream_t stream = 0
|
| 348 |
+
)
|
| 349 |
+
{
|
| 350 |
+
/* Typed convenience overload: the kernel pointer is cast to const void * and
 * every other argument is forwarded unchanged to the C API
 * ::cudaLaunchCooperativeKernel, whose cudaError_t result is returned as-is. */
return ::cudaLaunchCooperativeKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream);
|
| 351 |
+
}
|
| 352 |
+
|
| 353 |
+
/**
|
| 354 |
+
* \brief \hl Creates an event object with the specified flags
|
| 355 |
+
*
|
| 356 |
+
* Creates an event object with the specified flags. Valid flags include:
|
| 357 |
+
* - ::cudaEventDefault: Default event creation flag.
|
| 358 |
+
* - ::cudaEventBlockingSync: Specifies that event should use blocking
|
| 359 |
+
* synchronization. A host thread that uses ::cudaEventSynchronize() to wait
|
| 360 |
+
* on an event created with this flag will block until the event actually
|
| 361 |
+
* completes.
|
| 362 |
+
* - ::cudaEventDisableTiming: Specifies that the created event does not need
|
| 363 |
+
* to record timing data. Events created with this flag specified and
|
| 364 |
+
* the ::cudaEventBlockingSync flag not specified will provide the best
|
| 365 |
+
* performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
|
| 366 |
+
*
|
| 367 |
+
* \param event - Newly created event
|
| 368 |
+
* \param flags - Flags for new event
|
| 369 |
+
*
|
| 370 |
+
* \return
|
| 371 |
+
* ::cudaSuccess,
|
| 372 |
+
* ::cudaErrorInvalidValue,
|
| 373 |
+
* ::cudaErrorLaunchFailure,
|
| 374 |
+
* ::cudaErrorMemoryAllocation
|
| 375 |
+
* \notefnerr
|
| 376 |
+
* \note_init_rt
|
| 377 |
+
* \note_callback
|
| 378 |
+
*
|
| 379 |
+
* \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
|
| 380 |
+
* ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,
|
| 381 |
+
* ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
|
| 382 |
+
* ::cudaStreamWaitEvent
|
| 383 |
+
*/
|
| 384 |
+
static __inline__ __host__ cudaError_t cudaEventCreate(
|
| 385 |
+
cudaEvent_t *event,
|
| 386 |
+
unsigned int flags
|
| 387 |
+
)
|
| 388 |
+
{
|
| 389 |
+
/* Two-argument overload: simply forwards to ::cudaEventCreateWithFlags and
 * returns its result. */
return ::cudaEventCreateWithFlags(event, flags);
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
/**
|
| 393 |
+
* \brief Creates an executable graph from a graph
|
| 394 |
+
*
|
| 395 |
+
* Instantiates \p graph as an executable graph. The graph is validated for any
|
| 396 |
+
* structural constraints or intra-node constraints which were not previously
|
| 397 |
+
* validated. If instantiation is successful, a handle to the instantiated graph
|
| 398 |
+
* is returned in \p pGraphExec.
|
| 399 |
+
*
|
| 400 |
+
* If there are any errors, diagnostic information may be returned in \p pErrorNode and
|
| 401 |
+
* \p pLogBuffer. This is the primary way to inspect instantiation errors. The output
|
| 402 |
+
* will be null terminated unless the diagnostics overflow
|
| 403 |
+
* the buffer. In this case, they will be truncated, and the last byte can be
|
| 404 |
+
* inspected to determine if truncation occurred.
|
| 405 |
+
*
|
| 406 |
+
* \param pGraphExec - Returns instantiated graph
|
| 407 |
+
* \param graph - Graph to instantiate
|
| 408 |
+
* \param pErrorNode - In case of an instantiation error, this may be modified to
|
| 409 |
+
* indicate a node contributing to the error
|
| 410 |
+
* \param pLogBuffer - A character buffer to store diagnostic messages
|
| 411 |
+
* \param bufferSize - Size of the log buffer in bytes
|
| 412 |
+
*
|
| 413 |
+
* \return
|
| 414 |
+
* ::cudaSuccess,
|
| 415 |
+
* ::cudaErrorInvalidValue
|
| 416 |
+
* \note_graph_thread_safety
|
| 417 |
+
* \notefnerr
|
| 418 |
+
* \note_init_rt
|
| 419 |
+
* \note_callback
|
| 420 |
+
*
|
| 421 |
+
* \sa
|
| 422 |
+
* ::cudaGraphInstantiateWithFlags,
|
| 423 |
+
* ::cudaGraphCreate,
|
| 424 |
+
* ::cudaGraphUpload,
|
| 425 |
+
* ::cudaGraphLaunch,
|
| 426 |
+
* ::cudaGraphExecDestroy
|
| 427 |
+
*/
|
| 428 |
+
static __inline__ __host__ cudaError_t cudaGraphInstantiate(
|
| 429 |
+
cudaGraphExec_t *pGraphExec,
|
| 430 |
+
cudaGraph_t graph,
|
| 431 |
+
cudaGraphNode_t *pErrorNode,
|
| 432 |
+
char *pLogBuffer,
|
| 433 |
+
size_t bufferSize
|
| 434 |
+
)
|
| 435 |
+
{
|
| 436 |
+
/* The three diagnostic parameters are accepted only so that callers using this
 * five-argument signature keep compiling; the casts to void mark them as
 * intentionally unused. This wrapper never writes to pErrorNode or pLogBuffer.
 * NOTE(review): the doc comment preceding this function still describes
 * diagnostics being returned through them -- confirm which is intended. */
(void)pErrorNode;
|
| 437 |
+
(void)pLogBuffer;
|
| 438 |
+
(void)bufferSize;
|
| 439 |
+
/* Forward to the three-argument ::cudaGraphInstantiate with flags fixed at 0. */
return ::cudaGraphInstantiate(pGraphExec, graph, 0);
|
| 440 |
+
}
|
| 441 |
+
|
| 442 |
+
/**
|
| 443 |
+
* \brief \hl Allocates page-locked memory on the host
|
| 444 |
+
*
|
| 445 |
+
* Allocates \p size bytes of host memory that is page-locked and accessible
|
| 446 |
+
* to the device. The driver tracks the virtual memory ranges allocated with
|
| 447 |
+
* this function and automatically accelerates calls to functions such as
|
| 448 |
+
* ::cudaMemcpy(). Since the memory can be accessed directly by the device, it
|
| 449 |
+
* can be read or written with much higher bandwidth than pageable memory
|
| 450 |
+
* obtained with functions such as ::malloc(). Allocating excessive amounts of
|
| 451 |
+
* pinned memory may degrade system performance, since it reduces the amount
|
| 452 |
+
* of memory available to the system for paging. As a result, this function is
|
| 453 |
+
* best used sparingly to allocate staging areas for data exchange between host
|
| 454 |
+
* and device.
|
| 455 |
+
*
|
| 456 |
+
* The \p flags parameter enables different options to be specified that affect
|
| 457 |
+
* the allocation, as follows.
|
| 458 |
+
* - ::cudaHostAllocDefault: This flag's value is defined to be 0.
|
| 459 |
+
* - ::cudaHostAllocPortable: The memory returned by this call will be
|
| 460 |
+
* considered as pinned memory by all CUDA contexts, not just the one that
|
| 461 |
+
* performed the allocation.
|
| 462 |
+
* - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space.
|
| 463 |
+
* The device pointer to the memory may be obtained by calling
|
| 464 |
+
* ::cudaHostGetDevicePointer().
|
| 465 |
+
* - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC).
|
| 466 |
+
* WC memory can be transferred across the PCI Express bus more quickly on some
|
| 467 |
+
* system configurations, but cannot be read efficiently by most CPUs. WC
|
| 468 |
+
* memory is a good option for buffers that will be written by the CPU and read
|
| 469 |
+
* by the device via mapped pinned memory or host->device transfers.
|
| 470 |
+
*
|
| 471 |
+
* All of these flags are orthogonal to one another: a developer may allocate
|
| 472 |
+
* memory that is portable, mapped and/or write-combined with no restrictions.
|
| 473 |
+
*
|
| 474 |
+
* ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost
|
| 475 |
+
* flag in order for the ::cudaHostAllocMapped flag to have any effect.
|
| 476 |
+
*
|
| 477 |
+
* The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices
|
| 478 |
+
* that do not support mapped pinned memory. The failure is deferred to
|
| 479 |
+
* ::cudaHostGetDevicePointer() because the memory may be mapped into other
|
| 480 |
+
* CUDA contexts via the ::cudaHostAllocPortable flag.
|
| 481 |
+
*
|
| 482 |
+
* Memory allocated by this function must be freed with ::cudaFreeHost().
|
| 483 |
+
*
|
| 484 |
+
* \param ptr - Pointer to allocated host memory
|
| 485 |
+
* \param size - Requested allocation size in bytes
|
| 486 |
+
* \param flags - Requested properties of allocated memory
|
| 487 |
+
*
|
| 488 |
+
* \return
|
| 489 |
+
* ::cudaSuccess,
|
| 490 |
+
* ::cudaErrorMemoryAllocation
|
| 491 |
+
* \notefnerr
|
| 492 |
+
* \note_init_rt
|
| 493 |
+
* \note_callback
|
| 494 |
+
*
|
| 495 |
+
* \sa ::cudaSetDeviceFlags,
|
| 496 |
+
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
|
| 497 |
+
* ::cudaFreeHost, ::cudaHostAlloc
|
| 498 |
+
*/
|
| 499 |
+
static __inline__ __host__ cudaError_t cudaMallocHost(
|
| 500 |
+
void **ptr,
|
| 501 |
+
size_t size,
|
| 502 |
+
unsigned int flags
|
| 503 |
+
)
|
| 504 |
+
{
|
| 505 |
+
/* Three-argument overload: the allocation is performed by ::cudaHostAlloc,
 * which receives ptr, size and flags unchanged; its result is returned as-is. */
return ::cudaHostAlloc(ptr, size, flags);
|
| 506 |
+
}
|
| 507 |
+
|
| 508 |
+
/* Typed convenience overload of ::cudaHostAlloc.
 *
 * Lets the caller pass the address of a T* directly instead of casting it to
 * void** by hand. \p size and \p flags are forwarded unchanged and the
 * cudaError_t returned by the C API is returned as-is. */
template<class T>
|
| 509 |
+
static __inline__ __host__ cudaError_t cudaHostAlloc(
|
| 510 |
+
T **ptr,
|
| 511 |
+
size_t size,
|
| 512 |
+
unsigned int flags
|
| 513 |
+
)
|
| 514 |
+
{
|
| 515 |
+
/* T** is converted to void** by way of void* before calling the C API. */
return ::cudaHostAlloc((void**)(void*)ptr, size, flags);
|
| 516 |
+
}
|
| 517 |
+
|
| 518 |
+
/* Typed convenience overload of ::cudaHostGetDevicePointer.
 *
 * Lets the caller receive the result in a T* directly instead of casting its
 * address to void** by hand. \p pHost and \p flags are forwarded unchanged and
 * the cudaError_t returned by the C API is returned as-is. */
template<class T>
|
| 519 |
+
static __inline__ __host__ cudaError_t cudaHostGetDevicePointer(
|
| 520 |
+
T **pDevice,
|
| 521 |
+
void *pHost,
|
| 522 |
+
unsigned int flags
|
| 523 |
+
)
|
| 524 |
+
{
|
| 525 |
+
/* T** is converted to void** by way of void* before calling the C API. */
return ::cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags);
|
| 526 |
+
}
|
| 527 |
+
|
| 528 |
+
/**
|
| 529 |
+
* \brief Allocates memory that will be automatically managed by the Unified Memory system
|
| 530 |
+
*
|
| 531 |
+
* Allocates \p size bytes of managed memory on the device and returns in
|
| 532 |
+
* \p *devPtr a pointer to the allocated memory. If the device doesn't support
|
| 533 |
+
* allocating managed memory, ::cudaErrorNotSupported is returned. Support
|
| 534 |
+
* for managed memory can be queried using the device attribute
|
| 535 |
+
* ::cudaDevAttrManagedMemory. The allocated memory is suitably
|
| 536 |
+
* aligned for any kind of variable. The memory is not cleared. If \p size
|
| 537 |
+
* is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
|
| 538 |
+
* is valid on the CPU and on all GPUs in the system that support managed memory.
|
| 539 |
+
* All accesses to this pointer must obey the Unified Memory programming model.
|
| 540 |
+
*
|
| 541 |
+
* \p flags specifies the default stream association for this allocation.
|
| 542 |
+
* \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The
|
| 543 |
+
* default value for \p flags is ::cudaMemAttachGlobal.
|
| 544 |
+
* If ::cudaMemAttachGlobal is specified, then this memory is accessible from
|
| 545 |
+
* any stream on any device. If ::cudaMemAttachHost is specified, then the
|
| 546 |
+
* allocation should not be accessed from devices that have a zero value for the
|
| 547 |
+
* device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to
|
| 548 |
+
* ::cudaStreamAttachMemAsync will be required to enable access on such devices.
|
| 549 |
+
*
|
| 550 |
+
* If the association is later changed via ::cudaStreamAttachMemAsync to
|
| 551 |
+
* a single stream, the default association, as specified during ::cudaMallocManaged,
|
| 552 |
+
* is restored when that stream is destroyed. For __managed__ variables, the
|
| 553 |
+
* default association is always ::cudaMemAttachGlobal. Note that destroying a
|
| 554 |
+
* stream is an asynchronous operation, and as a result, the change to default
|
| 555 |
+
* association won't happen until all work in the stream has completed.
|
| 556 |
+
*
|
| 557 |
+
* Memory allocated with ::cudaMallocManaged should be released with ::cudaFree.
|
| 558 |
+
*
|
| 559 |
+
* Device memory oversubscription is possible for GPUs that have a non-zero value for the
|
| 560 |
+
* device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on
|
| 561 |
+
* such GPUs may be evicted from device memory to host memory at any time by the Unified
|
| 562 |
+
* Memory driver in order to make room for other allocations.
|
| 563 |
+
*
|
| 564 |
+
* In a multi-GPU system where all GPUs have a non-zero value for the device attribute
|
| 565 |
+
* ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this
|
| 566 |
+
* API returns and instead may be populated on access. In such systems, managed memory can
|
| 567 |
+
* migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
|
| 568 |
+
* maintain data locality and prevent excessive page faults to the extent possible. The application
|
| 569 |
+
* can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application
|
| 570 |
+
* can also explicitly migrate memory to a desired processor's memory via
|
| 571 |
+
* ::cudaMemPrefetchAsync.
|
| 572 |
+
*
|
| 573 |
+
* In a multi-GPU system where all of the GPUs have a zero value for the device attribute
|
| 574 |
+
* ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support
|
| 575 |
+
* with each other, the physical storage for managed memory is created on the GPU which is active
|
| 576 |
+
* at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced
|
| 577 |
+
* bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
|
| 578 |
+
* memory among such GPUs.
|
| 579 |
+
*
|
| 580 |
+
* In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
|
| 581 |
+
* where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess
|
| 582 |
+
* is zero for at least one of those GPUs, the location chosen for physical storage of managed
|
| 583 |
+
* memory is system-dependent.
|
| 584 |
+
* - On Linux, the location chosen will be device memory as long as the current set of active
|
| 585 |
+
* contexts are on devices that either have peer-to-peer support with each other or have a
|
| 586 |
+
* non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
|
| 587 |
+
* If there is an active context on a GPU that does not have a non-zero value for that device
|
| 588 |
+
* attribute and it does not have peer-to-peer support with the other devices that have active
|
| 589 |
+
* contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
|
| 590 |
+
* Note that this means that managed memory that is located in device memory is migrated to
|
| 591 |
+
* host memory if a new context is created on a GPU that doesn't have a non-zero value for
|
| 592 |
+
* the device attribute and does not support peer-to-peer with at least one of the other devices
|
| 593 |
+
* that has an active context. This in turn implies that context creation may fail if there is
|
| 594 |
+
* insufficient host memory to migrate all managed allocations.
|
| 595 |
+
* - On Windows, the physical storage is always created in 'zero-copy' or host memory.
|
| 596 |
+
* All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
|
| 597 |
+
* circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
|
| 598 |
+
* restrict CUDA to only use those GPUs that have peer-to-peer support.
|
| 599 |
+
* Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero
|
| 600 |
+
* value to force the driver to always use device memory for physical storage.
|
| 601 |
+
* When this environment variable is set to a non-zero value, all devices used in
|
| 602 |
+
* that process that support managed memory have to be peer-to-peer compatible
|
| 603 |
+
* with each other. The error ::cudaErrorInvalidDevice will be returned if a device
|
| 604 |
+
* that supports managed memory is used and it is not peer-to-peer compatible with
|
| 605 |
+
* any of the other managed memory supporting devices that were previously used in
|
| 606 |
+
* that process, even if ::cudaDeviceReset has been called on those devices. These
|
| 607 |
+
* environment variables are described in the CUDA programming guide under the
|
| 608 |
+
* "CUDA environment variables" section.
|
| 609 |
+
* - On ARM, managed memory is not available on discrete GPUs with Drive PX-2.
|
| 610 |
+
*
|
| 611 |
+
* \param devPtr - Pointer to allocated device memory
|
| 612 |
+
* \param size - Requested allocation size in bytes
|
| 613 |
+
* \param flags - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal)
|
| 614 |
+
*
|
| 615 |
+
* \return
|
| 616 |
+
* ::cudaSuccess,
|
| 617 |
+
* ::cudaErrorMemoryAllocation,
|
| 618 |
+
* ::cudaErrorNotSupported,
|
| 619 |
+
* ::cudaErrorInvalidValue
|
| 620 |
+
* \note_init_rt
|
| 621 |
+
* \note_callback
|
| 622 |
+
*
|
| 623 |
+
* \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
|
| 624 |
+
* ::cudaMalloc3D, ::cudaMalloc3DArray,
|
| 625 |
+
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
|
| 626 |
+
* ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync
|
| 627 |
+
*/
|
| 628 |
+
template<class T>
|
| 629 |
+
static __inline__ __host__ cudaError_t cudaMallocManaged(
|
| 630 |
+
T **devPtr,
|
| 631 |
+
size_t size,
|
| 632 |
+
unsigned int flags = cudaMemAttachGlobal
|
| 633 |
+
)
|
| 634 |
+
{
|
| 635 |
+
return ::cudaMallocManaged((void**)(void*)devPtr, size, flags);
|
| 636 |
+
}
|
| 637 |
+
|
| 638 |
+
/**
|
| 639 |
+
* \brief Advise about the usage of a given memory range.
|
| 640 |
+
*
|
| 641 |
+
* This is an alternate spelling for cudaMemAdvise made available through function overloading.
|
| 642 |
+
*
|
| 643 |
+
* \sa ::cudaMemAdvise,
|
| 644 |
+
* \ref ::cudaMemAdvise(const void* devPtr, size_t count, enum cudaMemoryAdvise advice, struct cudaMemLocation location) "cudaMemAdvise (C API)"
|
| 645 |
+
*/
|
| 646 |
+
template<class T>
|
| 647 |
+
cudaError_t cudaMemAdvise(
|
| 648 |
+
T *devPtr,
|
| 649 |
+
size_t count,
|
| 650 |
+
enum cudaMemoryAdvise advice,
|
| 651 |
+
struct cudaMemLocation location
|
| 652 |
+
)
|
| 653 |
+
{
|
| 654 |
+
return ::cudaMemAdvise_v2((const void *)devPtr, count, advice, location);
|
| 655 |
+
}
|
| 656 |
+
|
| 657 |
+
template<class T>
|
| 658 |
+
static __inline__ __host__ cudaError_t cudaMemPrefetchAsync(
|
| 659 |
+
T *devPtr,
|
| 660 |
+
size_t count,
|
| 661 |
+
struct cudaMemLocation location,
|
| 662 |
+
unsigned int flags,
|
| 663 |
+
cudaStream_t stream = 0
|
| 664 |
+
)
|
| 665 |
+
{
|
| 666 |
+
return ::cudaMemPrefetchAsync_v2((const void *)devPtr, count, location, flags, stream);
|
| 667 |
+
}
|
| 668 |
+
|
| 669 |
+
/**
|
| 670 |
+
* \brief Attach memory to a stream asynchronously
|
| 671 |
+
*
|
| 672 |
+
* Enqueues an operation in \p stream to specify stream association of
|
| 673 |
+
* \p length bytes of memory starting from \p devPtr. This function is a
|
| 674 |
+
* stream-ordered operation, meaning that it is dependent on, and will
|
| 675 |
+
* only take effect when, previous work in stream has completed. Any
|
| 676 |
+
* previous association is automatically replaced.
|
| 677 |
+
*
|
| 678 |
+
* \p devPtr must point to one of the following types of memory:
|
| 679 |
+
* - managed memory declared using the __managed__ keyword or allocated with
|
| 680 |
+
* ::cudaMallocManaged.
|
| 681 |
+
* - a valid host-accessible region of system-allocated pageable memory. This
|
| 682 |
+
* type of memory may only be specified if the device associated with the
|
| 683 |
+
* stream reports a non-zero value for the device attribute
|
| 684 |
+
* ::cudaDevAttrPageableMemoryAccess.
|
| 685 |
+
*
|
| 686 |
+
* For managed allocations, \p length must be either zero or the entire
|
| 687 |
+
* allocation's size. Both indicate that the entire allocation's stream
|
| 688 |
+
* association is being changed. Currently, it is not possible to change stream
|
| 689 |
+
* association for a portion of a managed allocation.
|
| 690 |
+
*
|
| 691 |
+
* For pageable allocations, \p length must be non-zero.
|
| 692 |
+
*
|
| 693 |
+
* The stream association is specified using \p flags which must be
|
| 694 |
+
* one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
|
| 695 |
+
* The default value for \p flags is ::cudaMemAttachSingle.
|
| 696 |
+
* If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
|
| 697 |
+
* by any stream on any device.
|
| 698 |
+
* If the ::cudaMemAttachHost flag is specified, the program makes a guarantee
|
| 699 |
+
* that it won't access the memory on the device from any stream on a device that
|
| 700 |
+
* has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
|
| 701 |
+
* If the ::cudaMemAttachSingle flag is specified and \p stream is associated with
|
| 702 |
+
* a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess,
|
| 703 |
+
* the program makes a guarantee that it will only access the memory on the device
|
| 704 |
+
* from \p stream. It is illegal to attach singly to the NULL stream, because the
|
| 705 |
+
* NULL stream is a virtual global stream and not a specific stream. An error will
|
| 706 |
+
* be returned in this case.
|
| 707 |
+
*
|
| 708 |
+
* When memory is associated with a single stream, the Unified Memory system will
|
| 709 |
+
* allow CPU access to this memory region so long as all operations in \p stream
|
| 710 |
+
* have completed, regardless of whether other streams are active. In effect,
|
| 711 |
+
* this constrains exclusive ownership of the managed memory region by
|
| 712 |
+
* an active GPU to per-stream activity instead of whole-GPU activity.
|
| 713 |
+
*
|
| 714 |
+
* Accessing memory on the device from streams that are not associated with
|
| 715 |
+
* it will produce undefined results. No error checking is performed by the
|
| 716 |
+
* Unified Memory system to ensure that kernels launched into other streams
|
| 717 |
+
* do not access this region.
|
| 718 |
+
*
|
| 719 |
+
* It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync
|
| 720 |
+
* via events, synchronization or other means to ensure legal access to memory
|
| 721 |
+
* at all times. Data visibility and coherency will be changed appropriately
|
| 722 |
+
* for all kernels which follow a stream-association change.
|
| 723 |
+
*
|
| 724 |
+
* If \p stream is destroyed while data is associated with it, the association is
|
| 725 |
+
* removed and the association reverts to the default visibility of the allocation
|
| 726 |
+
* as specified at ::cudaMallocManaged. For __managed__ variables, the default
|
| 727 |
+
* association is always ::cudaMemAttachGlobal. Note that destroying a stream is an
|
| 728 |
+
* asynchronous operation, and as a result, the change to default association won't
|
| 729 |
+
* happen until all work in the stream has completed.
|
| 730 |
+
*
|
| 731 |
+
* \param stream - Stream in which to enqueue the attach operation
|
| 732 |
+
* \param devPtr - Pointer to memory (must be a pointer to managed memory or
|
| 733 |
+
* to a valid host-accessible region of system-allocated
|
| 734 |
+
* memory)
|
| 735 |
+
* \param length - Length of memory (defaults to zero)
|
| 736 |
+
* \param flags - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle)
|
| 737 |
+
*
|
| 738 |
+
* \return
|
| 739 |
+
* ::cudaSuccess,
|
| 740 |
+
* ::cudaErrorNotReady,
|
| 741 |
+
* ::cudaErrorInvalidValue,
|
| 742 |
+
* ::cudaErrorInvalidResourceHandle
|
| 743 |
+
* \notefnerr
|
| 744 |
+
* \note_init_rt
|
| 745 |
+
* \note_callback
|
| 746 |
+
*
|
| 747 |
+
* \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged
|
| 748 |
+
*/
|
| 749 |
+
template<class T>
|
| 750 |
+
static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(
|
| 751 |
+
cudaStream_t stream,
|
| 752 |
+
T *devPtr,
|
| 753 |
+
size_t length = 0,
|
| 754 |
+
unsigned int flags = cudaMemAttachSingle
|
| 755 |
+
)
|
| 756 |
+
{
|
| 757 |
+
return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
|
| 758 |
+
}
|
| 759 |
+
|
| 760 |
+
template<class T>
|
| 761 |
+
static __inline__ __host__ cudaError_t cudaMalloc(
|
| 762 |
+
T **devPtr,
|
| 763 |
+
size_t size
|
| 764 |
+
)
|
| 765 |
+
{
|
| 766 |
+
return ::cudaMalloc((void**)(void*)devPtr, size);
|
| 767 |
+
}
|
| 768 |
+
|
| 769 |
+
template<class T>
|
| 770 |
+
static __inline__ __host__ cudaError_t cudaMallocHost(
|
| 771 |
+
T **ptr,
|
| 772 |
+
size_t size,
|
| 773 |
+
unsigned int flags = 0
|
| 774 |
+
)
|
| 775 |
+
{
|
| 776 |
+
return cudaMallocHost((void**)(void*)ptr, size, flags);
|
| 777 |
+
}
|
| 778 |
+
|
| 779 |
+
template<class T>
|
| 780 |
+
static __inline__ __host__ cudaError_t cudaMallocPitch(
|
| 781 |
+
T **devPtr,
|
| 782 |
+
size_t *pitch,
|
| 783 |
+
size_t width,
|
| 784 |
+
size_t height
|
| 785 |
+
)
|
| 786 |
+
{
|
| 787 |
+
return ::cudaMallocPitch((void**)(void*)devPtr, pitch, width, height);
|
| 788 |
+
}
|
| 789 |
+
|
| 790 |
+
/**
|
| 791 |
+
* \brief Allocate from a pool
|
| 792 |
+
*
|
| 793 |
+
* This is an alternate spelling for cudaMallocFromPoolAsync
|
| 794 |
+
* made available through function overloading.
|
| 795 |
+
*
|
| 796 |
+
* \sa ::cudaMallocFromPoolAsync,
|
| 797 |
+
* \ref ::cudaMallocAsync(void** ptr, size_t size, cudaStream_t hStream) "cudaMallocAsync (C API)"
|
| 798 |
+
*/
|
| 799 |
+
static __inline__ __host__ cudaError_t cudaMallocAsync(
|
| 800 |
+
void **ptr,
|
| 801 |
+
size_t size,
|
| 802 |
+
cudaMemPool_t memPool,
|
| 803 |
+
cudaStream_t stream
|
| 804 |
+
)
|
| 805 |
+
{
|
| 806 |
+
return ::cudaMallocFromPoolAsync(ptr, size, memPool, stream);
|
| 807 |
+
}
|
| 808 |
+
|
| 809 |
+
template<class T>
|
| 810 |
+
static __inline__ __host__ cudaError_t cudaMallocAsync(
|
| 811 |
+
T **ptr,
|
| 812 |
+
size_t size,
|
| 813 |
+
cudaMemPool_t memPool,
|
| 814 |
+
cudaStream_t stream
|
| 815 |
+
)
|
| 816 |
+
{
|
| 817 |
+
return ::cudaMallocFromPoolAsync((void**)(void*)ptr, size, memPool, stream);
|
| 818 |
+
}
|
| 819 |
+
|
| 820 |
+
template<class T>
|
| 821 |
+
static __inline__ __host__ cudaError_t cudaMallocAsync(
|
| 822 |
+
T **ptr,
|
| 823 |
+
size_t size,
|
| 824 |
+
cudaStream_t stream
|
| 825 |
+
)
|
| 826 |
+
{
|
| 827 |
+
return ::cudaMallocAsync((void**)(void*)ptr, size, stream);
|
| 828 |
+
}
|
| 829 |
+
|
| 830 |
+
template<class T>
|
| 831 |
+
static __inline__ __host__ cudaError_t cudaMallocFromPoolAsync(
|
| 832 |
+
T **ptr,
|
| 833 |
+
size_t size,
|
| 834 |
+
cudaMemPool_t memPool,
|
| 835 |
+
cudaStream_t stream
|
| 836 |
+
)
|
| 837 |
+
{
|
| 838 |
+
return ::cudaMallocFromPoolAsync((void**)(void*)ptr, size, memPool, stream);
|
| 839 |
+
}
|
| 840 |
+
|
| 841 |
+
#if defined(__CUDACC__)
|
| 842 |
+
|
| 843 |
+
/**
|
| 844 |
+
* \brief \hl Copies data to the given symbol on the device
|
| 845 |
+
*
|
| 846 |
+
* Copies \p count bytes from the memory area pointed to by \p src
|
| 847 |
+
* to the memory area \p offset bytes from the start of symbol
|
| 848 |
+
* \p symbol. The memory areas may not overlap. \p symbol is a variable that
|
| 849 |
+
* resides in global or constant memory space. \p kind can be either
|
| 850 |
+
* ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
|
| 851 |
+
*
|
| 852 |
+
* \param symbol - Device symbol reference
|
| 853 |
+
* \param src - Source memory address
|
| 854 |
+
* \param count - Size in bytes to copy
|
| 855 |
+
* \param offset - Offset from start of symbol in bytes
|
| 856 |
+
* \param kind - Type of transfer
|
| 857 |
+
*
|
| 858 |
+
* \return
|
| 859 |
+
* ::cudaSuccess,
|
| 860 |
+
* ::cudaErrorInvalidValue,
|
| 861 |
+
* ::cudaErrorInvalidSymbol,
|
| 862 |
+
* ::cudaErrorInvalidMemcpyDirection,
|
| 863 |
+
* ::cudaErrorNoKernelImageForDevice
|
| 864 |
+
* \notefnerr
|
| 865 |
+
* \note_sync
|
| 866 |
+
* \note_string_api_deprecation
|
| 867 |
+
* \note_init_rt
|
| 868 |
+
* \note_callback
|
| 869 |
+
*
|
| 870 |
+
* \sa ::cudaMemcpy, ::cudaMemcpy2D,
|
| 871 |
+
* ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
|
| 872 |
+
* ::cudaMemcpy2DArrayToArray,
|
| 873 |
+
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
|
| 874 |
+
* ::cudaMemcpy2DToArrayAsync,
|
| 875 |
+
* ::cudaMemcpy2DFromArrayAsync,
|
| 876 |
+
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
|
| 877 |
+
*/
|
| 878 |
+
template<class T>
|
| 879 |
+
static __inline__ __host__ cudaError_t cudaMemcpyToSymbol(
|
| 880 |
+
const T &symbol,
|
| 881 |
+
const void *src,
|
| 882 |
+
size_t count,
|
| 883 |
+
size_t offset = 0,
|
| 884 |
+
enum cudaMemcpyKind kind = cudaMemcpyHostToDevice
|
| 885 |
+
)
|
| 886 |
+
{
|
| 887 |
+
return ::cudaMemcpyToSymbol((const void*)&symbol, src, count, offset, kind);
|
| 888 |
+
}
|
| 889 |
+
|
| 890 |
+
/**
|
| 891 |
+
* \brief \hl Copies data to the given symbol on the device
|
| 892 |
+
*
|
| 893 |
+
* Copies \p count bytes from the memory area pointed to by \p src
|
| 894 |
+
* to the memory area \p offset bytes from the start of symbol
|
| 895 |
+
* \p symbol. The memory areas may not overlap. \p symbol is a variable that
|
| 896 |
+
* resides in global or constant memory space. \p kind can be either
|
| 897 |
+
* ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
|
| 898 |
+
*
|
| 899 |
+
* ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so
|
| 900 |
+
* the call may return before the copy is complete. The copy can optionally
|
| 901 |
+
* be associated to a stream by passing a non-zero \p stream argument. If
|
| 902 |
+
* \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy
|
| 903 |
+
* may overlap with operations in other streams.
|
| 904 |
+
*
|
| 905 |
+
* \param symbol - Device symbol reference
|
| 906 |
+
* \param src - Source memory address
|
| 907 |
+
* \param count - Size in bytes to copy
|
| 908 |
+
* \param offset - Offset from start of symbol in bytes
|
| 909 |
+
* \param kind - Type of transfer
|
| 910 |
+
* \param stream - Stream identifier
|
| 911 |
+
*
|
| 912 |
+
* \return
|
| 913 |
+
* ::cudaSuccess,
|
| 914 |
+
* ::cudaErrorInvalidValue,
|
| 915 |
+
* ::cudaErrorInvalidSymbol,
|
| 916 |
+
* ::cudaErrorInvalidMemcpyDirection,
|
| 917 |
+
* ::cudaErrorNoKernelImageForDevice
|
| 918 |
+
* \notefnerr
|
| 919 |
+
* \note_async
|
| 920 |
+
* \note_string_api_deprecation
|
| 921 |
+
* \note_init_rt
|
| 922 |
+
* \note_callback
|
| 923 |
+
*
|
| 924 |
+
* \sa ::cudaMemcpy, ::cudaMemcpy2D,
|
| 925 |
+
* ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
|
| 926 |
+
* ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
|
| 927 |
+
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
|
| 928 |
+
* ::cudaMemcpy2DToArrayAsync,
|
| 929 |
+
* ::cudaMemcpy2DFromArrayAsync,
|
| 930 |
+
* ::cudaMemcpyFromSymbolAsync
|
| 931 |
+
*/
|
| 932 |
+
template<class T>
|
| 933 |
+
static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync(
|
| 934 |
+
const T &symbol,
|
| 935 |
+
const void *src,
|
| 936 |
+
size_t count,
|
| 937 |
+
size_t offset = 0,
|
| 938 |
+
enum cudaMemcpyKind kind = cudaMemcpyHostToDevice,
|
| 939 |
+
cudaStream_t stream = 0
|
| 940 |
+
)
|
| 941 |
+
{
|
| 942 |
+
return ::cudaMemcpyToSymbolAsync((const void*)&symbol, src, count, offset, kind, stream);
|
| 943 |
+
}
|
| 944 |
+
|
| 945 |
+
/**
|
| 946 |
+
* \brief \hl Copies data from the given symbol on the device
|
| 947 |
+
*
|
| 948 |
+
* Copies \p count bytes from the memory area \p offset bytes
|
| 949 |
+
* from the start of symbol \p symbol to the memory area pointed to by \p dst.
|
| 950 |
+
* The memory areas may not overlap. \p symbol is a variable that
|
| 951 |
+
* resides in global or constant memory space. \p kind can be either
|
| 952 |
+
* ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
|
| 953 |
+
*
|
| 954 |
+
* \param dst - Destination memory address
|
| 955 |
+
* \param symbol - Device symbol reference
|
| 956 |
+
* \param count - Size in bytes to copy
|
| 957 |
+
* \param offset - Offset from start of symbol in bytes
|
| 958 |
+
* \param kind - Type of transfer
|
| 959 |
+
*
|
| 960 |
+
* \return
|
| 961 |
+
* ::cudaSuccess,
|
| 962 |
+
* ::cudaErrorInvalidValue,
|
| 963 |
+
* ::cudaErrorInvalidSymbol,
|
| 964 |
+
* ::cudaErrorInvalidMemcpyDirection,
|
| 965 |
+
* ::cudaErrorNoKernelImageForDevice
|
| 966 |
+
* \notefnerr
|
| 967 |
+
* \note_sync
|
| 968 |
+
* \note_string_api_deprecation
|
| 969 |
+
* \note_init_rt
|
| 970 |
+
* \note_callback
|
| 971 |
+
*
|
| 972 |
+
* \sa ::cudaMemcpy, ::cudaMemcpy2D,
|
| 973 |
+
* ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
|
| 974 |
+
* ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
|
| 975 |
+
* ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
|
| 976 |
+
* ::cudaMemcpy2DToArrayAsync,
|
| 977 |
+
* ::cudaMemcpy2DFromArrayAsync,
|
| 978 |
+
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
|
| 979 |
+
*/
|
| 980 |
+
template<class T>
|
| 981 |
+
static __inline__ __host__ cudaError_t cudaMemcpyFromSymbol(
|
| 982 |
+
void *dst,
|
| 983 |
+
const T &symbol,
|
| 984 |
+
size_t count,
|
| 985 |
+
size_t offset = 0,
|
| 986 |
+
enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost
|
| 987 |
+
)
|
| 988 |
+
{
|
| 989 |
+
return ::cudaMemcpyFromSymbol(dst, (const void*)&symbol, count, offset, kind);
|
| 990 |
+
}
|
| 991 |
+
|
| 992 |
+
/**
|
| 993 |
+
* \brief \hl Copies data from the given symbol on the device
|
| 994 |
+
*
|
| 995 |
+
* Copies \p count bytes from the memory area \p offset bytes
|
| 996 |
+
* from the start of symbol \p symbol to the memory area pointed to by \p dst.
|
| 997 |
+
* The memory areas may not overlap. \p symbol is a variable that resides in
|
| 998 |
+
* global or constant memory space. \p kind can be either
|
| 999 |
+
* ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
|
| 1000 |
+
*
|
| 1001 |
+
* ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so
|
| 1002 |
+
* the call may return before the copy is complete. The copy can optionally be
|
| 1003 |
+
* associated to a stream by passing a non-zero \p stream argument. If \p kind
|
| 1004 |
+
* is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap
|
| 1005 |
+
* with operations in other streams.
|
| 1006 |
+
*
|
| 1007 |
+
* \param dst - Destination memory address
|
| 1008 |
+
* \param symbol - Device symbol reference
|
| 1009 |
+
* \param count - Size in bytes to copy
|
| 1010 |
+
* \param offset - Offset from start of symbol in bytes
|
| 1011 |
+
* \param kind - Type of transfer
|
| 1012 |
+
* \param stream - Stream identifier
|
| 1013 |
+
*
|
| 1014 |
+
* \return
|
| 1015 |
+
* ::cudaSuccess,
|
| 1016 |
+
* ::cudaErrorInvalidValue,
|
| 1017 |
+
* ::cudaErrorInvalidSymbol,
|
| 1018 |
+
* ::cudaErrorInvalidMemcpyDirection,
|
| 1019 |
+
* ::cudaErrorNoKernelImageForDevice
|
| 1020 |
+
* \notefnerr
|
| 1021 |
+
* \note_async
|
| 1022 |
+
* \note_string_api_deprecation
|
| 1023 |
+
* \note_init_rt
|
| 1024 |
+
* \note_callback
|
| 1025 |
+
*
|
| 1026 |
+
* \sa ::cudaMemcpy, ::cudaMemcpy2D,
|
| 1027 |
+
* ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
|
| 1028 |
+
* ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
|
| 1029 |
+
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
|
| 1030 |
+
* ::cudaMemcpy2DToArrayAsync,
|
| 1031 |
+
* ::cudaMemcpy2DFromArrayAsync,
|
| 1032 |
+
* ::cudaMemcpyToSymbolAsync
|
| 1033 |
+
*/
|
| 1034 |
+
template<class T>
|
| 1035 |
+
static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync(
|
| 1036 |
+
void *dst,
|
| 1037 |
+
const T &symbol,
|
| 1038 |
+
size_t count,
|
| 1039 |
+
size_t offset = 0,
|
| 1040 |
+
enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost,
|
| 1041 |
+
cudaStream_t stream = 0
|
| 1042 |
+
)
|
| 1043 |
+
{
|
| 1044 |
+
return ::cudaMemcpyFromSymbolAsync(dst, (const void*)&symbol, count, offset, kind, stream);
|
| 1045 |
+
}
|
| 1046 |
+
|
| 1047 |
+
/**
|
| 1048 |
+
* \brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph
|
| 1049 |
+
*
|
| 1050 |
+
* Creates a new memcpy node to copy to \p symbol and adds it to \p graph with
|
| 1051 |
+
* \p numDependencies dependencies specified via \p pDependencies.
|
| 1052 |
+
* It is possible for \p numDependencies to be 0, in which case the node will be placed
|
| 1053 |
+
* at the root of the graph. \p pDependencies may not have any duplicate entries.
|
| 1054 |
+
* A handle to the new node will be returned in \p pGraphNode.
|
| 1055 |
+
*
|
| 1056 |
+
* When the graph is launched, the node will copy \p count bytes from the memory area
|
| 1057 |
+
* pointed to by \p src to the memory area \p offset bytes from the start
|
| 1058 |
+
* of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
|
| 1059 |
+
* resides in global or constant memory space. \p kind can be either
|
| 1060 |
+
* ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
|
| 1061 |
+
* Passing ::cudaMemcpyDefault is recommended, in which case the type of
|
| 1062 |
+
* transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
|
| 1063 |
+
* is only allowed on systems that support unified virtual addressing.
|
| 1064 |
+
*
|
| 1065 |
+
* Memcpy nodes have some additional restrictions with regards to managed memory, if the
|
| 1066 |
+
* system contains at least one device which has a zero value for the device attribute
|
| 1067 |
+
* ::cudaDevAttrConcurrentManagedAccess.
|
| 1068 |
+
*
|
| 1069 |
+
* \param pGraphNode - Returns newly created node
|
| 1070 |
+
* \param graph - Graph to which to add the node
|
| 1071 |
+
* \param pDependencies - Dependencies of the node
|
| 1072 |
+
* \param numDependencies - Number of dependencies
|
| 1073 |
+
* \param symbol - Device symbol address
|
| 1074 |
+
* \param src - Source memory address
|
| 1075 |
+
* \param count - Size in bytes to copy
|
| 1076 |
+
* \param offset - Offset from start of symbol in bytes
|
| 1077 |
+
* \param kind - Type of transfer
|
| 1078 |
+
*
|
| 1079 |
+
* \return
|
| 1080 |
+
* ::cudaSuccess,
|
| 1081 |
+
* ::cudaErrorInvalidValue
|
| 1082 |
+
* \note_graph_thread_safety
|
| 1083 |
+
* \notefnerr
|
| 1084 |
+
* \note_init_rt
|
| 1085 |
+
* \note_callback
|
| 1086 |
+
*
|
| 1087 |
+
* \sa
|
| 1088 |
+
* ::cudaMemcpyToSymbol,
|
| 1089 |
+
* ::cudaGraphAddMemcpyNode,
|
| 1090 |
+
* ::cudaGraphAddMemcpyNodeFromSymbol,
|
| 1091 |
+
* ::cudaGraphMemcpyNodeGetParams,
|
| 1092 |
+
* ::cudaGraphMemcpyNodeSetParams,
|
| 1093 |
+
* ::cudaGraphMemcpyNodeSetParamsToSymbol,
|
| 1094 |
+
* ::cudaGraphMemcpyNodeSetParamsFromSymbol,
|
| 1095 |
+
* ::cudaGraphCreate,
|
| 1096 |
+
* ::cudaGraphDestroyNode,
|
| 1097 |
+
* ::cudaGraphAddChildGraphNode,
|
| 1098 |
+
* ::cudaGraphAddEmptyNode,
|
| 1099 |
+
* ::cudaGraphAddKernelNode,
|
| 1100 |
+
* ::cudaGraphAddHostNode,
|
| 1101 |
+
* ::cudaGraphAddMemsetNode
|
| 1102 |
+
*/
|
| 1103 |
+
template<class T>
|
| 1104 |
+
static __inline__ __host__ cudaError_t cudaGraphAddMemcpyNodeToSymbol(
|
| 1105 |
+
cudaGraphNode_t *pGraphNode,
|
| 1106 |
+
cudaGraph_t graph,
|
| 1107 |
+
const cudaGraphNode_t *pDependencies,
|
| 1108 |
+
size_t numDependencies,
|
| 1109 |
+
const T &symbol,
|
| 1110 |
+
const void* src,
|
| 1111 |
+
size_t count,
|
| 1112 |
+
size_t offset,
|
| 1113 |
+
enum cudaMemcpyKind kind)
|
| 1114 |
+
{
|
| 1115 |
+
return ::cudaGraphAddMemcpyNodeToSymbol(pGraphNode, graph, pDependencies, numDependencies, (const void*)&symbol, src, count, offset, kind);
|
| 1116 |
+
}
|
| 1117 |
+
|
| 1118 |
+
/**
|
| 1119 |
+
* \brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph
|
| 1120 |
+
*
|
| 1121 |
+
* Creates a new memcpy node to copy from \p symbol and adds it to \p graph with
|
| 1122 |
+
* \p numDependencies dependencies specified via \p pDependencies.
|
| 1123 |
+
* It is possible for \p numDependencies to be 0, in which case the node will be placed
|
| 1124 |
+
* at the root of the graph. \p pDependencies may not have any duplicate entries.
|
| 1125 |
+
* A handle to the new node will be returned in \p pGraphNode.
|
| 1126 |
+
*
|
| 1127 |
+
* When the graph is launched, the node will copy \p count bytes from the memory area
|
| 1128 |
+
* pointed to by \p offset bytes from the start of symbol \p symbol to the memory area
|
| 1129 |
+
* pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
|
| 1130 |
+
* that resides in global or constant memory space. \p kind can be either
|
| 1131 |
+
* ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
|
| 1132 |
+
* Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
|
| 1133 |
+
* is inferred from the pointer values. However, ::cudaMemcpyDefault is only
|
| 1134 |
+
* allowed on systems that support unified virtual addressing.
|
| 1135 |
+
*
|
| 1136 |
+
* Memcpy nodes have some additional restrictions with regards to managed memory, if the
|
| 1137 |
+
* system contains at least one device which has a zero value for the device attribute
|
| 1138 |
+
* ::cudaDevAttrConcurrentManagedAccess.
|
| 1139 |
+
*
|
| 1140 |
+
* \param pGraphNode - Returns newly created node
|
| 1141 |
+
* \param graph - Graph to which to add the node
|
| 1142 |
+
* \param pDependencies - Dependencies of the node
|
| 1143 |
+
* \param numDependencies - Number of dependencies
|
| 1144 |
+
* \param dst - Destination memory address
|
| 1145 |
+
* \param symbol - Device symbol address
|
| 1146 |
+
* \param count - Size in bytes to copy
|
| 1147 |
+
* \param offset - Offset from start of symbol in bytes
|
| 1148 |
+
* \param kind - Type of transfer
|
| 1149 |
+
*
|
| 1150 |
+
* \return
|
| 1151 |
+
* ::cudaSuccess,
|
| 1152 |
+
* ::cudaErrorInvalidValue
|
| 1153 |
+
* \note_graph_thread_safety
|
| 1154 |
+
* \notefnerr
|
| 1155 |
+
* \note_init_rt
|
| 1156 |
+
* \note_callback
|
| 1157 |
+
*
|
| 1158 |
+
* \sa
|
| 1159 |
+
* ::cudaMemcpyFromSymbol,
|
| 1160 |
+
* ::cudaGraphAddMemcpyNode,
|
| 1161 |
+
* ::cudaGraphAddMemcpyNodeToSymbol,
|
| 1162 |
+
* ::cudaGraphMemcpyNodeGetParams,
|
| 1163 |
+
* ::cudaGraphMemcpyNodeSetParams,
|
| 1164 |
+
* ::cudaGraphMemcpyNodeSetParamsFromSymbol,
|
| 1165 |
+
* ::cudaGraphMemcpyNodeSetParamsToSymbol,
|
| 1166 |
+
* ::cudaGraphCreate,
|
| 1167 |
+
* ::cudaGraphDestroyNode,
|
| 1168 |
+
* ::cudaGraphAddChildGraphNode,
|
| 1169 |
+
* ::cudaGraphAddEmptyNode,
|
| 1170 |
+
* ::cudaGraphAddKernelNode,
|
| 1171 |
+
* ::cudaGraphAddHostNode,
|
| 1172 |
+
* ::cudaGraphAddMemsetNode
|
| 1173 |
+
*/
|
| 1174 |
+
template<class T>
|
| 1175 |
+
static __inline__ __host__ cudaError_t cudaGraphAddMemcpyNodeFromSymbol(
|
| 1176 |
+
cudaGraphNode_t* pGraphNode,
|
| 1177 |
+
cudaGraph_t graph,
|
| 1178 |
+
const cudaGraphNode_t* pDependencies,
|
| 1179 |
+
size_t numDependencies,
|
| 1180 |
+
void* dst,
|
| 1181 |
+
const T &symbol,
|
| 1182 |
+
size_t count,
|
| 1183 |
+
size_t offset,
|
| 1184 |
+
enum cudaMemcpyKind kind)
|
| 1185 |
+
{
|
| 1186 |
+
return ::cudaGraphAddMemcpyNodeFromSymbol(pGraphNode, graph, pDependencies, numDependencies, dst, (const void*)&symbol, count, offset, kind);
|
| 1187 |
+
}
|
| 1188 |
+
|
| 1189 |
+
/**
|
| 1190 |
+
* \brief Sets a memcpy node's parameters to copy to a symbol on the device
|
| 1191 |
+
*
|
| 1192 |
+
* Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
|
| 1193 |
+
*
|
| 1194 |
+
* When the graph is launched, the node will copy \p count bytes from the memory area
|
| 1195 |
+
* pointed to by \p src to the memory area pointed to by \p offset bytes from the start
|
| 1196 |
+
* of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
|
| 1197 |
+
* resides in global or constant memory space. \p kind can be either
|
| 1198 |
+
* ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
|
| 1199 |
+
* Passing ::cudaMemcpyDefault is recommended, in which case the type of
|
| 1200 |
+
* transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
|
| 1201 |
+
* is only allowed on systems that support unified virtual addressing.
|
| 1202 |
+
*
|
| 1203 |
+
* \param node - Node to set the parameters for
|
| 1204 |
+
* \param symbol - Device symbol address
|
| 1205 |
+
* \param src - Source memory address
|
| 1206 |
+
* \param count - Size in bytes to copy
|
| 1207 |
+
* \param offset - Offset from start of symbol in bytes
|
| 1208 |
+
* \param kind - Type of transfer
|
| 1209 |
+
*
|
| 1210 |
+
* \return
|
| 1211 |
+
* ::cudaSuccess,
|
| 1212 |
+
* ::cudaErrorInvalidValue
|
| 1213 |
+
* \note_graph_thread_safety
|
| 1214 |
+
* \notefnerr
|
| 1215 |
+
* \note_init_rt
|
| 1216 |
+
* \note_callback
|
| 1217 |
+
*
|
| 1218 |
+
* \sa
|
| 1219 |
+
* ::cudaMemcpyToSymbol,
|
| 1220 |
+
* ::cudaGraphMemcpyNodeSetParams,
|
| 1221 |
+
* ::cudaGraphMemcpyNodeSetParamsFromSymbol,
|
| 1222 |
+
* ::cudaGraphAddMemcpyNode,
|
| 1223 |
+
* ::cudaGraphMemcpyNodeGetParams
|
| 1224 |
+
*/
|
| 1225 |
+
template<class T>
|
| 1226 |
+
static __inline__ __host__ cudaError_t cudaGraphMemcpyNodeSetParamsToSymbol(
|
| 1227 |
+
cudaGraphNode_t node,
|
| 1228 |
+
const T &symbol,
|
| 1229 |
+
const void* src,
|
| 1230 |
+
size_t count,
|
| 1231 |
+
size_t offset,
|
| 1232 |
+
enum cudaMemcpyKind kind)
|
| 1233 |
+
{
|
| 1234 |
+
return ::cudaGraphMemcpyNodeSetParamsToSymbol(node, (const void*)&symbol, src, count, offset, kind);
|
| 1235 |
+
}
|
| 1236 |
+
|
| 1237 |
+
/**
|
| 1238 |
+
* \brief Sets a memcpy node's parameters to copy from a symbol on the device
|
| 1239 |
+
*
|
| 1240 |
+
* Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
|
| 1241 |
+
*
|
| 1242 |
+
* When the graph is launched, the node will copy \p count bytes from the memory area
|
| 1243 |
+
* pointed to by \p offset bytes from the start of symbol \p symbol to the memory area
|
| 1244 |
+
* pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
|
| 1245 |
+
* that resides in global or constant memory space. \p kind can be either
|
| 1246 |
+
* ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
|
| 1247 |
+
* Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
|
| 1248 |
+
* is inferred from the pointer values. However, ::cudaMemcpyDefault is only
|
| 1249 |
+
* allowed on systems that support unified virtual addressing.
|
| 1250 |
+
*
|
| 1251 |
+
* \param node - Node to set the parameters for
|
| 1252 |
+
* \param dst - Destination memory address
|
| 1253 |
+
* \param symbol - Device symbol address
|
| 1254 |
+
* \param count - Size in bytes to copy
|
| 1255 |
+
* \param offset - Offset from start of symbol in bytes
|
| 1256 |
+
* \param kind - Type of transfer
|
| 1257 |
+
*
|
| 1258 |
+
* \return
|
| 1259 |
+
* ::cudaSuccess,
|
| 1260 |
+
* ::cudaErrorInvalidValue
|
| 1261 |
+
* \note_graph_thread_safety
|
| 1262 |
+
* \notefnerr
|
| 1263 |
+
* \note_init_rt
|
| 1264 |
+
* \note_callback
|
| 1265 |
+
*
|
| 1266 |
+
* \sa
|
| 1267 |
+
* ::cudaMemcpyFromSymbol,
|
| 1268 |
+
* ::cudaGraphMemcpyNodeSetParams,
|
| 1269 |
+
* ::cudaGraphMemcpyNodeSetParamsToSymbol,
|
| 1270 |
+
* ::cudaGraphAddMemcpyNode,
|
| 1271 |
+
* ::cudaGraphMemcpyNodeGetParams
|
| 1272 |
+
*/
|
| 1273 |
+
template<class T>
|
| 1274 |
+
static __inline__ __host__ cudaError_t cudaGraphMemcpyNodeSetParamsFromSymbol(
|
| 1275 |
+
cudaGraphNode_t node,
|
| 1276 |
+
void* dst,
|
| 1277 |
+
const T &symbol,
|
| 1278 |
+
size_t count,
|
| 1279 |
+
size_t offset,
|
| 1280 |
+
enum cudaMemcpyKind kind)
|
| 1281 |
+
{
|
| 1282 |
+
return ::cudaGraphMemcpyNodeSetParamsFromSymbol(node, dst, (const void*)&symbol, count, offset, kind);
|
| 1283 |
+
}
|
| 1284 |
+
|
| 1285 |
+
/**
|
| 1286 |
+
* \brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the device
|
| 1287 |
+
*
|
| 1288 |
+
* Updates the work represented by \p node in \p hGraphExec as though \p node had
|
| 1289 |
+
* contained the given params at instantiation. \p node must remain in the graph which was
|
| 1290 |
+
* used to instantiate \p hGraphExec. Changed edges to and from \p node are ignored.
|
| 1291 |
+
*
|
| 1292 |
+
* \p src and \p symbol must be allocated from the same contexts as the original source and
|
| 1293 |
+
* destination memory. The instantiation-time memory operands must be 1-dimensional.
|
| 1294 |
+
* Zero-length operations are not supported.
|
| 1295 |
+
*
|
| 1296 |
+
* The modifications only affect future launches of \p hGraphExec. Already enqueued
|
| 1297 |
+
* or running launches of \p hGraphExec are not affected by this call. \p node is also
|
| 1298 |
+
* not modified by this call.
|
| 1299 |
+
*
|
| 1300 |
+
* Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
|
| 1301 |
+
* the original memory operands are multidimensional.
|
| 1302 |
+
*
|
| 1303 |
+
* \param hGraphExec - The executable graph in which to set the specified node
|
| 1304 |
+
* \param node - Memcpy node from the graph which was used to instantiate graphExec
|
| 1305 |
+
* \param symbol - Device symbol address
|
| 1306 |
+
* \param src - Source memory address
|
| 1307 |
+
* \param count - Size in bytes to copy
|
| 1308 |
+
* \param offset - Offset from start of symbol in bytes
|
| 1309 |
+
* \param kind - Type of transfer
|
| 1310 |
+
*
|
| 1311 |
+
* \return
|
| 1312 |
+
* ::cudaSuccess,
|
| 1313 |
+
* ::cudaErrorInvalidValue
|
| 1314 |
+
* \note_graph_thread_safety
|
| 1315 |
+
* \notefnerr
|
| 1316 |
+
* \note_init_rt
|
| 1317 |
+
* \note_callback
|
| 1318 |
+
*
|
| 1319 |
+
* \sa
|
| 1320 |
+
* ::cudaGraphAddMemcpyNode,
|
| 1321 |
+
* ::cudaGraphAddMemcpyNodeToSymbol,
|
| 1322 |
+
* ::cudaGraphMemcpyNodeSetParams,
|
| 1323 |
+
* ::cudaGraphMemcpyNodeSetParamsToSymbol,
|
| 1324 |
+
* ::cudaGraphInstantiate,
|
| 1325 |
+
* ::cudaGraphExecMemcpyNodeSetParams,
|
| 1326 |
+
* ::cudaGraphExecMemcpyNodeSetParamsFromSymbol,
|
| 1327 |
+
* ::cudaGraphExecKernelNodeSetParams,
|
| 1328 |
+
* ::cudaGraphExecMemsetNodeSetParams,
|
| 1329 |
+
* ::cudaGraphExecHostNodeSetParams
|
| 1330 |
+
*/
|
| 1331 |
+
template<class T>
|
| 1332 |
+
static __inline__ __host__ cudaError_t cudaGraphExecMemcpyNodeSetParamsToSymbol(
|
| 1333 |
+
cudaGraphExec_t hGraphExec,
|
| 1334 |
+
cudaGraphNode_t node,
|
| 1335 |
+
const T &symbol,
|
| 1336 |
+
const void* src,
|
| 1337 |
+
size_t count,
|
| 1338 |
+
size_t offset,
|
| 1339 |
+
enum cudaMemcpyKind kind)
|
| 1340 |
+
{
|
| 1341 |
+
return ::cudaGraphExecMemcpyNodeSetParamsToSymbol(hGraphExec, node, (const void*)&symbol, src, count, offset, kind);
|
| 1342 |
+
}
|
| 1343 |
+
|
| 1344 |
+
/**
|
| 1345 |
+
* \brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the device
|
| 1346 |
+
*
|
| 1347 |
+
* Updates the work represented by \p node in \p hGraphExec as though \p node had
|
| 1348 |
+
* contained the given params at instantiation. \p node must remain in the graph which was
|
| 1349 |
+
* used to instantiate \p hGraphExec. Changed edges to and from \p node are ignored.
|
| 1350 |
+
*
|
| 1351 |
+
* \p symbol and \p dst must be allocated from the same contexts as the original source and
|
| 1352 |
+
* destination memory. The instantiation-time memory operands must be 1-dimensional.
|
| 1353 |
+
* Zero-length operations are not supported.
|
| 1354 |
+
*
|
| 1355 |
+
* The modifications only affect future launches of \p hGraphExec. Already enqueued
|
| 1356 |
+
* or running launches of \p hGraphExec are not affected by this call. \p node is also
|
| 1357 |
+
* not modified by this call.
|
| 1358 |
+
*
|
| 1359 |
+
* Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
|
| 1360 |
+
* the original memory operands are multidimensional.
|
| 1361 |
+
*
|
| 1362 |
+
* \param hGraphExec - The executable graph in which to set the specified node
|
| 1363 |
+
* \param node - Memcpy node from the graph which was used to instantiate graphExec
|
| 1364 |
+
* \param dst - Destination memory address
|
| 1365 |
+
* \param symbol - Device symbol address
|
| 1366 |
+
* \param count - Size in bytes to copy
|
| 1367 |
+
* \param offset - Offset from start of symbol in bytes
|
| 1368 |
+
* \param kind - Type of transfer
|
| 1369 |
+
*
|
| 1370 |
+
* \return
|
| 1371 |
+
* ::cudaSuccess,
|
| 1372 |
+
* ::cudaErrorInvalidValue
|
| 1373 |
+
* \note_graph_thread_safety
|
| 1374 |
+
* \notefnerr
|
| 1375 |
+
* \note_init_rt
|
| 1376 |
+
* \note_callback
|
| 1377 |
+
*
|
| 1378 |
+
* \sa
|
| 1379 |
+
* ::cudaGraphAddMemcpyNode,
|
| 1380 |
+
* ::cudaGraphAddMemcpyNodeFromSymbol,
|
| 1381 |
+
* ::cudaGraphMemcpyNodeSetParams,
|
| 1382 |
+
* ::cudaGraphMemcpyNodeSetParamsFromSymbol,
|
| 1383 |
+
* ::cudaGraphInstantiate,
|
| 1384 |
+
* ::cudaGraphExecMemcpyNodeSetParams,
|
| 1385 |
+
* ::cudaGraphExecMemcpyNodeSetParamsToSymbol,
|
| 1386 |
+
* ::cudaGraphExecKernelNodeSetParams,
|
| 1387 |
+
* ::cudaGraphExecMemsetNodeSetParams,
|
| 1388 |
+
* ::cudaGraphExecHostNodeSetParams
|
| 1389 |
+
*/
|
| 1390 |
+
template<class T>
|
| 1391 |
+
static __inline__ __host__ cudaError_t cudaGraphExecMemcpyNodeSetParamsFromSymbol(
|
| 1392 |
+
cudaGraphExec_t hGraphExec,
|
| 1393 |
+
cudaGraphNode_t node,
|
| 1394 |
+
void* dst,
|
| 1395 |
+
const T &symbol,
|
| 1396 |
+
size_t count,
|
| 1397 |
+
size_t offset,
|
| 1398 |
+
enum cudaMemcpyKind kind)
|
| 1399 |
+
{
|
| 1400 |
+
return ::cudaGraphExecMemcpyNodeSetParamsFromSymbol(hGraphExec, node, dst, (const void*)&symbol, count, offset, kind);
|
| 1401 |
+
}
|
| 1402 |
+
|
| 1403 |
+
// convenience function to avoid source breakage in c++ code
|
| 1404 |
+
static __inline__ __host__ cudaError_t CUDARTAPI cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, cudaGraphNode_t *hErrorNode_out, enum cudaGraphExecUpdateResult *updateResult_out)
|
| 1405 |
+
{
|
| 1406 |
+
cudaGraphExecUpdateResultInfo resultInfo;
|
| 1407 |
+
cudaError_t status = cudaGraphExecUpdate(hGraphExec, hGraph, &resultInfo);
|
| 1408 |
+
if (hErrorNode_out) {
|
| 1409 |
+
*hErrorNode_out = resultInfo.errorNode;
|
| 1410 |
+
}
|
| 1411 |
+
if (updateResult_out) {
|
| 1412 |
+
*updateResult_out = resultInfo.result;
|
| 1413 |
+
}
|
| 1414 |
+
return status;
|
| 1415 |
+
}
|
| 1416 |
+
|
| 1417 |
+
#if __cplusplus >= 201103L || (defined(_MSC_VER) && (_MSC_VER >= 1900))
|
| 1418 |
+
|
| 1419 |
+
/**
|
| 1420 |
+
* \brief Creates a user object by wrapping a C++ object
|
| 1421 |
+
*
|
| 1422 |
+
* Forwards \p objectToWrap to ::cudaUserObjectCreate with a destroy callback that deletes it.
|
| 1423 |
+
*
|
| 1424 |
+
* \param object_out - Location to return the user object handle
|
| 1425 |
+
* \param objectToWrap - This becomes the \p ptr argument to ::cudaUserObjectCreate. A
|
| 1426 |
+
* lambda will be passed for the \p destroy argument, which calls
|
| 1427 |
+
* delete on this object pointer.
|
| 1428 |
+
* \param initialRefcount - The initial refcount to create the object with, typically 1. The
|
| 1429 |
+
* initial references are owned by the calling thread.
|
| 1430 |
+
* \param flags - Currently it is required to pass cudaUserObjectNoDestructorSync,
|
| 1431 |
+
* which is the only defined flag. This indicates that the destroy
|
| 1432 |
+
* callback cannot be waited on by any CUDA API. Users requiring
|
| 1433 |
+
* synchronization of the callback should signal its completion
|
| 1434 |
+
* manually.
|
| 1435 |
+
*
|
| 1436 |
+
* \return
|
| 1437 |
+
* ::cudaSuccess,
|
| 1438 |
+
* ::cudaErrorInvalidValue
|
| 1439 |
+
*
|
| 1440 |
+
* \sa
|
| 1441 |
+
* ::cudaUserObjectCreate
|
| 1442 |
+
*/
|
| 1443 |
+
template<class T>
|
| 1444 |
+
static __inline__ __host__ cudaError_t cudaUserObjectCreate(
|
| 1445 |
+
cudaUserObject_t *object_out,
|
| 1446 |
+
T *objectToWrap,
|
| 1447 |
+
unsigned int initialRefcount,
|
| 1448 |
+
unsigned int flags)
|
| 1449 |
+
{
|
| 1450 |
+
return ::cudaUserObjectCreate(
|
| 1451 |
+
object_out,
|
| 1452 |
+
objectToWrap,
|
| 1453 |
+
[](void *vpObj) { delete reinterpret_cast<T *>(vpObj); },
|
| 1454 |
+
initialRefcount,
|
| 1455 |
+
flags);
|
| 1456 |
+
}
|
| 1457 |
+
|
| 1458 |
+
template<class T>
|
| 1459 |
+
static __inline__ __host__ cudaError_t cudaUserObjectCreate(
|
| 1460 |
+
cudaUserObject_t *object_out,
|
| 1461 |
+
T *objectToWrap,
|
| 1462 |
+
unsigned int initialRefcount,
|
| 1463 |
+
cudaUserObjectFlags flags)
|
| 1464 |
+
{
|
| 1465 |
+
return cudaUserObjectCreate(object_out, objectToWrap, initialRefcount, (unsigned int)flags);
|
| 1466 |
+
}
|
| 1467 |
+
|
| 1468 |
+
#endif
|
| 1469 |
+
|
| 1470 |
+
/**
|
| 1471 |
+
* \brief \hl Finds the address associated with a CUDA symbol
|
| 1472 |
+
*
|
| 1473 |
+
* Returns in \p *devPtr the address of symbol \p symbol on the device.
|
| 1474 |
+
* \p symbol must be a variable that resides in global or constant memory space.
|
| 1475 |
+
* If \p symbol cannot be found, or if \p symbol is not declared
|
| 1476 |
+
* in the global or constant memory space, \p *devPtr is unchanged and the error
|
| 1477 |
+
* ::cudaErrorInvalidSymbol is returned.
|
| 1478 |
+
*
|
| 1479 |
+
* \param devPtr - Return device pointer associated with symbol
|
| 1480 |
+
* \param symbol - Device symbol reference
|
| 1481 |
+
*
|
| 1482 |
+
* \return
|
| 1483 |
+
* ::cudaSuccess,
|
| 1484 |
+
* ::cudaErrorInvalidSymbol,
|
| 1485 |
+
* ::cudaErrorNoKernelImageForDevice
|
| 1486 |
+
* \notefnerr
|
| 1487 |
+
* \note_init_rt
|
| 1488 |
+
* \note_callback
|
| 1489 |
+
*
|
| 1490 |
+
* \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)",
|
| 1491 |
+
* \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)"
|
| 1492 |
+
*/
|
| 1493 |
+
template<class T>
|
| 1494 |
+
static __inline__ __host__ cudaError_t cudaGetSymbolAddress(
|
| 1495 |
+
void **devPtr,
|
| 1496 |
+
const T &symbol
|
| 1497 |
+
)
|
| 1498 |
+
{
|
| 1499 |
+
return ::cudaGetSymbolAddress(devPtr, (const void*)&symbol);
|
| 1500 |
+
}
|
| 1501 |
+
|
| 1502 |
+
/**
|
| 1503 |
+
* \brief \hl Finds the size of the object associated with a CUDA symbol
|
| 1504 |
+
*
|
| 1505 |
+
* Returns in \p *size the size of symbol \p symbol. \p symbol must be a
|
| 1506 |
+
* variable that resides in global or constant memory space.
|
| 1507 |
+
* If \p symbol cannot be found, or if \p symbol is not declared
|
| 1508 |
+
* in global or constant memory space, \p *size is unchanged and the error
|
| 1509 |
+
* ::cudaErrorInvalidSymbol is returned.
|
| 1510 |
+
*
|
| 1511 |
+
* \param size - Size of object associated with symbol
|
| 1512 |
+
* \param symbol - Device symbol reference
|
| 1513 |
+
*
|
| 1514 |
+
* \return
|
| 1515 |
+
* ::cudaSuccess,
|
| 1516 |
+
* ::cudaErrorInvalidSymbol,
|
| 1517 |
+
* ::cudaErrorNoKernelImageForDevice
|
| 1518 |
+
* \notefnerr
|
| 1519 |
+
* \note_init_rt
|
| 1520 |
+
* \note_callback
|
| 1521 |
+
*
|
| 1522 |
+
* \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)",
|
| 1523 |
+
* \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)"
|
| 1524 |
+
*/
|
| 1525 |
+
template<class T>
|
| 1526 |
+
static __inline__ __host__ cudaError_t cudaGetSymbolSize(
|
| 1527 |
+
size_t *size,
|
| 1528 |
+
const T &symbol
|
| 1529 |
+
)
|
| 1530 |
+
{
|
| 1531 |
+
return ::cudaGetSymbolSize(size, (const void*)&symbol);
|
| 1532 |
+
}
|
| 1533 |
+
|
| 1534 |
+
/**
|
| 1535 |
+
* \brief \hl Sets the preferred cache configuration for a device function
|
| 1536 |
+
*
|
| 1537 |
+
* On devices where the L1 cache and shared memory use the same hardware
|
| 1538 |
+
* resources, this sets through \p cacheConfig the preferred cache configuration
|
| 1539 |
+
* for the function specified via \p func. This is only a preference. The
|
| 1540 |
+
* runtime will use the requested configuration if possible, but it is free to
|
| 1541 |
+
* choose a different configuration if required to execute \p func.
|
| 1542 |
+
*
|
| 1543 |
+
* \p func must be a pointer to a function that executes on the device.
|
| 1544 |
+
* The parameter specified by \p func must be declared as a \p __global__
|
| 1545 |
+
* function. If the specified function does not exist,
|
| 1546 |
+
* then ::cudaErrorInvalidDeviceFunction is returned.
|
| 1547 |
+
*
|
| 1548 |
+
* This setting does nothing on devices where the size of the L1 cache and
|
| 1549 |
+
* shared memory are fixed.
|
| 1550 |
+
*
|
| 1551 |
+
* Launching a kernel with a different preference than the most recent
|
| 1552 |
+
* preference setting may insert a device-side synchronization point.
|
| 1553 |
+
*
|
| 1554 |
+
* The supported cache configurations are:
|
| 1555 |
+
* - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
|
| 1556 |
+
* - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
|
| 1557 |
+
* - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
|
| 1558 |
+
*
|
| 1559 |
+
* \param func - device function pointer
|
| 1560 |
+
* \param cacheConfig - Requested cache configuration
|
| 1561 |
+
*
|
| 1562 |
+
* \return
|
| 1563 |
+
* ::cudaSuccess,
|
| 1564 |
+
* ::cudaErrorInvalidDeviceFunction
|
| 1565 |
+
* \notefnerr
|
| 1566 |
+
* \note_init_rt
|
| 1567 |
+
* \note_callback
|
| 1568 |
+
*
|
| 1569 |
+
* \sa \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
|
| 1570 |
+
* \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
|
| 1571 |
+
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
|
| 1572 |
+
* ::cudaSetDoubleForDevice,
|
| 1573 |
+
* ::cudaSetDoubleForHost,
|
| 1574 |
+
* ::cudaThreadGetCacheConfig,
|
| 1575 |
+
* ::cudaThreadSetCacheConfig
|
| 1576 |
+
*/
|
| 1577 |
+
template<class T>
|
| 1578 |
+
static __inline__ __host__ cudaError_t cudaFuncSetCacheConfig(
|
| 1579 |
+
T *func,
|
| 1580 |
+
enum cudaFuncCache cacheConfig
|
| 1581 |
+
)
|
| 1582 |
+
{
|
| 1583 |
+
return ::cudaFuncSetCacheConfig((const void*)func, cacheConfig);
|
| 1584 |
+
}
|
| 1585 |
+
|
| 1586 |
+
template<class T>
|
| 1587 |
+
static __inline__
|
| 1588 |
+
__CUDA_DEPRECATED
|
| 1589 |
+
__host__ cudaError_t cudaFuncSetSharedMemConfig(
|
| 1590 |
+
T *func,
|
| 1591 |
+
enum cudaSharedMemConfig config
|
| 1592 |
+
)
|
| 1593 |
+
{
|
| 1594 |
+
#if defined(__GNUC__)
|
| 1595 |
+
#pragma GCC diagnostic push
|
| 1596 |
+
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
| 1597 |
+
#elif defined(_MSC_VER)
|
| 1598 |
+
#pragma warning(suppress: 4996)
|
| 1599 |
+
#endif
|
| 1600 |
+
return ::cudaFuncSetSharedMemConfig((const void*)func, config);
|
| 1601 |
+
#if defined(__GNUC__)
|
| 1602 |
+
#pragma GCC diagnostic pop
|
| 1603 |
+
#endif
|
| 1604 |
+
}
|
| 1605 |
+
|
| 1606 |
+
#endif // __CUDACC__
|
| 1607 |
+
|
| 1608 |
+
/**
|
| 1609 |
+
* \brief Returns occupancy for a device function
|
| 1610 |
+
*
|
| 1611 |
+
* Returns in \p *numBlocks the maximum number of active blocks per
|
| 1612 |
+
* streaming multiprocessor for the device function.
|
| 1613 |
+
*
|
| 1614 |
+
* \param numBlocks - Returned occupancy
|
| 1615 |
+
* \param func - Kernel function for which occupancy is calculated
|
| 1616 |
+
* \param blockSize - Block size the kernel is intended to be launched with
|
| 1617 |
+
* \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
|
| 1618 |
+
*
|
| 1619 |
+
* \return
|
| 1620 |
+
* ::cudaSuccess,
|
| 1621 |
+
* ::cudaErrorInvalidDevice,
|
| 1622 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 1623 |
+
* ::cudaErrorInvalidValue,
|
| 1624 |
+
* ::cudaErrorUnknown
|
| 1625 |
+
* \notefnerr
|
| 1626 |
+
* \note_init_rt
|
| 1627 |
+
* \note_callback
|
| 1628 |
+
*
|
| 1629 |
+
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
|
| 1630 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSize
|
| 1631 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
|
| 1632 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
|
| 1633 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
|
| 1634 |
+
* \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
|
| 1635 |
+
*/
|
| 1636 |
+
template<class T>
|
| 1637 |
+
static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
| 1638 |
+
int *numBlocks,
|
| 1639 |
+
T func,
|
| 1640 |
+
int blockSize,
|
| 1641 |
+
size_t dynamicSMemSize)
|
| 1642 |
+
{
|
| 1643 |
+
return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
|
| 1644 |
+
}
|
| 1645 |
+
|
| 1646 |
+
/**
|
| 1647 |
+
* \brief Returns occupancy for a device function with the specified flags
|
| 1648 |
+
*
|
| 1649 |
+
* Returns in \p *numBlocks the maximum number of active blocks per
|
| 1650 |
+
* streaming multiprocessor for the device function.
|
| 1651 |
+
*
|
| 1652 |
+
* The \p flags parameter controls how special cases are handled. Valid flags include:
|
| 1653 |
+
*
|
| 1654 |
+
* - ::cudaOccupancyDefault: keeps the default behavior as
|
| 1655 |
+
* ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
| 1656 |
+
*
|
| 1657 |
+
* - ::cudaOccupancyDisableCachingOverride: suppresses the default behavior
|
| 1658 |
+
* on platforms where global caching affects occupancy. On such platforms, if caching
|
| 1659 |
+
* is enabled, but per-block SM resource usage would result in zero occupancy, the
|
| 1660 |
+
* occupancy calculator will calculate the occupancy as if caching is disabled.
|
| 1661 |
+
* Setting this flag makes the occupancy calculator return 0 in such cases.
|
| 1662 |
+
* More information can be found about this feature in the "Unified L1/Texture Cache"
|
| 1663 |
+
* section of the Maxwell tuning guide.
|
| 1664 |
+
*
|
| 1665 |
+
* \param numBlocks - Returned occupancy
|
| 1666 |
+
* \param func - Kernel function for which occupancy is calculated
|
| 1667 |
+
* \param blockSize - Block size the kernel is intended to be launched with
|
| 1668 |
+
* \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
|
| 1669 |
+
* \param flags - Requested behavior for the occupancy calculator
|
| 1670 |
+
*
|
| 1671 |
+
* \return
|
| 1672 |
+
* ::cudaSuccess,
|
| 1673 |
+
* ::cudaErrorInvalidDevice,
|
| 1674 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 1675 |
+
* ::cudaErrorInvalidValue,
|
| 1676 |
+
* ::cudaErrorUnknown
|
| 1677 |
+
* \notefnerr
|
| 1678 |
+
* \note_init_rt
|
| 1679 |
+
* \note_callback
|
| 1680 |
+
*
|
| 1681 |
+
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
| 1682 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSize
|
| 1683 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
|
| 1684 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
|
| 1685 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
|
| 1686 |
+
* \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
|
| 1687 |
+
*/
|
| 1688 |
+
template<class T>
|
| 1689 |
+
static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
| 1690 |
+
int *numBlocks,
|
| 1691 |
+
T func,
|
| 1692 |
+
int blockSize,
|
| 1693 |
+
size_t dynamicSMemSize,
|
| 1694 |
+
unsigned int flags)
|
| 1695 |
+
{
|
| 1696 |
+
return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, flags);
|
| 1697 |
+
}
|
| 1698 |
+
|
| 1699 |
+
/**
|
| 1700 |
+
* Helper functor for cudaOccupancyMaxPotentialBlockSize
|
| 1701 |
+
*/
|
| 1702 |
+
class __cudaOccupancyB2DHelper {
|
| 1703 |
+
size_t n;
|
| 1704 |
+
public:
|
| 1705 |
+
inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
|
| 1706 |
+
inline __host__ CUDART_DEVICE size_t operator()(int)
|
| 1707 |
+
{
|
| 1708 |
+
return n;
|
| 1709 |
+
}
|
| 1710 |
+
};
|
| 1711 |
+
|
| 1712 |
+
/**
|
| 1713 |
+
* \brief Returns grid and block size that achieves maximum potential occupancy for a device function
|
| 1714 |
+
*
|
| 1715 |
+
* Returns in \p *minGridSize and \p *blockSize a suggested grid /
|
| 1716 |
+
* block size pair that achieves the best potential occupancy
|
| 1717 |
+
* (i.e. the maximum number of active warps with the smallest number
|
| 1718 |
+
* of blocks).
|
| 1719 |
+
*
|
| 1720 |
+
* The \p flags parameter controls how special cases are handled. Valid flags include:
|
| 1721 |
+
*
|
| 1722 |
+
* - ::cudaOccupancyDefault: keeps the default behavior as
|
| 1723 |
+
* ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
|
| 1724 |
+
*
|
| 1725 |
+
* - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
|
| 1726 |
+
* on platforms where global caching affects occupancy. On such platforms, if caching
|
| 1727 |
+
* is enabled, but per-block SM resource usage would result in zero occupancy, the
|
| 1728 |
+
* occupancy calculator will calculate the occupancy as if caching is disabled.
|
| 1729 |
+
* Setting this flag makes the occupancy calculator return 0 in such cases.
|
| 1730 |
+
* More information can be found about this feature in the "Unified L1/Texture Cache"
|
| 1731 |
+
* section of the Maxwell tuning guide.
|
| 1732 |
+
*
|
| 1733 |
+
* \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
|
| 1734 |
+
* \param blockSize - Returned block size
|
| 1735 |
+
* \param func - Device function symbol
|
| 1736 |
+
* \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
|
| 1737 |
+
* \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit.
|
| 1738 |
+
* \param flags - Requested behavior for the occupancy calculator
|
| 1739 |
+
*
|
| 1740 |
+
* \return
|
| 1741 |
+
* ::cudaSuccess,
|
| 1742 |
+
* ::cudaErrorInvalidDevice,
|
| 1743 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 1744 |
+
* ::cudaErrorInvalidValue,
|
| 1745 |
+
* ::cudaErrorUnknown
|
| 1746 |
+
* \notefnerr
|
| 1747 |
+
* \note_init_rt
|
| 1748 |
+
* \note_callback
|
| 1749 |
+
*
|
| 1750 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
|
| 1751 |
+
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
| 1752 |
+
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
|
| 1753 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSize
|
| 1754 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
|
| 1755 |
+
* \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
|
| 1756 |
+
*/
|
| 1757 |
+
|
| 1758 |
+
template<typename UnaryFunction, class T>
|
| 1759 |
+
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
|
| 1760 |
+
int *minGridSize,
|
| 1761 |
+
int *blockSize,
|
| 1762 |
+
T func,
|
| 1763 |
+
UnaryFunction blockSizeToDynamicSMemSize,
|
| 1764 |
+
int blockSizeLimit = 0,
|
| 1765 |
+
unsigned int flags = 0)
|
| 1766 |
+
{
|
| 1767 |
+
cudaError_t status;
|
| 1768 |
+
|
| 1769 |
+
// Device and function properties
|
| 1770 |
+
int device;
|
| 1771 |
+
struct cudaFuncAttributes attr;
|
| 1772 |
+
|
| 1773 |
+
// Limits
|
| 1774 |
+
int maxThreadsPerMultiProcessor;
|
| 1775 |
+
int warpSize;
|
| 1776 |
+
int devMaxThreadsPerBlock;
|
| 1777 |
+
int multiProcessorCount;
|
| 1778 |
+
int funcMaxThreadsPerBlock;
|
| 1779 |
+
int occupancyLimit;
|
| 1780 |
+
int granularity;
|
| 1781 |
+
|
| 1782 |
+
// Recorded maximum
|
| 1783 |
+
int maxBlockSize = 0;
|
| 1784 |
+
int numBlocks = 0;
|
| 1785 |
+
int maxOccupancy = 0;
|
| 1786 |
+
|
| 1787 |
+
// Temporary
|
| 1788 |
+
int blockSizeToTryAligned;
|
| 1789 |
+
int blockSizeToTry;
|
| 1790 |
+
int blockSizeLimitAligned;
|
| 1791 |
+
int occupancyInBlocks;
|
| 1792 |
+
int occupancyInThreads;
|
| 1793 |
+
size_t dynamicSMemSize;
|
| 1794 |
+
|
| 1795 |
+
///////////////////////////
|
| 1796 |
+
// Check user input
|
| 1797 |
+
///////////////////////////
|
| 1798 |
+
|
| 1799 |
+
if (!minGridSize || !blockSize || !func) {
|
| 1800 |
+
return cudaErrorInvalidValue;
|
| 1801 |
+
}
|
| 1802 |
+
|
| 1803 |
+
//////////////////////////////////////////////
|
| 1804 |
+
// Obtain device and function properties
|
| 1805 |
+
//////////////////////////////////////////////
|
| 1806 |
+
|
| 1807 |
+
status = ::cudaGetDevice(&device);
|
| 1808 |
+
if (status != cudaSuccess) {
|
| 1809 |
+
return status;
|
| 1810 |
+
}
|
| 1811 |
+
|
| 1812 |
+
status = cudaDeviceGetAttribute(
|
| 1813 |
+
&maxThreadsPerMultiProcessor,
|
| 1814 |
+
cudaDevAttrMaxThreadsPerMultiProcessor,
|
| 1815 |
+
device);
|
| 1816 |
+
if (status != cudaSuccess) {
|
| 1817 |
+
return status;
|
| 1818 |
+
}
|
| 1819 |
+
|
| 1820 |
+
status = cudaDeviceGetAttribute(
|
| 1821 |
+
&warpSize,
|
| 1822 |
+
cudaDevAttrWarpSize,
|
| 1823 |
+
device);
|
| 1824 |
+
if (status != cudaSuccess) {
|
| 1825 |
+
return status;
|
| 1826 |
+
}
|
| 1827 |
+
|
| 1828 |
+
status = cudaDeviceGetAttribute(
|
| 1829 |
+
&devMaxThreadsPerBlock,
|
| 1830 |
+
cudaDevAttrMaxThreadsPerBlock,
|
| 1831 |
+
device);
|
| 1832 |
+
if (status != cudaSuccess) {
|
| 1833 |
+
return status;
|
| 1834 |
+
}
|
| 1835 |
+
|
| 1836 |
+
status = cudaDeviceGetAttribute(
|
| 1837 |
+
&multiProcessorCount,
|
| 1838 |
+
cudaDevAttrMultiProcessorCount,
|
| 1839 |
+
device);
|
| 1840 |
+
if (status != cudaSuccess) {
|
| 1841 |
+
return status;
|
| 1842 |
+
}
|
| 1843 |
+
|
| 1844 |
+
status = cudaFuncGetAttributes(&attr, func);
|
| 1845 |
+
if (status != cudaSuccess) {
|
| 1846 |
+
return status;
|
| 1847 |
+
}
|
| 1848 |
+
|
| 1849 |
+
funcMaxThreadsPerBlock = attr.maxThreadsPerBlock;
|
| 1850 |
+
|
| 1851 |
+
/////////////////////////////////////////////////////////////////////////////////
|
| 1852 |
+
// Try each block size, and pick the block size with maximum occupancy
|
| 1853 |
+
/////////////////////////////////////////////////////////////////////////////////
|
| 1854 |
+
|
| 1855 |
+
occupancyLimit = maxThreadsPerMultiProcessor;
|
| 1856 |
+
granularity = warpSize;
|
| 1857 |
+
|
| 1858 |
+
if (blockSizeLimit == 0) {
|
| 1859 |
+
blockSizeLimit = devMaxThreadsPerBlock;
|
| 1860 |
+
}
|
| 1861 |
+
|
| 1862 |
+
if (devMaxThreadsPerBlock < blockSizeLimit) {
|
| 1863 |
+
blockSizeLimit = devMaxThreadsPerBlock;
|
| 1864 |
+
}
|
| 1865 |
+
|
| 1866 |
+
if (funcMaxThreadsPerBlock < blockSizeLimit) {
|
| 1867 |
+
blockSizeLimit = funcMaxThreadsPerBlock;
|
| 1868 |
+
}
|
| 1869 |
+
|
| 1870 |
+
blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity;
|
| 1871 |
+
|
| 1872 |
+
for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
|
| 1873 |
+
// This is needed for the first iteration, because
|
| 1874 |
+
// blockSizeLimitAligned could be greater than blockSizeLimit
|
| 1875 |
+
//
|
| 1876 |
+
if (blockSizeLimit < blockSizeToTryAligned) {
|
| 1877 |
+
blockSizeToTry = blockSizeLimit;
|
| 1878 |
+
} else {
|
| 1879 |
+
blockSizeToTry = blockSizeToTryAligned;
|
| 1880 |
+
}
|
| 1881 |
+
|
| 1882 |
+
dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
|
| 1883 |
+
|
| 1884 |
+
status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
| 1885 |
+
&occupancyInBlocks,
|
| 1886 |
+
func,
|
| 1887 |
+
blockSizeToTry,
|
| 1888 |
+
dynamicSMemSize,
|
| 1889 |
+
flags);
|
| 1890 |
+
|
| 1891 |
+
if (status != cudaSuccess) {
|
| 1892 |
+
return status;
|
| 1893 |
+
}
|
| 1894 |
+
|
| 1895 |
+
occupancyInThreads = blockSizeToTry * occupancyInBlocks;
|
| 1896 |
+
|
| 1897 |
+
if (occupancyInThreads > maxOccupancy) {
|
| 1898 |
+
maxBlockSize = blockSizeToTry;
|
| 1899 |
+
numBlocks = occupancyInBlocks;
|
| 1900 |
+
maxOccupancy = occupancyInThreads;
|
| 1901 |
+
}
|
| 1902 |
+
|
| 1903 |
+
// Early out if we have reached the maximum
|
| 1904 |
+
//
|
| 1905 |
+
if (occupancyLimit == maxOccupancy) {
|
| 1906 |
+
break;
|
| 1907 |
+
}
|
| 1908 |
+
}
|
| 1909 |
+
|
| 1910 |
+
///////////////////////////
|
| 1911 |
+
// Return best available
|
| 1912 |
+
///////////////////////////
|
| 1913 |
+
|
| 1914 |
+
// Suggested min grid size to achieve a full machine launch
|
| 1915 |
+
//
|
| 1916 |
+
*minGridSize = numBlocks * multiProcessorCount;
|
| 1917 |
+
*blockSize = maxBlockSize;
|
| 1918 |
+
|
| 1919 |
+
return status;
|
| 1920 |
+
}
|
| 1921 |
+
|
| 1922 |
+
/**
|
| 1923 |
+
* \brief Returns grid and block size that achieves maximum potential occupancy for a device function
|
| 1924 |
+
*
|
| 1925 |
+
* Returns in \p *minGridSize and \p *blockSize a suggested grid /
|
| 1926 |
+
* block size pair that achieves the best potential occupancy
|
| 1927 |
+
* (i.e. the maximum number of active warps with the smallest number
|
| 1928 |
+
* of blocks).
|
| 1929 |
+
*
|
| 1930 |
+
* \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
|
| 1931 |
+
* \param blockSize - Returned block size
|
| 1932 |
+
* \param func - Device function symbol
|
| 1933 |
+
* \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
|
| 1934 |
+
* \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit.
|
| 1935 |
+
*
|
| 1936 |
+
* \return
|
| 1937 |
+
* ::cudaSuccess,
|
| 1938 |
+
* ::cudaErrorInvalidDevice,
|
| 1939 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 1940 |
+
* ::cudaErrorInvalidValue,
|
| 1941 |
+
* ::cudaErrorUnknown,
|
| 1942 |
+
* \notefnerr
|
| 1943 |
+
* \note_init_rt
|
| 1944 |
+
* \note_callback
|
| 1945 |
+
*
|
| 1946 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
|
| 1947 |
+
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
| 1948 |
+
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
|
| 1949 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSize
|
| 1950 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
|
| 1951 |
+
* \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
|
| 1952 |
+
*/
|
| 1953 |
+
|
| 1954 |
+
template<typename UnaryFunction, class T>
|
| 1955 |
+
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem(
|
| 1956 |
+
int *minGridSize,
|
| 1957 |
+
int *blockSize,
|
| 1958 |
+
T func,
|
| 1959 |
+
UnaryFunction blockSizeToDynamicSMemSize,
|
| 1960 |
+
int blockSizeLimit = 0)
|
| 1961 |
+
{
|
| 1962 |
+
return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, blockSizeLimit, cudaOccupancyDefault);
|
| 1963 |
+
}
|
| 1964 |
+
|
| 1965 |
+
/**
|
| 1966 |
+
* \brief Returns grid and block size that achieves maximum potential occupancy for a device function
|
| 1967 |
+
*
|
| 1968 |
+
* Returns in \p *minGridSize and \p *blockSize a suggested grid /
|
| 1969 |
+
* block size pair that achieves the best potential occupancy
|
| 1970 |
+
* (i.e. the maximum number of active warps with the smallest number
|
| 1971 |
+
* of blocks).
|
| 1972 |
+
*
|
| 1973 |
+
* Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
|
| 1974 |
+
* amount of per-block dynamic shared memory changes with different
|
| 1975 |
+
* block sizes.
|
| 1976 |
+
*
|
| 1977 |
+
* \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
|
| 1978 |
+
* \param blockSize - Returned block size
|
| 1979 |
+
* \param func - Device function symbol
|
| 1980 |
+
* \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
|
| 1981 |
+
* \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit.
|
| 1982 |
+
*
|
| 1983 |
+
* \return
|
| 1984 |
+
* ::cudaSuccess,
|
| 1985 |
+
* ::cudaErrorInvalidDevice,
|
| 1986 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 1987 |
+
* ::cudaErrorInvalidValue,
|
| 1988 |
+
* ::cudaErrorUnknown,
|
| 1989 |
+
* \notefnerr
|
| 1990 |
+
* \note_init_rt
|
| 1991 |
+
* \note_callback
|
| 1992 |
+
*
|
| 1993 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
|
| 1994 |
+
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
| 1995 |
+
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
|
| 1996 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
|
| 1997 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
|
| 1998 |
+
* \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
|
| 1999 |
+
*/
|
| 2000 |
+
template<class T>
|
| 2001 |
+
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize(
|
| 2002 |
+
int *minGridSize,
|
| 2003 |
+
int *blockSize,
|
| 2004 |
+
T func,
|
| 2005 |
+
size_t dynamicSMemSize = 0,
|
| 2006 |
+
int blockSizeLimit = 0)
|
| 2007 |
+
{
|
| 2008 |
+
return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
|
| 2009 |
+
}
|
| 2010 |
+
|
| 2011 |
+
/**
|
| 2012 |
+
* \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM.
|
| 2013 |
+
*
|
| 2014 |
+
* Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM.
|
| 2015 |
+
*
|
| 2016 |
+
* \param dynamicSmemSize - Returned maximum dynamic shared memory
|
| 2017 |
+
* \param func - Kernel function for which occupancy is calculated
|
| 2018 |
+
* \param numBlocks - Number of blocks to fit on SM
|
| 2019 |
+
* \param blockSize - Size of the block
|
| 2020 |
+
*
|
| 2021 |
+
* \return
|
| 2022 |
+
* ::cudaSuccess,
|
| 2023 |
+
* ::cudaErrorInvalidDevice,
|
| 2024 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 2025 |
+
* ::cudaErrorInvalidValue,
|
| 2026 |
+
* ::cudaErrorUnknown,
|
| 2027 |
+
* \notefnerr
|
| 2028 |
+
* \note_init_rt
|
| 2029 |
+
* \note_callback
|
| 2030 |
+
*
|
| 2031 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSize
|
| 2032 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
|
| 2033 |
+
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
| 2034 |
+
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
|
| 2035 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
|
| 2036 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
|
| 2037 |
+
*/
|
| 2038 |
+
template<class T>
|
| 2039 |
+
static __inline__ __host__ cudaError_t cudaOccupancyAvailableDynamicSMemPerBlock(
|
| 2040 |
+
size_t *dynamicSmemSize,
|
| 2041 |
+
T func,
|
| 2042 |
+
int numBlocks,
|
| 2043 |
+
int blockSize)
|
| 2044 |
+
{
|
| 2045 |
+
return ::cudaOccupancyAvailableDynamicSMemPerBlock(dynamicSmemSize, (const void*)func, numBlocks, blockSize);
|
| 2046 |
+
}
|
| 2047 |
+
|
| 2048 |
+
/**
|
| 2049 |
+
* \brief Returns grid and block size that achieves maximum potential occupancy for a device function with the specified flags
|
| 2050 |
+
*
|
| 2051 |
+
* Returns in \p *minGridSize and \p *blockSize a suggested grid /
|
| 2052 |
+
* block size pair that achieves the best potential occupancy
|
| 2053 |
+
* (i.e. the maximum number of active warps with the smallest number
|
| 2054 |
+
* of blocks).
|
| 2055 |
+
*
|
| 2056 |
+
* The \p flags parameter controls how special cases are handled. Valid flags include:
|
| 2057 |
+
*
|
| 2058 |
+
* - ::cudaOccupancyDefault: keeps the default behavior as
|
| 2059 |
+
* ::cudaOccupancyMaxPotentialBlockSize
|
| 2060 |
+
*
|
| 2061 |
+
* - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
|
| 2062 |
+
* on platforms where global caching affects occupancy. On such platforms, if caching
|
| 2063 |
+
* is enabled, but per-block SM resource usage would result in zero occupancy, the
|
| 2064 |
+
* occupancy calculator will calculate the occupancy as if caching is disabled.
|
| 2065 |
+
* Setting this flag makes the occupancy calculator return 0 in such cases.
|
| 2066 |
+
* More information can be found about this feature in the "Unified L1/Texture Cache"
|
| 2067 |
+
* section of the Maxwell tuning guide.
|
| 2068 |
+
*
|
| 2069 |
+
* Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
|
| 2070 |
+
* amount of per-block dynamic shared memory changes with different
|
| 2071 |
+
* block sizes.
|
| 2072 |
+
*
|
| 2073 |
+
* \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
|
| 2074 |
+
* \param blockSize - Returned block size
|
| 2075 |
+
* \param func - Device function symbol
|
| 2076 |
+
* \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
|
| 2077 |
+
* \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit.
|
| 2078 |
+
* \param flags - Requested behavior for the occupancy calculator
|
| 2079 |
+
*
|
| 2080 |
+
* \return
|
| 2081 |
+
* ::cudaSuccess,
|
| 2082 |
+
* ::cudaErrorInvalidDevice,
|
| 2083 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 2084 |
+
* ::cudaErrorInvalidValue,
|
| 2085 |
+
* ::cudaErrorUnknown,
|
| 2086 |
+
* \notefnerr
|
| 2087 |
+
* \note_init_rt
|
| 2088 |
+
* \note_callback
|
| 2089 |
+
*
|
| 2090 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSize
|
| 2091 |
+
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
| 2092 |
+
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
|
| 2093 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
|
| 2094 |
+
* \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
|
| 2095 |
+
* \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
|
| 2096 |
+
*/
|
| 2097 |
+
template<class T>
|
| 2098 |
+
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeWithFlags(
|
| 2099 |
+
int *minGridSize,
|
| 2100 |
+
int *blockSize,
|
| 2101 |
+
T func,
|
| 2102 |
+
size_t dynamicSMemSize = 0,
|
| 2103 |
+
int blockSizeLimit = 0,
|
| 2104 |
+
unsigned int flags = 0)
|
| 2105 |
+
{
|
| 2106 |
+
return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, flags);
|
| 2107 |
+
}
|
| 2108 |
+
|
| 2109 |
+
/**
|
| 2110 |
+
* \brief Given the kernel function (\p func) and launch configuration
|
| 2111 |
+
* (\p config), return the maximum cluster size in \p *clusterSize.
|
| 2112 |
+
*
|
| 2113 |
+
* The cluster dimensions in \p config are ignored. If func has a required
|
| 2114 |
+
* cluster size set (see ::cudaFuncGetAttributes), \p *clusterSize will reflect
|
| 2115 |
+
* the required cluster size.
|
| 2116 |
+
*
|
| 2117 |
+
* By default this function will always return a value that's portable on
|
| 2118 |
+
* future hardware. A higher value may be returned if the kernel function
|
| 2119 |
+
* allows non-portable cluster sizes.
|
| 2120 |
+
*
|
| 2121 |
+
* This function will respect the compile time launch bounds.
|
| 2122 |
+
*
|
| 2123 |
+
* \param clusterSize - Returned maximum cluster size that can be launched
|
| 2124 |
+
* for the given kernel function and launch configuration
|
| 2125 |
+
* \param func - Kernel function for which maximum cluster
|
| 2126 |
+
* size is calculated
|
| 2127 |
+
* \param config - Launch configuration for the given kernel function
|
| 2128 |
+
*
|
| 2129 |
+
* \return
|
| 2130 |
+
* ::cudaSuccess,
|
| 2131 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 2132 |
+
* ::cudaErrorInvalidValue,
|
| 2133 |
+
* ::cudaErrorUnknown,
|
| 2134 |
+
* \notefnerr
|
| 2135 |
+
* \note_init_rt
|
| 2136 |
+
* \note_callback
|
| 2137 |
+
*
|
| 2138 |
+
* \sa
|
| 2139 |
+
* ::cudaFuncGetAttributes
|
| 2140 |
+
*/
|
| 2141 |
+
template<class T>
|
| 2142 |
+
static __inline__ __host__ cudaError_t cudaOccupancyMaxPotentialClusterSize(
|
| 2143 |
+
int *clusterSize,
|
| 2144 |
+
T *func,
|
| 2145 |
+
const cudaLaunchConfig_t *config)
|
| 2146 |
+
{
|
| 2147 |
+
return ::cudaOccupancyMaxPotentialClusterSize(clusterSize, (const void*)func, config);
|
| 2148 |
+
}
|
| 2149 |
+
|
| 2150 |
+
/**
|
| 2151 |
+
* \brief Given the kernel function (\p func) and launch configuration
|
| 2152 |
+
* (\p config), return the maximum number of clusters that could co-exist
|
| 2153 |
+
* on the target device in \p *numClusters.
|
| 2154 |
+
*
|
| 2155 |
+
* If the function has required cluster size already set (see
|
| 2156 |
+
* ::cudaFuncGetAttributes), the cluster size from config must either be
|
| 2157 |
+
* unspecified or match the required size.
|
| 2158 |
+
* Without required sizes, the cluster size must be specified in config,
|
| 2159 |
+
* else the function will return an error.
|
| 2160 |
+
*
|
| 2161 |
+
* Note that various attributes of the kernel function may affect occupancy
|
| 2162 |
+
* calculation. Runtime environment may affect how the hardware schedules
|
| 2163 |
+
* the clusters, so the calculated occupancy is not guaranteed to be achievable.
|
| 2164 |
+
*
|
| 2165 |
+
* \param numClusters - Returned maximum number of clusters that
|
| 2166 |
+
* could co-exist on the target device
|
| 2167 |
+
* \param func - Kernel function for which maximum number
|
| 2168 |
+
* of clusters is calculated
|
| 2169 |
+
* \param config - Launch configuration for the given kernel function
|
| 2170 |
+
*
|
| 2171 |
+
* \return
|
| 2172 |
+
* ::cudaSuccess,
|
| 2173 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 2174 |
+
* ::cudaErrorInvalidValue,
|
| 2175 |
+
* ::cudaErrorInvalidClusterSize,
|
| 2176 |
+
* ::cudaErrorUnknown,
|
| 2177 |
+
* \notefnerr
|
| 2178 |
+
* \note_init_rt
|
| 2179 |
+
* \note_callback
|
| 2180 |
+
*
|
| 2181 |
+
* \sa
|
| 2182 |
+
* ::cudaFuncGetAttributes
|
| 2183 |
+
*/
|
| 2184 |
+
template<class T>
|
| 2185 |
+
static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveClusters(
|
| 2186 |
+
int *numClusters,
|
| 2187 |
+
T *func,
|
| 2188 |
+
const cudaLaunchConfig_t *config)
|
| 2189 |
+
{
|
| 2190 |
+
return ::cudaOccupancyMaxActiveClusters(numClusters, (const void*)func, config);
|
| 2191 |
+
}
|
| 2192 |
+
|
| 2193 |
+
#if defined __CUDACC__
|
| 2194 |
+
|
| 2195 |
+
/**
|
| 2196 |
+
* \brief \hl Find out attributes for a given function
|
| 2197 |
+
*
|
| 2198 |
+
* This function obtains the attributes of a function specified via \p entry.
|
| 2199 |
+
* The parameter \p entry must be a pointer to a function that executes
|
| 2200 |
+
* on the device. The parameter specified by \p entry must be declared as a \p __global__
|
| 2201 |
+
* function. The fetched attributes are placed in \p attr. If the specified
|
| 2202 |
+
* function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
|
| 2203 |
+
*
|
| 2204 |
+
* Note that some function attributes such as
|
| 2205 |
+
* \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock"
|
| 2206 |
+
* may vary based on the device that is currently being used.
|
| 2207 |
+
*
|
| 2208 |
+
* \param attr - Return pointer to function's attributes
|
| 2209 |
+
* \param entry - Function to get attributes of
|
| 2210 |
+
*
|
| 2211 |
+
* \return
|
| 2212 |
+
* ::cudaSuccess,
|
| 2213 |
+
* ::cudaErrorInvalidDeviceFunction
|
| 2214 |
+
* \notefnerr
|
| 2215 |
+
* \note_init_rt
|
| 2216 |
+
* \note_callback
|
| 2217 |
+
*
|
| 2218 |
+
* \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
|
| 2219 |
+
* \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
|
| 2220 |
+
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
|
| 2221 |
+
* ::cudaSetDoubleForDevice,
|
| 2222 |
+
* ::cudaSetDoubleForHost
|
| 2223 |
+
*/
|
| 2224 |
+
template<class T>
|
| 2225 |
+
static __inline__ __host__ cudaError_t cudaFuncGetAttributes(
|
| 2226 |
+
struct cudaFuncAttributes *attr,
|
| 2227 |
+
T *entry
|
| 2228 |
+
)
|
| 2229 |
+
{
|
| 2230 |
+
return ::cudaFuncGetAttributes(attr, (const void*)entry);
|
| 2231 |
+
}
|
| 2232 |
+
|
| 2233 |
+
/**
|
| 2234 |
+
* \brief \hl Set attributes for a given function
|
| 2235 |
+
*
|
| 2236 |
+
* This function sets the attributes of a function specified via \p entry.
|
| 2237 |
+
* The parameter \p entry must be a pointer to a function that executes
|
| 2238 |
+
* on the device. The parameter specified by \p entry must be declared as a \p __global__
|
| 2239 |
+
* function. The enumeration defined by \p attr is set to the value defined by \p value.
|
| 2240 |
+
* If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
|
| 2241 |
+
* If the specified attribute cannot be written, or if the value is incorrect,
|
| 2242 |
+
* then ::cudaErrorInvalidValue is returned.
|
| 2243 |
+
*
|
| 2244 |
+
* Valid values for \p attr are:
|
| 2245 |
+
* - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. The sum of this value and the function attribute ::sharedSizeBytes
|
| 2246 |
+
* cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture.
|
| 2247 |
+
* - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources,
|
| 2248 |
+
* this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor.
|
| 2249 |
+
* This is only a hint, and the driver can choose a different ratio if required to execute the function.
|
| 2250 |
+
* - ::cudaFuncAttributeRequiredClusterWidth: The required cluster width in
|
| 2251 |
+
* blocks. The width, height, and depth values must either all be 0 or all be
|
| 2252 |
+
* positive. The validity of the cluster dimensions is checked at launch time.
|
| 2253 |
+
* If the value is set during compile time, it cannot be set at runtime.
|
| 2254 |
+
* Setting it at runtime will return cudaErrorNotPermitted.
|
| 2255 |
+
* - ::cudaFuncAttributeRequiredClusterHeight: The required cluster height in
|
| 2256 |
+
* blocks. The width, height, and depth values must either all be 0 or all be
|
| 2257 |
+
* positive. The validity of the cluster dimensions is checked at launch time.
|
| 2258 |
+
* If the value is set during compile time, it cannot be set at runtime.
|
| 2259 |
+
* Setting it at runtime will return cudaErrorNotPermitted.
|
| 2260 |
+
* - ::cudaFuncAttributeRequiredClusterDepth: The required cluster depth in
|
| 2261 |
+
* blocks. The width, height, and depth values must either all be 0 or all be
|
| 2262 |
+
* positive. The validity of the cluster dimensions is checked at launch time.
|
| 2263 |
+
* If the value is set during compile time, it cannot be set at runtime.
|
| 2264 |
+
* Setting it at runtime will return cudaErrorNotPermitted.
|
| 2265 |
+
* - ::cudaFuncAttributeClusterSchedulingPolicyPreference: The block
|
| 2266 |
+
* scheduling policy of a function. The value type is cudaClusterSchedulingPolicy.
|
| 2267 |
+
*
|
| 2268 |
+
* \param entry - Function to set attributes of
|
| 2269 |
+
* \param attr - Attribute to set
|
| 2270 |
+
* \param value - Value to set
|
| 2271 |
+
*
|
| 2272 |
+
* \return
|
| 2273 |
+
* ::cudaSuccess,
|
| 2274 |
+
* ::cudaErrorInvalidDeviceFunction,
|
| 2275 |
+
* ::cudaErrorInvalidValue
|
| 2276 |
+
* \notefnerr
|
| 2277 |
+
* \note_init_rt
|
| 2278 |
+
* \note_callback
|
| 2279 |
+
*
|
| 2280 |
+
* \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
|
| 2281 |
+
* \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
|
| 2282 |
+
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
|
| 2283 |
+
* ::cudaSetDoubleForDevice,
|
| 2284 |
+
* ::cudaSetDoubleForHost
|
| 2285 |
+
*/
|
| 2286 |
+
template<class T>
|
| 2287 |
+
static __inline__ __host__ cudaError_t cudaFuncSetAttribute(
|
| 2288 |
+
T *entry,
|
| 2289 |
+
enum cudaFuncAttribute attr,
|
| 2290 |
+
int value
|
| 2291 |
+
)
|
| 2292 |
+
{
|
| 2293 |
+
return ::cudaFuncSetAttribute((const void*)entry, attr, value);
|
| 2294 |
+
}
|
| 2295 |
+
|
| 2296 |
+
/**
|
| 2297 |
+
* \brief Returns the function name for a device entry function pointer.
|
| 2298 |
+
*
|
| 2299 |
+
* Returns in \p **name the function name associated with the symbol \p func .
|
| 2300 |
+
* The function name is returned as a null-terminated string. This API may
|
| 2301 |
+
* return a mangled name if the function is not declared as having C linkage.
|
| 2302 |
+
* If \p **name is NULL, ::cudaErrorInvalidValue is returned. If \p func is
|
| 2303 |
+
* not a device entry function, ::cudaErrorInvalidDeviceFunction is returned.
|
| 2304 |
+
*
|
| 2305 |
+
* \param name - The returned name of the function
|
| 2306 |
+
* \param func - The function pointer to retrieve name for
|
| 2307 |
+
*
|
| 2308 |
+
* \return
|
| 2309 |
+
* ::cudaSuccess,
|
| 2310 |
+
* ::cudaErrorInvalidValue,
|
| 2311 |
+
* ::cudaErrorInvalidDeviceFunction
|
| 2312 |
+
* \notefnerr
|
| 2313 |
+
* \note_init_rt
|
| 2314 |
+
* \note_callback
|
| 2315 |
+
*
|
| 2316 |
+
* \ref ::cudaFuncGetName(const char **name, const void *func) "cudaFuncGetName (C API)"
|
| 2317 |
+
*/
|
| 2318 |
+
template<class T>
|
| 2319 |
+
static __inline__ __host__ cudaError_t CUDARTAPI cudaFuncGetName(
|
| 2320 |
+
const char **name,
|
| 2321 |
+
const T *func
|
| 2322 |
+
)
|
| 2323 |
+
{
|
| 2324 |
+
return ::cudaFuncGetName(name, (const void *)func);
|
| 2325 |
+
}
|
| 2326 |
+
|
| 2327 |
+
/**
|
| 2328 |
+
* \brief Get pointer to device kernel that matches entry function \p entryFuncAddr
|
| 2329 |
+
*
|
| 2330 |
+
* Returns in \p kernelPtr the device kernel corresponding to the entry function \p entryFuncAddr.
|
| 2331 |
+
*
|
| 2332 |
+
* \param kernelPtr - Returns the device kernel
|
| 2333 |
+
* \param entryFuncAddr - Address of device entry function to search kernel for
|
| 2334 |
+
*
|
| 2335 |
+
* \return
|
| 2336 |
+
* ::cudaSuccess
|
| 2337 |
+
*
|
| 2338 |
+
* \sa
|
| 2339 |
+
* \ref ::cudaGetKernel(cudaKernel_t *kernelPtr, const void *entryFuncAddr) "cudaGetKernel (C API)"
|
| 2340 |
+
*/
|
| 2341 |
+
template<class T>
|
| 2342 |
+
static __inline__ __host__ cudaError_t cudaGetKernel(
|
| 2343 |
+
cudaKernel_t *kernelPtr,
|
| 2344 |
+
const T *entryFuncAddr
|
| 2345 |
+
)
|
| 2346 |
+
{
|
| 2347 |
+
return ::cudaGetKernel(kernelPtr, (const void *)entryFuncAddr);
|
| 2348 |
+
}
|
| 2349 |
+
|
| 2350 |
+
#endif /* __CUDACC__ */
|
| 2351 |
+
|
| 2352 |
+
/** @} */ /* END CUDART_HIGHLEVEL */
|
| 2353 |
+
|
| 2354 |
+
#endif /* __cplusplus && !__CUDACC_RTC__ */
|
| 2355 |
+
|
| 2356 |
+
#if !defined(__CUDACC_RTC__)
|
| 2357 |
+
#if defined(__GNUC__)
|
| 2358 |
+
#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
|
| 2359 |
+
#pragma GCC diagnostic pop
|
| 2360 |
+
#endif
|
| 2361 |
+
#elif defined(_MSC_VER)
|
| 2362 |
+
#pragma warning(pop)
|
| 2363 |
+
#endif
|
| 2364 |
+
#endif
|
| 2365 |
+
|
| 2366 |
+
#undef EXCLUDE_FROM_RTC
|
| 2367 |
+
#undef __CUDA_DEPRECATED
|
| 2368 |
+
|
| 2369 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__)
|
| 2370 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 2371 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__
|
| 2372 |
+
#endif
|
| 2373 |
+
|
| 2374 |
+
#endif /* !__CUDA_RUNTIME_H__ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime_api.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_surface_types.h
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_SURFACE_TYPES_H__)
|
| 51 |
+
#define __CUDA_SURFACE_TYPES_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 54 |
+
|
| 55 |
+
/*******************************************************************************
|
| 56 |
+
* *
|
| 57 |
+
* *
|
| 58 |
+
* *
|
| 59 |
+
*******************************************************************************/
|
| 60 |
+
|
| 61 |
+
#if !defined(__CUDACC_RTC__)
|
| 62 |
+
#define EXCLUDE_FROM_RTC
|
| 63 |
+
#include "channel_descriptor.h"
|
| 64 |
+
#undef EXCLUDE_FROM_RTC
|
| 65 |
+
#endif /* !__CUDACC_RTC__ */
|
| 66 |
+
#include "cuda_runtime_api.h"
|
| 67 |
+
|
| 68 |
+
/*******************************************************************************
|
| 69 |
+
* *
|
| 70 |
+
* *
|
| 71 |
+
* *
|
| 72 |
+
*******************************************************************************/
|
| 73 |
+
|
| 74 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 75 |
+
|
| 76 |
+
#endif /* !__CUDA_SURFACE_TYPES_H__ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_TEXTURE_TYPES_H__)
|
| 51 |
+
#define __CUDA_TEXTURE_TYPES_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 54 |
+
|
| 55 |
+
/*******************************************************************************
|
| 56 |
+
* *
|
| 57 |
+
* *
|
| 58 |
+
* *
|
| 59 |
+
*******************************************************************************/
|
| 60 |
+
|
| 61 |
+
#if !defined(__CUDACC_RTC__)
|
| 62 |
+
#define EXCLUDE_FROM_RTC
|
| 63 |
+
#include "channel_descriptor.h"
|
| 64 |
+
#undef EXCLUDE_FROM_RTC
|
| 65 |
+
#endif /* !__CUDACC_RTC__ */
|
| 66 |
+
#include "cuda_runtime_api.h"
|
| 67 |
+
|
| 68 |
+
/*******************************************************************************
|
| 69 |
+
* *
|
| 70 |
+
* *
|
| 71 |
+
* *
|
| 72 |
+
*******************************************************************************/
|
| 73 |
+
|
| 74 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 75 |
+
|
| 76 |
+
#endif /* !__CUDA_TEXTURE_TYPES_H__ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_vdpau_interop.h
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_VDPAU_INTEROP_H__)
|
| 51 |
+
#define __CUDA_VDPAU_INTEROP_H__
|
| 52 |
+
|
| 53 |
+
#include "cuda_runtime_api.h"
|
| 54 |
+
|
| 55 |
+
#include <vdpau/vdpau.h>
|
| 56 |
+
|
| 57 |
+
#if defined(__cplusplus)
|
| 58 |
+
extern "C" {
|
| 59 |
+
#endif /* __cplusplus */
|
| 60 |
+
|
| 61 |
+
/**
|
| 62 |
+
* \addtogroup CUDART_VDPAU VDPAU Interoperability
|
| 63 |
+
* This section describes the VDPAU interoperability functions of the CUDA
|
| 64 |
+
* runtime application programming interface.
|
| 65 |
+
*
|
| 66 |
+
* @{
|
| 67 |
+
*/
|
| 68 |
+
|
| 69 |
+
/**
|
| 70 |
+
* \brief Gets the CUDA device associated with a VdpDevice.
|
| 71 |
+
*
|
| 72 |
+
* Returns the CUDA device associated with a VdpDevice, if applicable.
|
| 73 |
+
*
|
| 74 |
+
* \param device - Returns the device associated with vdpDevice, or -1 if
|
| 75 |
+
* the device associated with vdpDevice is not a compute device.
|
| 76 |
+
* \param vdpDevice - A VdpDevice handle
|
| 77 |
+
* \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
|
| 78 |
+
*
|
| 79 |
+
* \return
|
| 80 |
+
* ::cudaSuccess
|
| 81 |
+
* \notefnerr
|
| 82 |
+
*
|
| 83 |
+
* \sa
|
| 84 |
+
* ::cudaVDPAUSetVDPAUDevice,
|
| 85 |
+
* ::cuVDPAUGetDevice
|
| 86 |
+
*/
|
| 87 |
+
extern __host__ cudaError_t CUDARTAPI cudaVDPAUGetDevice(int *device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 88 |
+
|
| 89 |
+
/**
|
| 90 |
+
* \brief Sets a CUDA device to use VDPAU interoperability
|
| 91 |
+
*
|
| 92 |
+
* Records \p vdpDevice as the VdpDevice for VDPAU interoperability
|
| 93 |
+
* with the CUDA device \p device and sets \p device as the current
|
| 94 |
+
* device for the calling host thread.
|
| 95 |
+
*
|
| 96 |
+
* This function will immediately initialize the primary context on
|
| 97 |
+
* \p device if needed.
|
| 98 |
+
*
|
| 99 |
+
* If \p device has already been initialized then this call will fail
|
| 100 |
+
* with the error ::cudaErrorSetOnActiveProcess. In this case it is
|
| 101 |
+
* necessary to reset \p device using ::cudaDeviceReset() before
|
| 102 |
+
* VDPAU interoperability on \p device may be enabled.
|
| 103 |
+
*
|
| 104 |
+
* \param device - Device to use for VDPAU interoperability
|
| 105 |
+
* \param vdpDevice - The VdpDevice to interoperate with
|
| 106 |
+
* \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
|
| 107 |
+
*
|
| 108 |
+
* \return
|
| 109 |
+
* ::cudaSuccess,
|
| 110 |
+
* ::cudaErrorInvalidDevice,
|
| 111 |
+
* ::cudaErrorSetOnActiveProcess
|
| 112 |
+
* \notefnerr
|
| 113 |
+
*
|
| 114 |
+
* \sa ::cudaGraphicsVDPAURegisterVideoSurface,
|
| 115 |
+
* ::cudaGraphicsVDPAURegisterOutputSurface,
|
| 116 |
+
* ::cudaDeviceReset
|
| 117 |
+
*/
|
| 118 |
+
extern __host__ cudaError_t CUDARTAPI cudaVDPAUSetVDPAUDevice(int device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 119 |
+
|
| 120 |
+
/**
|
| 121 |
+
* \brief Register a VdpVideoSurface object
|
| 122 |
+
*
|
| 123 |
+
* Registers the VdpVideoSurface specified by \p vdpSurface for access by CUDA.
|
| 124 |
+
* A handle to the registered object is returned as \p resource.
|
| 125 |
+
* The surface's intended usage is specified using \p flags, as follows:
|
| 126 |
+
*
|
| 127 |
+
* - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
|
| 128 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 129 |
+
* read from and written to by CUDA. This is the default value.
|
| 130 |
+
* - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
|
| 131 |
+
* will not write to this resource.
|
| 132 |
+
* - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
|
| 133 |
+
* CUDA will not read from this resource and will write over the
|
| 134 |
+
* entire contents of the resource, so none of the data previously
|
| 135 |
+
* stored in the resource will be preserved.
|
| 136 |
+
*
|
| 137 |
+
* \param resource - Pointer to the returned object handle
|
| 138 |
+
* \param vdpSurface - VDPAU object to be registered
|
| 139 |
+
* \param flags - Map flags
|
| 140 |
+
*
|
| 141 |
+
* \return
|
| 142 |
+
* ::cudaSuccess,
|
| 143 |
+
* ::cudaErrorInvalidDevice,
|
| 144 |
+
* ::cudaErrorInvalidValue,
|
| 145 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 146 |
+
* ::cudaErrorUnknown
|
| 147 |
+
* \notefnerr
|
| 148 |
+
*
|
| 149 |
+
* \sa
|
| 150 |
+
* ::cudaVDPAUSetVDPAUDevice,
|
| 151 |
+
* ::cudaGraphicsUnregisterResource,
|
| 152 |
+
* ::cudaGraphicsSubResourceGetMappedArray,
|
| 153 |
+
* ::cuGraphicsVDPAURegisterVideoSurface
|
| 154 |
+
*/
|
| 155 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterVideoSurface(struct cudaGraphicsResource **resource, VdpVideoSurface vdpSurface, unsigned int flags);
|
| 156 |
+
|
| 157 |
+
/**
|
| 158 |
+
* \brief Register a VdpOutputSurface object
|
| 159 |
+
*
|
| 160 |
+
* Registers the VdpOutputSurface specified by \p vdpSurface for access by CUDA.
|
| 161 |
+
* A handle to the registered object is returned as \p resource.
|
| 162 |
+
* The surface's intended usage is specified using \p flags, as follows:
|
| 163 |
+
*
|
| 164 |
+
* - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
|
| 165 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 166 |
+
* read from and written to by CUDA. This is the default value.
|
| 167 |
+
* - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
|
| 168 |
+
* will not write to this resource.
|
| 169 |
+
* - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
|
| 170 |
+
* CUDA will not read from this resource and will write over the
|
| 171 |
+
* entire contents of the resource, so none of the data previously
|
| 172 |
+
* stored in the resource will be preserved.
|
| 173 |
+
*
|
| 174 |
+
* \param resource - Pointer to the returned object handle
|
| 175 |
+
* \param vdpSurface - VDPAU object to be registered
|
| 176 |
+
* \param flags - Map flags
|
| 177 |
+
*
|
| 178 |
+
* \return
|
| 179 |
+
* ::cudaSuccess,
|
| 180 |
+
* ::cudaErrorInvalidDevice,
|
| 181 |
+
* ::cudaErrorInvalidValue,
|
| 182 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 183 |
+
* ::cudaErrorUnknown
|
| 184 |
+
* \notefnerr
|
| 185 |
+
*
|
| 186 |
+
* \sa
|
| 187 |
+
* ::cudaVDPAUSetVDPAUDevice,
|
| 188 |
+
* ::cudaGraphicsUnregisterResource,
|
| 189 |
+
* ::cudaGraphicsSubResourceGetMappedArray,
|
| 190 |
+
* ::cuGraphicsVDPAURegisterOutputSurface
|
| 191 |
+
*/
|
| 192 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterOutputSurface(struct cudaGraphicsResource **resource, VdpOutputSurface vdpSurface, unsigned int flags);
|
| 193 |
+
|
| 194 |
+
/** @} */ /* END CUDART_VDPAU */
|
| 195 |
+
|
| 196 |
+
#if defined(__cplusplus)
|
| 197 |
+
}
|
| 198 |
+
#endif /* __cplusplus */
|
| 199 |
+
|
| 200 |
+
#endif /* __CUDA_VDPAU_INTEROP_H__ */
|
| 201 |
+
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudart_platform.h
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2016 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef __CUDART_PLATFORM_H__
|
| 51 |
+
#define __CUDART_PLATFORM_H__
|
| 52 |
+
|
| 53 |
+
#if ((defined(__linux__) || defined(__QNX__)) && (defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)))
|
| 54 |
+
#define isEglSupported 1
|
| 55 |
+
#endif
|
| 56 |
+
|
| 57 |
+
#endif
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.h
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2023 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__DEVICE_ATOMIC_FUNCTIONS_H__)
|
| 51 |
+
#define __DEVICE_ATOMIC_FUNCTIONS_H__
|
| 52 |
+
|
| 53 |
+
//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
|
| 54 |
+
#define EXCLUDE_FROM_RTC
|
| 55 |
+
|
| 56 |
+
#if defined(__CUDACC_RTC__)
|
| 57 |
+
#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__
|
| 58 |
+
#elif defined(_NVHPC_CUDA)
|
| 59 |
+
# define __DEVICE_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__
|
| 60 |
+
#else /* __CUDACC_RTC__ */
|
| 61 |
+
#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
|
| 62 |
+
#endif /* __CUDACC_RTC__ */
|
| 63 |
+
|
| 64 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 65 |
+
|
| 66 |
+
/*******************************************************************************
|
| 67 |
+
* *
|
| 68 |
+
* *
|
| 69 |
+
* *
|
| 70 |
+
*******************************************************************************/
|
| 71 |
+
|
| 72 |
+
#include "cuda_runtime_api.h"
|
| 73 |
+
|
| 74 |
+
/* Add !defined(_NVHPC_CUDA) to avoid empty function definition in PGI CUDA
|
| 75 |
+
* C++ compiler where the macro __CUDA_ARCH__ is not defined. */
|
| 76 |
+
#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
|
| 77 |
+
#define __DEF_IF_HOST { }
|
| 78 |
+
#else /* !__CUDA_ARCH__ */
|
| 79 |
+
#define __DEF_IF_HOST ;
|
| 80 |
+
#endif /* __CUDA_ARCH__ */
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
/*******************************************************************************
|
| 84 |
+
* *
|
| 85 |
+
* *
|
| 86 |
+
* *
|
| 87 |
+
*******************************************************************************/
|
| 88 |
+
|
| 89 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val) __DEF_IF_HOST
|
| 90 |
+
|
| 91 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val) __DEF_IF_HOST
|
| 92 |
+
|
| 93 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val) __DEF_IF_HOST
|
| 94 |
+
|
| 95 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val) __DEF_IF_HOST
|
| 96 |
+
|
| 97 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val) __DEF_IF_HOST
|
| 98 |
+
|
| 99 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val) __DEF_IF_HOST
|
| 100 |
+
|
| 101 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val) __DEF_IF_HOST
|
| 102 |
+
|
| 103 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val) __DEF_IF_HOST
|
| 104 |
+
|
| 105 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val) __DEF_IF_HOST
|
| 106 |
+
|
| 107 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val) __DEF_IF_HOST
|
| 108 |
+
|
| 109 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val) __DEF_IF_HOST
|
| 110 |
+
|
| 111 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val) __DEF_IF_HOST
|
| 112 |
+
|
| 113 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val) __DEF_IF_HOST
|
| 114 |
+
|
| 115 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val) __DEF_IF_HOST
|
| 116 |
+
|
| 117 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val) __DEF_IF_HOST
|
| 118 |
+
|
| 119 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val) __DEF_IF_HOST
|
| 120 |
+
|
| 121 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val) __DEF_IF_HOST
|
| 122 |
+
|
| 123 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val) __DEF_IF_HOST
|
| 124 |
+
|
| 125 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val) __DEF_IF_HOST
|
| 126 |
+
|
| 127 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val) __DEF_IF_HOST
|
| 128 |
+
|
| 129 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) __DEF_IF_HOST
|
| 130 |
+
|
| 131 |
+
/*******************************************************************************
|
| 132 |
+
* *
|
| 133 |
+
* *
|
| 134 |
+
* *
|
| 135 |
+
*******************************************************************************/
|
| 136 |
+
|
| 137 |
+
#include "cuda_runtime_api.h"
|
| 138 |
+
|
| 139 |
+
#if defined(_WIN32)
|
| 140 |
+
# define __DEPRECATED__(msg) __declspec(deprecated(msg))
|
| 141 |
+
#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
|
| 142 |
+
# define __DEPRECATED__(msg) __attribute__((deprecated))
|
| 143 |
+
#else
|
| 144 |
+
# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
|
| 145 |
+
#endif
|
| 146 |
+
|
| 147 |
+
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
| 148 |
+
#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()."\
|
| 149 |
+
"To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70."
|
| 150 |
+
#elif defined(_NVHPC_CUDA)
|
| 151 |
+
#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on cc70 and above, and should be replaced with "#x"_sync()."
|
| 152 |
+
#else
|
| 153 |
+
#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
|
| 154 |
+
#endif
|
| 155 |
+
|
| 156 |
+
extern "C"
|
| 157 |
+
{
|
| 158 |
+
extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__any)) int __any(int cond);
|
| 159 |
+
extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__all)) int __all(int cond);
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
/*******************************************************************************
|
| 164 |
+
* *
|
| 165 |
+
* *
|
| 166 |
+
* *
|
| 167 |
+
*******************************************************************************/
|
| 168 |
+
|
| 169 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val) __DEF_IF_HOST
|
| 170 |
+
|
| 171 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val) __DEF_IF_HOST
|
| 172 |
+
|
| 173 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val) __DEF_IF_HOST
|
| 174 |
+
|
| 175 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__any)) bool any(bool cond) __DEF_IF_HOST
|
| 176 |
+
|
| 177 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__all)) bool all(bool cond) __DEF_IF_HOST
|
| 178 |
+
|
| 179 |
+
#undef __DEPRECATED__
|
| 180 |
+
#undef __WSB_DEPRECATION_MESSAGE
|
| 181 |
+
|
| 182 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 183 |
+
|
| 184 |
+
#undef __DEF_IF_HOST
|
| 185 |
+
#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
|
| 186 |
+
|
| 187 |
+
#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
|
| 188 |
+
#include "device_atomic_functions.hpp"
|
| 189 |
+
#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
|
| 190 |
+
|
| 191 |
+
#undef EXCLUDE_FROM_RTC
|
| 192 |
+
|
| 193 |
+
#endif /* !__DEVICE_ATOMIC_FUNCTIONS_H__ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2023 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
|
| 51 |
+
#define __DEVICE_ATOMIC_FUNCTIONS_HPP__
|
| 52 |
+
|
| 53 |
+
/*
 * Qualifier prefix placed in front of every wrapper defined in this file:
 * plain __device__ when __CUDACC_RTC__ is defined, and
 * static __inline__ __device__ otherwise.
 */
#if !defined(__CUDACC_RTC__)
#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
#else /* __CUDACC_RTC__ */
#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__
#endif /* !__CUDACC_RTC__ */
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
/*
 * C-linkage declarations of the compiler builtins that the atomic wrappers in
 * this file forward to. Name prefixes follow the operand type used in the
 * signature: __i (int), __u (unsigned int), __f (float), __ull (unsigned long long int).
 */
extern "C" {
/* add / exchange */
extern __device__ __device_builtin__ int __iAtomicAdd(int *address, int val);
extern __device__ __device_builtin__ unsigned int __uAtomicAdd(unsigned int *address, unsigned int val);
extern __device__ __device_builtin__ int __iAtomicExch(int *address, int val);
extern __device__ __device_builtin__ unsigned int __uAtomicExch(unsigned int *address, unsigned int val);
extern __device__ __device_builtin__ float __fAtomicExch(float *address, float val);

/* min / max */
extern __device__ __device_builtin__ int __iAtomicMin(int *address, int val);
extern __device__ __device_builtin__ unsigned int __uAtomicMin(unsigned int *address, unsigned int val);
extern __device__ __device_builtin__ int __iAtomicMax(int *address, int val);
extern __device__ __device_builtin__ unsigned int __uAtomicMax(unsigned int *address, unsigned int val);

/* inc / dec */
extern __device__ __device_builtin__ unsigned int __uAtomicInc(unsigned int *address, unsigned int val);
extern __device__ __device_builtin__ unsigned int __uAtomicDec(unsigned int *address, unsigned int val);

/* bitwise and / or / xor */
extern __device__ __device_builtin__ int __iAtomicAnd(int *address, int val);
extern __device__ __device_builtin__ unsigned int __uAtomicAnd(unsigned int *address, unsigned int val);
extern __device__ __device_builtin__ int __iAtomicOr(int *address, int val);
extern __device__ __device_builtin__ unsigned int __uAtomicOr(unsigned int *address, unsigned int val);
extern __device__ __device_builtin__ int __iAtomicXor(int *address, int val);
extern __device__ __device_builtin__ unsigned int __uAtomicXor(unsigned int *address, unsigned int val);

/* compare-and-swap */
extern __device__ __device_builtin__ int __iAtomicCAS(int *address, int compare, int val);
extern __device__ __device_builtin__ unsigned int __uAtomicCAS(unsigned int *address, unsigned int compare, unsigned int val);

/* unsigned long long int variants */
extern __device__ __device_builtin__ unsigned long long int __ullAtomicAdd(unsigned long long int *address, unsigned long long int val);
extern __device__ __device_builtin__ unsigned long long int __ullAtomicExch(unsigned long long int *address, unsigned long long int val);
extern __device__ __device_builtin__ unsigned long long int __ullAtomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val);
}
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
/*******************************************************************************
|
| 92 |
+
* *
|
| 93 |
+
* *
|
| 94 |
+
* *
|
| 95 |
+
*******************************************************************************/
|
| 96 |
+
|
| 97 |
+
#include "cuda_runtime_api.h"
|
| 98 |
+
|
| 99 |
+
/*******************************************************************************
|
| 100 |
+
* *
|
| 101 |
+
* *
|
| 102 |
+
* *
|
| 103 |
+
*******************************************************************************/
|
| 104 |
+
|
| 105 |
+
/* int overload: hands both arguments to the __iAtomicAdd builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val)
{
  const int result = __iAtomicAdd(address, val);
  return result;
}
|
| 109 |
+
|
| 110 |
+
/* unsigned int overload: hands both arguments to the __uAtomicAdd builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val)
{
  const unsigned int result = __uAtomicAdd(address, val);
  return result;
}
|
| 114 |
+
|
| 115 |
+
/*
 * int overload of atomicSub, implemented as an atomic add of the negated operand.
 *
 * address: location handed to __iAtomicAdd.
 * val:     amount to subtract.
 * returns: whatever __iAtomicAdd returns.
 *
 * The negation is performed in unsigned arithmetic, which wraps modulo 2^N, so
 * val == INT_MIN no longer triggers signed-overflow undefined behaviour as the
 * former "-(int)val" did. For every other value the operand passed to the
 * builtin is bit-identical to the previous implementation.
 */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val)
{
  /* 0u - x is the two's-complement negation of x without any signed overflow. */
  const unsigned int negated = 0u - (unsigned int)val;
  return __iAtomicAdd(address, (int)negated);
}
|
| 119 |
+
|
| 120 |
+
/*
 * unsigned int overload of atomicSub, implemented as an atomic add of the
 * modular negation of val.
 *
 * address: location handed to __uAtomicAdd.
 * val:     amount to subtract.
 * returns: whatever __uAtomicAdd returns.
 *
 * The previous "(unsigned int)-(int)val" round-tripped through int, which is an
 * implementation-defined narrowing for val > INT_MAX and signed-overflow
 * undefined behaviour for val == 0x80000000. "0u - val" yields the same bit
 * pattern for every input using only well-defined unsigned arithmetic.
 */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val)
{
  const unsigned int negated = 0u - val;
  return __uAtomicAdd(address, negated);
}
|
| 124 |
+
|
| 125 |
+
/* int overload: hands both arguments to the __iAtomicExch builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val)
{
  const int result = __iAtomicExch(address, val);
  return result;
}
|
| 129 |
+
|
| 130 |
+
/* unsigned int overload: hands both arguments to the __uAtomicExch builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val)
{
  const unsigned int result = __uAtomicExch(address, val);
  return result;
}
|
| 134 |
+
|
| 135 |
+
/* float overload: hands both arguments to the __fAtomicExch builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val)
{
  const float result = __fAtomicExch(address, val);
  return result;
}
|
| 139 |
+
|
| 140 |
+
/* int overload: hands both arguments to the __iAtomicMin builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val)
{
  const int result = __iAtomicMin(address, val);
  return result;
}
|
| 144 |
+
|
| 145 |
+
/* unsigned int overload: hands both arguments to the __uAtomicMin builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val)
{
  const unsigned int result = __uAtomicMin(address, val);
  return result;
}
|
| 149 |
+
|
| 150 |
+
/* int overload: hands both arguments to the __iAtomicMax builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val)
{
  const int result = __iAtomicMax(address, val);
  return result;
}
|
| 154 |
+
|
| 155 |
+
/* unsigned int overload: hands both arguments to the __uAtomicMax builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val)
{
  const unsigned int result = __uAtomicMax(address, val);
  return result;
}
|
| 159 |
+
|
| 160 |
+
/* Hands both arguments to the __uAtomicInc builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val)
{
  const unsigned int result = __uAtomicInc(address, val);
  return result;
}
|
| 164 |
+
|
| 165 |
+
/* Hands both arguments to the __uAtomicDec builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val)
{
  const unsigned int result = __uAtomicDec(address, val);
  return result;
}
|
| 169 |
+
|
| 170 |
+
/* int overload: hands both arguments to the __iAtomicAnd builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val)
{
  const int result = __iAtomicAnd(address, val);
  return result;
}
|
| 174 |
+
|
| 175 |
+
/* unsigned int overload: hands both arguments to the __uAtomicAnd builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val)
{
  const unsigned int result = __uAtomicAnd(address, val);
  return result;
}
|
| 179 |
+
|
| 180 |
+
/* int overload: hands both arguments to the __iAtomicOr builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val)
{
  const int result = __iAtomicOr(address, val);
  return result;
}
|
| 184 |
+
|
| 185 |
+
/* unsigned int overload: hands both arguments to the __uAtomicOr builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val)
{
  const unsigned int result = __uAtomicOr(address, val);
  return result;
}
|
| 189 |
+
|
| 190 |
+
/* int overload: hands both arguments to the __iAtomicXor builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val)
{
  const int result = __iAtomicXor(address, val);
  return result;
}
|
| 194 |
+
|
| 195 |
+
/* unsigned int overload: hands both arguments to the __uAtomicXor builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val)
{
  const unsigned int result = __uAtomicXor(address, val);
  return result;
}
|
| 199 |
+
|
| 200 |
+
/* int overload: passes (address, compare, val) to the __iAtomicCAS builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val)
{
  const int result = __iAtomicCAS(address, compare, val);
  return result;
}
|
| 204 |
+
|
| 205 |
+
/* unsigned int overload: passes (address, compare, val) to the __uAtomicCAS builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
{
  const unsigned int result = __uAtomicCAS(address, compare, val);
  return result;
}
|
| 209 |
+
|
| 210 |
+
/*******************************************************************************
|
| 211 |
+
* *
|
| 212 |
+
* *
|
| 213 |
+
* *
|
| 214 |
+
*******************************************************************************/
|
| 215 |
+
|
| 216 |
+
#include "cuda_runtime_api.h"
|
| 217 |
+
|
| 218 |
+
/*******************************************************************************
|
| 219 |
+
* *
|
| 220 |
+
* *
|
| 221 |
+
* *
|
| 222 |
+
*******************************************************************************/
|
| 223 |
+
|
| 224 |
+
/* unsigned long long int overload: hands both arguments to the __ullAtomicAdd builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val)
{
  const unsigned long long int result = __ullAtomicAdd(address, val);
  return result;
}
|
| 228 |
+
|
| 229 |
+
/* unsigned long long int overload: hands both arguments to the __ullAtomicExch builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val)
{
  const unsigned long long int result = __ullAtomicExch(address, val);
  return result;
}
|
| 233 |
+
|
| 234 |
+
/* unsigned long long int overload: passes (address, compare, val) to the __ullAtomicCAS builtin and returns its result unchanged. */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
{
  const unsigned long long int result = __ullAtomicCAS(address, compare, val);
  return result;
}
|
| 238 |
+
|
| 239 |
+
/*
 * bool-typed convenience wrapper: converts cond to int, calls the __any builtin
 * and converts its int result back to bool. __any itself is declared deprecated
 * in device_atomic_functions.h in favor of __any_sync().
 */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond)
{
  const int result = __any((int)cond);
  return (bool)result;
}
|
| 243 |
+
|
| 244 |
+
/*
 * bool-typed convenience wrapper: converts cond to int, calls the __all builtin
 * and converts its int result back to bool. __all itself is declared deprecated
 * in device_atomic_functions.h in favor of __all_sync().
 */
__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond)
{
  const int result = __all((int)cond);
  return (bool)result;
}
|
| 248 |
+
|
| 249 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 250 |
+
|
| 251 |
+
#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
|
| 252 |
+
|
| 253 |
+
#endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */
|
| 254 |
+
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("device_double_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "device_double_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#include "crt/device_double_functions.h"
|
| 61 |
+
|
| 62 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__)
|
| 63 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 64 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
|
| 65 |
+
#endif
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_functions.h
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("device_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "device_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#include "crt/device_functions.h"
|
| 61 |
+
|
| 62 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__)
|
| 63 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 64 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
|
| 65 |
+
#endif
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__DEVICE_LAUNCH_PARAMETERS_H__)
|
| 51 |
+
#define __DEVICE_LAUNCH_PARAMETERS_H__
|
| 52 |
+
|
| 53 |
+
#include "vector_types.h"
|
| 54 |
+
|
| 55 |
+
/*
 * Storage-class prefix used by the built-in variable declarations in this
 * header. A pre-existing __STORAGE__ definition is left alone; otherwise it
 * becomes "extern const __device__" when __CUDACC_RTC__ is defined and
 * "extern const" in every other case.
 */
#ifndef __STORAGE__
#ifdef __CUDACC_RTC__
#define __STORAGE__ extern const __device__
#else /* !__CUDACC_RTC__ */
#define __STORAGE__ extern const
#endif /* __CUDACC_RTC__ */
#endif /* __STORAGE__ */
|
| 66 |
+
|
| 67 |
+
/*
 * Declarations of the built-in launch-parameter variables. They are wrapped in
 * extern "C" when the header is compiled as C++.
 */
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

uint3 __device_builtin__ __STORAGE__ threadIdx;
uint3 __device_builtin__ __STORAGE__ blockIdx;
dim3 __device_builtin__ __STORAGE__ blockDim;
dim3 __device_builtin__ __STORAGE__ gridDim;
int __device_builtin__ __STORAGE__ warpSize;

/* __STORAGE__ is only needed for the declarations above. */
#undef __STORAGE__

#ifdef __cplusplus
}
#endif /* __cplusplus */
|
| 82 |
+
|
| 83 |
+
/* Accessor macro that expands to threadIdx; left untouched if already defined. */
#ifndef __cudaGet_threadIdx
#define __cudaGet_threadIdx() threadIdx
#endif /* __cudaGet_threadIdx */
|
| 89 |
+
|
| 90 |
+
/* Accessor macro that expands to blockIdx; left untouched if already defined. */
#ifndef __cudaGet_blockIdx
#define __cudaGet_blockIdx() blockIdx
#endif /* __cudaGet_blockIdx */
|
| 96 |
+
|
| 97 |
+
/* Accessor macro that expands to blockDim; left untouched if already defined. */
#ifndef __cudaGet_blockDim
#define __cudaGet_blockDim() blockDim
#endif /* __cudaGet_blockDim */
|
| 103 |
+
|
| 104 |
+
/* Accessor macro that expands to gridDim; left untouched if already defined. */
#ifndef __cudaGet_gridDim
#define __cudaGet_gridDim() gridDim
#endif /* __cudaGet_gridDim */
|
| 110 |
+
|
| 111 |
+
/* Accessor macro that expands to warpSize; left untouched if already defined. */
#ifndef __cudaGet_warpSize
#define __cudaGet_warpSize() warpSize
#endif /* __cudaGet_warpSize */
|
| 117 |
+
|
| 118 |
+
#endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_types.h
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__DEVICE_TYPES_H__)
|
| 51 |
+
#define __DEVICE_TYPES_H__
|
| 52 |
+
|
| 53 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 54 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 55 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
|
| 56 |
+
#endif
|
| 57 |
+
|
| 58 |
+
#ifndef __DOXYGEN_ONLY__
|
| 59 |
+
#include "crt/host_defines.h"
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
/*******************************************************************************
|
| 63 |
+
* *
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
*******************************************************************************/
|
| 67 |
+
|
| 68 |
+
enum __device_builtin__ cudaRoundMode
|
| 69 |
+
{
|
| 70 |
+
cudaRoundNearest,
|
| 71 |
+
cudaRoundZero,
|
| 72 |
+
cudaRoundPosInf,
|
| 73 |
+
cudaRoundMinInf
|
| 74 |
+
};
|
| 75 |
+
|
| 76 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__)
|
| 77 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 78 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
|
| 79 |
+
#endif
|
| 80 |
+
|
| 81 |
+
#endif /* !__DEVICE_TYPES_H__ */
|
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__DRIVER_FUNCTIONS_H__)
|
| 51 |
+
#define __DRIVER_FUNCTIONS_H__
|
| 52 |
+
|
| 53 |
+
#include "builtin_types.h"
|
| 54 |
+
#include "crt/host_defines.h"
|
| 55 |
+
#include "driver_types.h"
|
| 56 |
+
|
| 57 |
+
/**
|
| 58 |
+
* \addtogroup CUDART_MEMORY
|
| 59 |
+
*
|
| 60 |
+
* @{
|
| 61 |
+
*/
|
| 62 |
+
|
| 63 |
+
/**
|
| 64 |
+
* \brief Returns a cudaPitchedPtr based on input parameters
|
| 65 |
+
*
|
| 66 |
+
* Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
|
| 67 |
+
* \p p, \p xsz, and \p ysz.
|
| 68 |
+
*
|
| 69 |
+
* \param d - Pointer to allocated memory
|
| 70 |
+
* \param p - Pitch of allocated memory in bytes
|
| 71 |
+
* \param xsz - Logical width of allocation in elements
|
| 72 |
+
* \param ysz - Logical height of allocation in elements
|
| 73 |
+
*
|
| 74 |
+
* \return
|
| 75 |
+
* ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
|
| 76 |
+
*
|
| 77 |
+
* \sa make_cudaExtent, make_cudaPos
|
| 78 |
+
*/
|
| 79 |
+
static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
|
| 80 |
+
{
|
| 81 |
+
struct cudaPitchedPtr s;
|
| 82 |
+
|
| 83 |
+
s.ptr = d;
|
| 84 |
+
s.pitch = p;
|
| 85 |
+
s.xsize = xsz;
|
| 86 |
+
s.ysize = ysz;
|
| 87 |
+
|
| 88 |
+
return s;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
/**
|
| 92 |
+
* \brief Returns a cudaPos based on input parameters
|
| 93 |
+
*
|
| 94 |
+
* Returns a ::cudaPos based on the specified input parameters \p x,
|
| 95 |
+
* \p y, and \p z.
|
| 96 |
+
*
|
| 97 |
+
* \param x - X position
|
| 98 |
+
* \param y - Y position
|
| 99 |
+
* \param z - Z position
|
| 100 |
+
*
|
| 101 |
+
* \return
|
| 102 |
+
* ::cudaPos specified by \p x, \p y, and \p z
|
| 103 |
+
*
|
| 104 |
+
* \sa make_cudaExtent, make_cudaPitchedPtr
|
| 105 |
+
*/
|
| 106 |
+
static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
|
| 107 |
+
{
|
| 108 |
+
struct cudaPos p;
|
| 109 |
+
|
| 110 |
+
p.x = x;
|
| 111 |
+
p.y = y;
|
| 112 |
+
p.z = z;
|
| 113 |
+
|
| 114 |
+
return p;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
/**
|
| 118 |
+
* \brief Returns a cudaExtent based on input parameters
|
| 119 |
+
*
|
| 120 |
+
* Returns a ::cudaExtent based on the specified input parameters \p w,
|
| 121 |
+
* \p h, and \p d.
|
| 122 |
+
*
|
| 123 |
+
* \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
|
| 124 |
+
* \param h - Height in elements
|
| 125 |
+
* \param d - Depth in elements
|
| 126 |
+
*
|
| 127 |
+
* \return
|
| 128 |
+
* ::cudaExtent specified by \p w, \p h, and \p d
|
| 129 |
+
*
|
| 130 |
+
* \sa make_cudaPitchedPtr, make_cudaPos
|
| 131 |
+
*/
|
| 132 |
+
static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
|
| 133 |
+
{
|
| 134 |
+
struct cudaExtent e;
|
| 135 |
+
|
| 136 |
+
e.width = w;
|
| 137 |
+
e.height = h;
|
| 138 |
+
e.depth = d;
|
| 139 |
+
|
| 140 |
+
return e;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
/** @} */ /* END CUDART_MEMORY */
|
| 144 |
+
|
| 145 |
+
#endif /* !__DRIVER_FUNCTIONS_H__ */
|