Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h +891 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h +693 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h +824 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h +758 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h +348 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h +227 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h +350 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h +94 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h +1958 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h +224 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h +109 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp +224 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h +65 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h +118 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h +145 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_types.h +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_config.h +65 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/library_types.h +103 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_functions.h +65 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_32_atomic_functions.hpp +134 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_60_atomic_functions.hpp +527 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_functions.h +439 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_fetch_functions.h +739 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h +78 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer.h +658 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h +448 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__init__.py +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/_cmd.py +70 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/adapter.py +161 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/cache.py +74 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/controller.py +499 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc
ADDED
|
Binary file (13.6 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc
ADDED
|
Binary file (46.4 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc
ADDED
|
Binary file (25.4 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc
ADDED
|
Binary file (67.5 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc
ADDED
|
Binary file (6.66 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h
ADDED
|
@@ -0,0 +1,891 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/*
|
| 51 |
+
* This is the public header file for the CUBLAS library, defining the API
|
| 52 |
+
*
|
| 53 |
+
* CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
|
| 54 |
+
* on top of the CUDA runtime.
|
| 55 |
+
*/
|
| 56 |
+
|
| 57 |
+
#if !defined(CUBLAS_H_)
|
| 58 |
+
#define CUBLAS_H_
|
| 59 |
+
|
| 60 |
+
#if defined(CUBLAS_V2_H_)
|
| 61 |
+
#error "It is an error to include both cublas.h and cublas_v2.h"
|
| 62 |
+
#endif
|
| 63 |
+
|
| 64 |
+
#include <cuda_runtime.h>
|
| 65 |
+
|
| 66 |
+
#ifndef CUBLASWINAPI
|
| 67 |
+
#ifdef _WIN32
|
| 68 |
+
#define CUBLASWINAPI __stdcall
|
| 69 |
+
#else
|
| 70 |
+
#define CUBLASWINAPI
|
| 71 |
+
#endif
|
| 72 |
+
#endif
|
| 73 |
+
|
| 74 |
+
#undef CUBLASAPI
|
| 75 |
+
#ifdef __CUDACC__
|
| 76 |
+
#define CUBLASAPI __host__
|
| 77 |
+
#else
|
| 78 |
+
#define CUBLASAPI
|
| 79 |
+
#endif
|
| 80 |
+
|
| 81 |
+
#include "cublas_api.h"
|
| 82 |
+
|
| 83 |
+
#if defined(__cplusplus)
|
| 84 |
+
extern "C" {
|
| 85 |
+
#endif
|
| 86 |
+
|
| 87 |
+
/* CUBLAS data types */
|
| 88 |
+
#define cublasStatus cublasStatus_t
|
| 89 |
+
|
| 90 |
+
cublasStatus CUBLASWINAPI cublasInit(void);
|
| 91 |
+
cublasStatus CUBLASWINAPI cublasShutdown(void);
|
| 92 |
+
cublasStatus CUBLASWINAPI cublasGetError(void);
|
| 93 |
+
|
| 94 |
+
cublasStatus CUBLASWINAPI cublasGetVersion(int* version);
|
| 95 |
+
cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void** devicePtr);
|
| 96 |
+
|
| 97 |
+
cublasStatus CUBLASWINAPI cublasFree(void* devicePtr);
|
| 98 |
+
|
| 99 |
+
cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream);
|
| 100 |
+
|
| 101 |
+
/* ---------------- CUBLAS BLAS1 functions ---------------- */
|
| 102 |
+
/* NRM2 */
|
| 103 |
+
float CUBLASWINAPI cublasSnrm2(int n, const float* x, int incx);
|
| 104 |
+
double CUBLASWINAPI cublasDnrm2(int n, const double* x, int incx);
|
| 105 |
+
float CUBLASWINAPI cublasScnrm2(int n, const cuComplex* x, int incx);
|
| 106 |
+
double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex* x, int incx);
|
| 107 |
+
/*------------------------------------------------------------------------*/
|
| 108 |
+
/* DOT */
|
| 109 |
+
float CUBLASWINAPI cublasSdot(int n, const float* x, int incx, const float* y, int incy);
|
| 110 |
+
double CUBLASWINAPI cublasDdot(int n, const double* x, int incx, const double* y, int incy);
|
| 111 |
+
cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
|
| 112 |
+
cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
|
| 113 |
+
cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
|
| 114 |
+
cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
|
| 115 |
+
/*------------------------------------------------------------------------*/
|
| 116 |
+
/* SCAL */
|
| 117 |
+
void CUBLASWINAPI cublasSscal(int n, float alpha, float* x, int incx);
|
| 118 |
+
void CUBLASWINAPI cublasDscal(int n, double alpha, double* x, int incx);
|
| 119 |
+
void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex* x, int incx);
|
| 120 |
+
void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex* x, int incx);
|
| 121 |
+
|
| 122 |
+
void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex* x, int incx);
|
| 123 |
+
void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex* x, int incx);
|
| 124 |
+
/*------------------------------------------------------------------------*/
|
| 125 |
+
/* AXPY */
|
| 126 |
+
void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float* x, int incx, float* y, int incy);
|
| 127 |
+
void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double* x, int incx, double* y, int incy);
|
| 128 |
+
void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex* x, int incx, cuComplex* y, int incy);
|
| 129 |
+
void CUBLASWINAPI
|
| 130 |
+
cublasZaxpy(int n, cuDoubleComplex alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
|
| 131 |
+
/*------------------------------------------------------------------------*/
|
| 132 |
+
/* COPY */
|
| 133 |
+
void CUBLASWINAPI cublasScopy(int n, const float* x, int incx, float* y, int incy);
|
| 134 |
+
void CUBLASWINAPI cublasDcopy(int n, const double* x, int incx, double* y, int incy);
|
| 135 |
+
void CUBLASWINAPI cublasCcopy(int n, const cuComplex* x, int incx, cuComplex* y, int incy);
|
| 136 |
+
void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
|
| 137 |
+
/*------------------------------------------------------------------------*/
|
| 138 |
+
/* SWAP */
|
| 139 |
+
void CUBLASWINAPI cublasSswap(int n, float* x, int incx, float* y, int incy);
|
| 140 |
+
void CUBLASWINAPI cublasDswap(int n, double* x, int incx, double* y, int incy);
|
| 141 |
+
void CUBLASWINAPI cublasCswap(int n, cuComplex* x, int incx, cuComplex* y, int incy);
|
| 142 |
+
void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
|
| 143 |
+
/*------------------------------------------------------------------------*/
|
| 144 |
+
/* AMAX */
|
| 145 |
+
int CUBLASWINAPI cublasIsamax(int n, const float* x, int incx);
|
| 146 |
+
int CUBLASWINAPI cublasIdamax(int n, const double* x, int incx);
|
| 147 |
+
int CUBLASWINAPI cublasIcamax(int n, const cuComplex* x, int incx);
|
| 148 |
+
int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex* x, int incx);
|
| 149 |
+
/*------------------------------------------------------------------------*/
|
| 150 |
+
/* AMIN */
|
| 151 |
+
int CUBLASWINAPI cublasIsamin(int n, const float* x, int incx);
|
| 152 |
+
int CUBLASWINAPI cublasIdamin(int n, const double* x, int incx);
|
| 153 |
+
|
| 154 |
+
int CUBLASWINAPI cublasIcamin(int n, const cuComplex* x, int incx);
|
| 155 |
+
int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex* x, int incx);
|
| 156 |
+
/*------------------------------------------------------------------------*/
|
| 157 |
+
/* ASUM */
|
| 158 |
+
float CUBLASWINAPI cublasSasum(int n, const float* x, int incx);
|
| 159 |
+
double CUBLASWINAPI cublasDasum(int n, const double* x, int incx);
|
| 160 |
+
float CUBLASWINAPI cublasScasum(int n, const cuComplex* x, int incx);
|
| 161 |
+
double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex* x, int incx);
|
| 162 |
+
/*------------------------------------------------------------------------*/
|
| 163 |
+
/* ROT */
|
| 164 |
+
void CUBLASWINAPI cublasSrot(int n, float* x, int incx, float* y, int incy, float sc, float ss);
|
| 165 |
+
void CUBLASWINAPI cublasDrot(int n, double* x, int incx, double* y, int incy, double sc, double ss);
|
| 166 |
+
void CUBLASWINAPI cublasCrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, cuComplex s);
|
| 167 |
+
void CUBLASWINAPI
|
| 168 |
+
cublasZrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double sc, cuDoubleComplex cs);
|
| 169 |
+
void CUBLASWINAPI cublasCsrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, float s);
|
| 170 |
+
void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double c, double s);
|
| 171 |
+
/*------------------------------------------------------------------------*/
|
| 172 |
+
/* ROTG */
|
| 173 |
+
void CUBLASWINAPI cublasSrotg(float* sa, float* sb, float* sc, float* ss);
|
| 174 |
+
void CUBLASWINAPI cublasDrotg(double* sa, double* sb, double* sc, double* ss);
|
| 175 |
+
void CUBLASWINAPI cublasCrotg(cuComplex* ca, cuComplex cb, float* sc, cuComplex* cs);
|
| 176 |
+
void CUBLASWINAPI cublasZrotg(cuDoubleComplex* ca, cuDoubleComplex cb, double* sc, cuDoubleComplex* cs);
|
| 177 |
+
/*------------------------------------------------------------------------*/
|
| 178 |
+
/* ROTM */
|
| 179 |
+
void CUBLASWINAPI cublasSrotm(int n, float* x, int incx, float* y, int incy, const float* sparam);
|
| 180 |
+
void CUBLASWINAPI cublasDrotm(int n, double* x, int incx, double* y, int incy, const double* sparam);
|
| 181 |
+
/*------------------------------------------------------------------------*/
|
| 182 |
+
/* ROTMG */
|
| 183 |
+
void CUBLASWINAPI cublasSrotmg(float* sd1, float* sd2, float* sx1, const float* sy1, float* sparam);
|
| 184 |
+
void CUBLASWINAPI cublasDrotmg(double* sd1, double* sd2, double* sx1, const double* sy1, double* sparam);
|
| 185 |
+
|
| 186 |
+
/* --------------- CUBLAS BLAS2 functions ---------------- */
|
| 187 |
+
/* GEMV */
|
| 188 |
+
void CUBLASWINAPI cublasSgemv(char trans,
|
| 189 |
+
int m,
|
| 190 |
+
int n,
|
| 191 |
+
float alpha,
|
| 192 |
+
const float* A,
|
| 193 |
+
int lda,
|
| 194 |
+
const float* x,
|
| 195 |
+
int incx,
|
| 196 |
+
float beta,
|
| 197 |
+
float* y,
|
| 198 |
+
int incy);
|
| 199 |
+
void CUBLASWINAPI cublasDgemv(char trans,
|
| 200 |
+
int m,
|
| 201 |
+
int n,
|
| 202 |
+
double alpha,
|
| 203 |
+
const double* A,
|
| 204 |
+
int lda,
|
| 205 |
+
const double* x,
|
| 206 |
+
int incx,
|
| 207 |
+
double beta,
|
| 208 |
+
double* y,
|
| 209 |
+
int incy);
|
| 210 |
+
void CUBLASWINAPI cublasCgemv(char trans,
|
| 211 |
+
int m,
|
| 212 |
+
int n,
|
| 213 |
+
cuComplex alpha,
|
| 214 |
+
const cuComplex* A,
|
| 215 |
+
int lda,
|
| 216 |
+
const cuComplex* x,
|
| 217 |
+
int incx,
|
| 218 |
+
cuComplex beta,
|
| 219 |
+
cuComplex* y,
|
| 220 |
+
int incy);
|
| 221 |
+
void CUBLASWINAPI cublasZgemv(char trans,
|
| 222 |
+
int m,
|
| 223 |
+
int n,
|
| 224 |
+
cuDoubleComplex alpha,
|
| 225 |
+
const cuDoubleComplex* A,
|
| 226 |
+
int lda,
|
| 227 |
+
const cuDoubleComplex* x,
|
| 228 |
+
int incx,
|
| 229 |
+
cuDoubleComplex beta,
|
| 230 |
+
cuDoubleComplex* y,
|
| 231 |
+
int incy);
|
| 232 |
+
/*------------------------------------------------------------------------*/
|
| 233 |
+
/* GBMV */
|
| 234 |
+
void CUBLASWINAPI cublasSgbmv(char trans,
|
| 235 |
+
int m,
|
| 236 |
+
int n,
|
| 237 |
+
int kl,
|
| 238 |
+
int ku,
|
| 239 |
+
float alpha,
|
| 240 |
+
const float* A,
|
| 241 |
+
int lda,
|
| 242 |
+
const float* x,
|
| 243 |
+
int incx,
|
| 244 |
+
float beta,
|
| 245 |
+
float* y,
|
| 246 |
+
int incy);
|
| 247 |
+
void CUBLASWINAPI cublasDgbmv(char trans,
|
| 248 |
+
int m,
|
| 249 |
+
int n,
|
| 250 |
+
int kl,
|
| 251 |
+
int ku,
|
| 252 |
+
double alpha,
|
| 253 |
+
const double* A,
|
| 254 |
+
int lda,
|
| 255 |
+
const double* x,
|
| 256 |
+
int incx,
|
| 257 |
+
double beta,
|
| 258 |
+
double* y,
|
| 259 |
+
int incy);
|
| 260 |
+
void CUBLASWINAPI cublasCgbmv(char trans,
|
| 261 |
+
int m,
|
| 262 |
+
int n,
|
| 263 |
+
int kl,
|
| 264 |
+
int ku,
|
| 265 |
+
cuComplex alpha,
|
| 266 |
+
const cuComplex* A,
|
| 267 |
+
int lda,
|
| 268 |
+
const cuComplex* x,
|
| 269 |
+
int incx,
|
| 270 |
+
cuComplex beta,
|
| 271 |
+
cuComplex* y,
|
| 272 |
+
int incy);
|
| 273 |
+
void CUBLASWINAPI cublasZgbmv(char trans,
|
| 274 |
+
int m,
|
| 275 |
+
int n,
|
| 276 |
+
int kl,
|
| 277 |
+
int ku,
|
| 278 |
+
cuDoubleComplex alpha,
|
| 279 |
+
const cuDoubleComplex* A,
|
| 280 |
+
int lda,
|
| 281 |
+
const cuDoubleComplex* x,
|
| 282 |
+
int incx,
|
| 283 |
+
cuDoubleComplex beta,
|
| 284 |
+
cuDoubleComplex* y,
|
| 285 |
+
int incy);
|
| 286 |
+
/*------------------------------------------------------------------------*/
|
| 287 |
+
/* TRMV */
|
| 288 |
+
void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
|
| 289 |
+
void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
|
| 290 |
+
void CUBLASWINAPI
|
| 291 |
+
cublasCtrmv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
|
| 292 |
+
void CUBLASWINAPI
|
| 293 |
+
cublasZtrmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
|
| 294 |
+
/*------------------------------------------------------------------------*/
|
| 295 |
+
/* TBMV */
|
| 296 |
+
void CUBLASWINAPI
|
| 297 |
+
cublasStbmv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
|
| 298 |
+
void CUBLASWINAPI
|
| 299 |
+
cublasDtbmv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
|
| 300 |
+
void CUBLASWINAPI
|
| 301 |
+
cublasCtbmv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
|
| 302 |
+
void CUBLASWINAPI cublasZtbmv(
|
| 303 |
+
char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
|
| 304 |
+
/*------------------------------------------------------------------------*/
|
| 305 |
+
/* TPMV */
|
| 306 |
+
void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
|
| 307 |
+
|
| 308 |
+
void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
|
| 309 |
+
|
| 310 |
+
void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
|
| 311 |
+
|
| 312 |
+
void CUBLASWINAPI
|
| 313 |
+
cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
|
| 314 |
+
/*------------------------------------------------------------------------*/
|
| 315 |
+
/* TRSV */
|
| 316 |
+
void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
|
| 317 |
+
|
| 318 |
+
void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
|
| 319 |
+
|
| 320 |
+
void CUBLASWINAPI
|
| 321 |
+
cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
|
| 322 |
+
|
| 323 |
+
void CUBLASWINAPI
|
| 324 |
+
cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
|
| 325 |
+
/*------------------------------------------------------------------------*/
|
| 326 |
+
/* TPSV */
|
| 327 |
+
void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
|
| 328 |
+
|
| 329 |
+
void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
|
| 330 |
+
|
| 331 |
+
void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
|
| 332 |
+
|
| 333 |
+
void CUBLASWINAPI
|
| 334 |
+
cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
|
| 335 |
+
/*------------------------------------------------------------------------*/
|
| 336 |
+
/* TBSV */
|
| 337 |
+
void CUBLASWINAPI
|
| 338 |
+
cublasStbsv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
|
| 339 |
+
|
| 340 |
+
void CUBLASWINAPI
|
| 341 |
+
cublasDtbsv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
|
| 342 |
+
void CUBLASWINAPI
|
| 343 |
+
cublasCtbsv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
|
| 344 |
+
|
| 345 |
+
void CUBLASWINAPI cublasZtbsv(
|
| 346 |
+
char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
|
| 347 |
+
/*------------------------------------------------------------------------*/
|
| 348 |
+
/* SYMV/HEMV */
|
| 349 |
+
void CUBLASWINAPI cublasSsymv(
|
| 350 |
+
char uplo, int n, float alpha, const float* A, int lda, const float* x, int incx, float beta, float* y, int incy);
|
| 351 |
+
void CUBLASWINAPI cublasDsymv(char uplo,
|
| 352 |
+
int n,
|
| 353 |
+
double alpha,
|
| 354 |
+
const double* A,
|
| 355 |
+
int lda,
|
| 356 |
+
const double* x,
|
| 357 |
+
int incx,
|
| 358 |
+
double beta,
|
| 359 |
+
double* y,
|
| 360 |
+
int incy);
|
| 361 |
+
void CUBLASWINAPI cublasChemv(char uplo,
|
| 362 |
+
int n,
|
| 363 |
+
cuComplex alpha,
|
| 364 |
+
const cuComplex* A,
|
| 365 |
+
int lda,
|
| 366 |
+
const cuComplex* x,
|
| 367 |
+
int incx,
|
| 368 |
+
cuComplex beta,
|
| 369 |
+
cuComplex* y,
|
| 370 |
+
int incy);
|
| 371 |
+
void CUBLASWINAPI cublasZhemv(char uplo,
|
| 372 |
+
int n,
|
| 373 |
+
cuDoubleComplex alpha,
|
| 374 |
+
const cuDoubleComplex* A,
|
| 375 |
+
int lda,
|
| 376 |
+
const cuDoubleComplex* x,
|
| 377 |
+
int incx,
|
| 378 |
+
cuDoubleComplex beta,
|
| 379 |
+
cuDoubleComplex* y,
|
| 380 |
+
int incy);
|
| 381 |
+
/*------------------------------------------------------------------------*/
|
| 382 |
+
/* SBMV/HBMV */
|
| 383 |
+
void CUBLASWINAPI cublasSsbmv(char uplo,
|
| 384 |
+
int n,
|
| 385 |
+
int k,
|
| 386 |
+
float alpha,
|
| 387 |
+
const float* A,
|
| 388 |
+
int lda,
|
| 389 |
+
const float* x,
|
| 390 |
+
int incx,
|
| 391 |
+
float beta,
|
| 392 |
+
float* y,
|
| 393 |
+
int incy);
|
| 394 |
+
void CUBLASWINAPI cublasDsbmv(char uplo,
|
| 395 |
+
int n,
|
| 396 |
+
int k,
|
| 397 |
+
double alpha,
|
| 398 |
+
const double* A,
|
| 399 |
+
int lda,
|
| 400 |
+
const double* x,
|
| 401 |
+
int incx,
|
| 402 |
+
double beta,
|
| 403 |
+
double* y,
|
| 404 |
+
int incy);
|
| 405 |
+
void CUBLASWINAPI cublasChbmv(char uplo,
|
| 406 |
+
int n,
|
| 407 |
+
int k,
|
| 408 |
+
cuComplex alpha,
|
| 409 |
+
const cuComplex* A,
|
| 410 |
+
int lda,
|
| 411 |
+
const cuComplex* x,
|
| 412 |
+
int incx,
|
| 413 |
+
cuComplex beta,
|
| 414 |
+
cuComplex* y,
|
| 415 |
+
int incy);
|
| 416 |
+
void CUBLASWINAPI cublasZhbmv(char uplo,
|
| 417 |
+
int n,
|
| 418 |
+
int k,
|
| 419 |
+
cuDoubleComplex alpha,
|
| 420 |
+
const cuDoubleComplex* A,
|
| 421 |
+
int lda,
|
| 422 |
+
const cuDoubleComplex* x,
|
| 423 |
+
int incx,
|
| 424 |
+
cuDoubleComplex beta,
|
| 425 |
+
cuDoubleComplex* y,
|
| 426 |
+
int incy);
|
| 427 |
+
/*------------------------------------------------------------------------*/
|
| 428 |
+
/* SPMV/HPMV */
|
| 429 |
+
void CUBLASWINAPI
|
| 430 |
+
cublasSspmv(char uplo, int n, float alpha, const float* AP, const float* x, int incx, float beta, float* y, int incy);
|
| 431 |
+
void CUBLASWINAPI cublasDspmv(
|
| 432 |
+
char uplo, int n, double alpha, const double* AP, const double* x, int incx, double beta, double* y, int incy);
|
| 433 |
+
void CUBLASWINAPI cublasChpmv(char uplo,
|
| 434 |
+
int n,
|
| 435 |
+
cuComplex alpha,
|
| 436 |
+
const cuComplex* AP,
|
| 437 |
+
const cuComplex* x,
|
| 438 |
+
int incx,
|
| 439 |
+
cuComplex beta,
|
| 440 |
+
cuComplex* y,
|
| 441 |
+
int incy);
|
| 442 |
+
void CUBLASWINAPI cublasZhpmv(char uplo,
|
| 443 |
+
int n,
|
| 444 |
+
cuDoubleComplex alpha,
|
| 445 |
+
const cuDoubleComplex* AP,
|
| 446 |
+
const cuDoubleComplex* x,
|
| 447 |
+
int incx,
|
| 448 |
+
cuDoubleComplex beta,
|
| 449 |
+
cuDoubleComplex* y,
|
| 450 |
+
int incy);
|
| 451 |
+
|
| 452 |
+
/*------------------------------------------------------------------------*/
|
| 453 |
+
/* GER */
|
| 454 |
+
void CUBLASWINAPI
|
| 455 |
+
cublasSger(int m, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
|
| 456 |
+
void CUBLASWINAPI
|
| 457 |
+
cublasDger(int m, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
|
| 458 |
+
|
| 459 |
+
void CUBLASWINAPI cublasCgeru(
|
| 460 |
+
int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
|
| 461 |
+
void CUBLASWINAPI cublasCgerc(
|
| 462 |
+
int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
|
| 463 |
+
void CUBLASWINAPI cublasZgeru(int m,
|
| 464 |
+
int n,
|
| 465 |
+
cuDoubleComplex alpha,
|
| 466 |
+
const cuDoubleComplex* x,
|
| 467 |
+
int incx,
|
| 468 |
+
const cuDoubleComplex* y,
|
| 469 |
+
int incy,
|
| 470 |
+
cuDoubleComplex* A,
|
| 471 |
+
int lda);
|
| 472 |
+
void CUBLASWINAPI cublasZgerc(int m,
|
| 473 |
+
int n,
|
| 474 |
+
cuDoubleComplex alpha,
|
| 475 |
+
const cuDoubleComplex* x,
|
| 476 |
+
int incx,
|
| 477 |
+
const cuDoubleComplex* y,
|
| 478 |
+
int incy,
|
| 479 |
+
cuDoubleComplex* A,
|
| 480 |
+
int lda);
|
| 481 |
+
/*------------------------------------------------------------------------*/
|
| 482 |
+
/* SYR/HER */
|
| 483 |
+
void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float* x, int incx, float* A, int lda);
|
| 484 |
+
void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double* x, int incx, double* A, int lda);
|
| 485 |
+
|
| 486 |
+
void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* A, int lda);
|
| 487 |
+
void CUBLASWINAPI
|
| 488 |
+
cublasZher(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda);
|
| 489 |
+
|
| 490 |
+
/*------------------------------------------------------------------------*/
|
| 491 |
+
/* SPR/HPR */
|
| 492 |
+
void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float* x, int incx, float* AP);
|
| 493 |
+
void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double* x, int incx, double* AP);
|
| 494 |
+
void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* AP);
|
| 495 |
+
void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP);
|
| 496 |
+
/*------------------------------------------------------------------------*/
|
| 497 |
+
/* SYR2/HER2 */
|
| 498 |
+
void CUBLASWINAPI
|
| 499 |
+
cublasSsyr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
|
| 500 |
+
void CUBLASWINAPI
|
| 501 |
+
cublasDsyr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
|
| 502 |
+
void CUBLASWINAPI cublasCher2(char uplo,
|
| 503 |
+
int n,
|
| 504 |
+
cuComplex alpha,
|
| 505 |
+
const cuComplex* x,
|
| 506 |
+
int incx,
|
| 507 |
+
const cuComplex* y,
|
| 508 |
+
int incy,
|
| 509 |
+
cuComplex* A,
|
| 510 |
+
int lda);
|
| 511 |
+
void CUBLASWINAPI cublasZher2(char uplo,
|
| 512 |
+
int n,
|
| 513 |
+
cuDoubleComplex alpha,
|
| 514 |
+
const cuDoubleComplex* x,
|
| 515 |
+
int incx,
|
| 516 |
+
const cuDoubleComplex* y,
|
| 517 |
+
int incy,
|
| 518 |
+
cuDoubleComplex* A,
|
| 519 |
+
int lda);
|
| 520 |
+
|
| 521 |
+
/*------------------------------------------------------------------------*/
|
| 522 |
+
/* SPR2/HPR2 */
|
| 523 |
+
void CUBLASWINAPI
|
| 524 |
+
cublasSspr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* AP);
|
| 525 |
+
void CUBLASWINAPI
|
| 526 |
+
cublasDspr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* AP);
|
| 527 |
+
void CUBLASWINAPI cublasChpr2(
|
| 528 |
+
char uplo, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP);
|
| 529 |
+
void CUBLASWINAPI cublasZhpr2(char uplo,
|
| 530 |
+
int n,
|
| 531 |
+
cuDoubleComplex alpha,
|
| 532 |
+
const cuDoubleComplex* x,
|
| 533 |
+
int incx,
|
| 534 |
+
const cuDoubleComplex* y,
|
| 535 |
+
int incy,
|
| 536 |
+
cuDoubleComplex* AP);
|
| 537 |
+
/* ------------------------BLAS3 Functions ------------------------------- */
|
| 538 |
+
/* GEMM */
|
| 539 |
+
void CUBLASWINAPI cublasSgemm(char transa,
|
| 540 |
+
char transb,
|
| 541 |
+
int m,
|
| 542 |
+
int n,
|
| 543 |
+
int k,
|
| 544 |
+
float alpha,
|
| 545 |
+
const float* A,
|
| 546 |
+
int lda,
|
| 547 |
+
const float* B,
|
| 548 |
+
int ldb,
|
| 549 |
+
float beta,
|
| 550 |
+
float* C,
|
| 551 |
+
int ldc);
|
| 552 |
+
void CUBLASWINAPI cublasDgemm(char transa,
|
| 553 |
+
char transb,
|
| 554 |
+
int m,
|
| 555 |
+
int n,
|
| 556 |
+
int k,
|
| 557 |
+
double alpha,
|
| 558 |
+
const double* A,
|
| 559 |
+
int lda,
|
| 560 |
+
const double* B,
|
| 561 |
+
int ldb,
|
| 562 |
+
double beta,
|
| 563 |
+
double* C,
|
| 564 |
+
int ldc);
|
| 565 |
+
void CUBLASWINAPI cublasCgemm(char transa,
|
| 566 |
+
char transb,
|
| 567 |
+
int m,
|
| 568 |
+
int n,
|
| 569 |
+
int k,
|
| 570 |
+
cuComplex alpha,
|
| 571 |
+
const cuComplex* A,
|
| 572 |
+
int lda,
|
| 573 |
+
const cuComplex* B,
|
| 574 |
+
int ldb,
|
| 575 |
+
cuComplex beta,
|
| 576 |
+
cuComplex* C,
|
| 577 |
+
int ldc);
|
| 578 |
+
void CUBLASWINAPI cublasZgemm(char transa,
|
| 579 |
+
char transb,
|
| 580 |
+
int m,
|
| 581 |
+
int n,
|
| 582 |
+
int k,
|
| 583 |
+
cuDoubleComplex alpha,
|
| 584 |
+
const cuDoubleComplex* A,
|
| 585 |
+
int lda,
|
| 586 |
+
const cuDoubleComplex* B,
|
| 587 |
+
int ldb,
|
| 588 |
+
cuDoubleComplex beta,
|
| 589 |
+
cuDoubleComplex* C,
|
| 590 |
+
int ldc);
|
| 591 |
+
/* -------------------------------------------------------*/
|
| 592 |
+
/* SYRK */
|
| 593 |
+
void CUBLASWINAPI
|
| 594 |
+
cublasSsyrk(char uplo, char trans, int n, int k, float alpha, const float* A, int lda, float beta, float* C, int ldc);
|
| 595 |
+
void CUBLASWINAPI cublasDsyrk(
|
| 596 |
+
char uplo, char trans, int n, int k, double alpha, const double* A, int lda, double beta, double* C, int ldc);
|
| 597 |
+
|
| 598 |
+
void CUBLASWINAPI cublasCsyrk(char uplo,
|
| 599 |
+
char trans,
|
| 600 |
+
int n,
|
| 601 |
+
int k,
|
| 602 |
+
cuComplex alpha,
|
| 603 |
+
const cuComplex* A,
|
| 604 |
+
int lda,
|
| 605 |
+
cuComplex beta,
|
| 606 |
+
cuComplex* C,
|
| 607 |
+
int ldc);
|
| 608 |
+
void CUBLASWINAPI cublasZsyrk(char uplo,
|
| 609 |
+
char trans,
|
| 610 |
+
int n,
|
| 611 |
+
int k,
|
| 612 |
+
cuDoubleComplex alpha,
|
| 613 |
+
const cuDoubleComplex* A,
|
| 614 |
+
int lda,
|
| 615 |
+
cuDoubleComplex beta,
|
| 616 |
+
cuDoubleComplex* C,
|
| 617 |
+
int ldc);
|
| 618 |
+
/* ------------------------------------------------------- */
|
| 619 |
+
/* HERK */
|
| 620 |
+
void CUBLASWINAPI cublasCherk(
|
| 621 |
+
char uplo, char trans, int n, int k, float alpha, const cuComplex* A, int lda, float beta, cuComplex* C, int ldc);
|
| 622 |
+
void CUBLASWINAPI cublasZherk(char uplo,
|
| 623 |
+
char trans,
|
| 624 |
+
int n,
|
| 625 |
+
int k,
|
| 626 |
+
double alpha,
|
| 627 |
+
const cuDoubleComplex* A,
|
| 628 |
+
int lda,
|
| 629 |
+
double beta,
|
| 630 |
+
cuDoubleComplex* C,
|
| 631 |
+
int ldc);
|
| 632 |
+
/* ------------------------------------------------------- */
|
| 633 |
+
/* SYR2K */
|
| 634 |
+
void CUBLASWINAPI cublasSsyr2k(char uplo,
|
| 635 |
+
char trans,
|
| 636 |
+
int n,
|
| 637 |
+
int k,
|
| 638 |
+
float alpha,
|
| 639 |
+
const float* A,
|
| 640 |
+
int lda,
|
| 641 |
+
const float* B,
|
| 642 |
+
int ldb,
|
| 643 |
+
float beta,
|
| 644 |
+
float* C,
|
| 645 |
+
int ldc);
|
| 646 |
+
|
| 647 |
+
void CUBLASWINAPI cublasDsyr2k(char uplo,
|
| 648 |
+
char trans,
|
| 649 |
+
int n,
|
| 650 |
+
int k,
|
| 651 |
+
double alpha,
|
| 652 |
+
const double* A,
|
| 653 |
+
int lda,
|
| 654 |
+
const double* B,
|
| 655 |
+
int ldb,
|
| 656 |
+
double beta,
|
| 657 |
+
double* C,
|
| 658 |
+
int ldc);
|
| 659 |
+
void CUBLASWINAPI cublasCsyr2k(char uplo,
|
| 660 |
+
char trans,
|
| 661 |
+
int n,
|
| 662 |
+
int k,
|
| 663 |
+
cuComplex alpha,
|
| 664 |
+
const cuComplex* A,
|
| 665 |
+
int lda,
|
| 666 |
+
const cuComplex* B,
|
| 667 |
+
int ldb,
|
| 668 |
+
cuComplex beta,
|
| 669 |
+
cuComplex* C,
|
| 670 |
+
int ldc);
|
| 671 |
+
|
| 672 |
+
void CUBLASWINAPI cublasZsyr2k(char uplo,
|
| 673 |
+
char trans,
|
| 674 |
+
int n,
|
| 675 |
+
int k,
|
| 676 |
+
cuDoubleComplex alpha,
|
| 677 |
+
const cuDoubleComplex* A,
|
| 678 |
+
int lda,
|
| 679 |
+
const cuDoubleComplex* B,
|
| 680 |
+
int ldb,
|
| 681 |
+
cuDoubleComplex beta,
|
| 682 |
+
cuDoubleComplex* C,
|
| 683 |
+
int ldc);
|
| 684 |
+
/* ------------------------------------------------------- */
|
| 685 |
+
/* HER2K */
|
| 686 |
+
void CUBLASWINAPI cublasCher2k(char uplo,
|
| 687 |
+
char trans,
|
| 688 |
+
int n,
|
| 689 |
+
int k,
|
| 690 |
+
cuComplex alpha,
|
| 691 |
+
const cuComplex* A,
|
| 692 |
+
int lda,
|
| 693 |
+
const cuComplex* B,
|
| 694 |
+
int ldb,
|
| 695 |
+
float beta,
|
| 696 |
+
cuComplex* C,
|
| 697 |
+
int ldc);
|
| 698 |
+
|
| 699 |
+
void CUBLASWINAPI cublasZher2k(char uplo,
|
| 700 |
+
char trans,
|
| 701 |
+
int n,
|
| 702 |
+
int k,
|
| 703 |
+
cuDoubleComplex alpha,
|
| 704 |
+
const cuDoubleComplex* A,
|
| 705 |
+
int lda,
|
| 706 |
+
const cuDoubleComplex* B,
|
| 707 |
+
int ldb,
|
| 708 |
+
double beta,
|
| 709 |
+
cuDoubleComplex* C,
|
| 710 |
+
int ldc);
|
| 711 |
+
|
| 712 |
+
/*------------------------------------------------------------------------*/
|
| 713 |
+
/* SYMM*/
|
| 714 |
+
void CUBLASWINAPI cublasSsymm(char side,
|
| 715 |
+
char uplo,
|
| 716 |
+
int m,
|
| 717 |
+
int n,
|
| 718 |
+
float alpha,
|
| 719 |
+
const float* A,
|
| 720 |
+
int lda,
|
| 721 |
+
const float* B,
|
| 722 |
+
int ldb,
|
| 723 |
+
float beta,
|
| 724 |
+
float* C,
|
| 725 |
+
int ldc);
|
| 726 |
+
void CUBLASWINAPI cublasDsymm(char side,
|
| 727 |
+
char uplo,
|
| 728 |
+
int m,
|
| 729 |
+
int n,
|
| 730 |
+
double alpha,
|
| 731 |
+
const double* A,
|
| 732 |
+
int lda,
|
| 733 |
+
const double* B,
|
| 734 |
+
int ldb,
|
| 735 |
+
double beta,
|
| 736 |
+
double* C,
|
| 737 |
+
int ldc);
|
| 738 |
+
|
| 739 |
+
void CUBLASWINAPI cublasCsymm(char side,
|
| 740 |
+
char uplo,
|
| 741 |
+
int m,
|
| 742 |
+
int n,
|
| 743 |
+
cuComplex alpha,
|
| 744 |
+
const cuComplex* A,
|
| 745 |
+
int lda,
|
| 746 |
+
const cuComplex* B,
|
| 747 |
+
int ldb,
|
| 748 |
+
cuComplex beta,
|
| 749 |
+
cuComplex* C,
|
| 750 |
+
int ldc);
|
| 751 |
+
|
| 752 |
+
void CUBLASWINAPI cublasZsymm(char side,
|
| 753 |
+
char uplo,
|
| 754 |
+
int m,
|
| 755 |
+
int n,
|
| 756 |
+
cuDoubleComplex alpha,
|
| 757 |
+
const cuDoubleComplex* A,
|
| 758 |
+
int lda,
|
| 759 |
+
const cuDoubleComplex* B,
|
| 760 |
+
int ldb,
|
| 761 |
+
cuDoubleComplex beta,
|
| 762 |
+
cuDoubleComplex* C,
|
| 763 |
+
int ldc);
|
| 764 |
+
/*------------------------------------------------------------------------*/
|
| 765 |
+
/* HEMM*/
|
| 766 |
+
void CUBLASWINAPI cublasChemm(char side,
|
| 767 |
+
char uplo,
|
| 768 |
+
int m,
|
| 769 |
+
int n,
|
| 770 |
+
cuComplex alpha,
|
| 771 |
+
const cuComplex* A,
|
| 772 |
+
int lda,
|
| 773 |
+
const cuComplex* B,
|
| 774 |
+
int ldb,
|
| 775 |
+
cuComplex beta,
|
| 776 |
+
cuComplex* C,
|
| 777 |
+
int ldc);
|
| 778 |
+
void CUBLASWINAPI cublasZhemm(char side,
|
| 779 |
+
char uplo,
|
| 780 |
+
int m,
|
| 781 |
+
int n,
|
| 782 |
+
cuDoubleComplex alpha,
|
| 783 |
+
const cuDoubleComplex* A,
|
| 784 |
+
int lda,
|
| 785 |
+
const cuDoubleComplex* B,
|
| 786 |
+
int ldb,
|
| 787 |
+
cuDoubleComplex beta,
|
| 788 |
+
cuDoubleComplex* C,
|
| 789 |
+
int ldc);
|
| 790 |
+
|
| 791 |
+
/*------------------------------------------------------------------------*/
|
| 792 |
+
/* TRSM*/
|
| 793 |
+
void CUBLASWINAPI cublasStrsm(char side,
|
| 794 |
+
char uplo,
|
| 795 |
+
char transa,
|
| 796 |
+
char diag,
|
| 797 |
+
int m,
|
| 798 |
+
int n,
|
| 799 |
+
float alpha,
|
| 800 |
+
const float* A,
|
| 801 |
+
int lda,
|
| 802 |
+
float* B,
|
| 803 |
+
int ldb);
|
| 804 |
+
|
| 805 |
+
void CUBLASWINAPI cublasDtrsm(char side,
|
| 806 |
+
char uplo,
|
| 807 |
+
char transa,
|
| 808 |
+
char diag,
|
| 809 |
+
int m,
|
| 810 |
+
int n,
|
| 811 |
+
double alpha,
|
| 812 |
+
const double* A,
|
| 813 |
+
int lda,
|
| 814 |
+
double* B,
|
| 815 |
+
int ldb);
|
| 816 |
+
|
| 817 |
+
void CUBLASWINAPI cublasCtrsm(char side,
|
| 818 |
+
char uplo,
|
| 819 |
+
char transa,
|
| 820 |
+
char diag,
|
| 821 |
+
int m,
|
| 822 |
+
int n,
|
| 823 |
+
cuComplex alpha,
|
| 824 |
+
const cuComplex* A,
|
| 825 |
+
int lda,
|
| 826 |
+
cuComplex* B,
|
| 827 |
+
int ldb);
|
| 828 |
+
|
| 829 |
+
void CUBLASWINAPI cublasZtrsm(char side,
|
| 830 |
+
char uplo,
|
| 831 |
+
char transa,
|
| 832 |
+
char diag,
|
| 833 |
+
int m,
|
| 834 |
+
int n,
|
| 835 |
+
cuDoubleComplex alpha,
|
| 836 |
+
const cuDoubleComplex* A,
|
| 837 |
+
int lda,
|
| 838 |
+
cuDoubleComplex* B,
|
| 839 |
+
int ldb);
|
| 840 |
+
/*------------------------------------------------------------------------*/
|
| 841 |
+
/* TRMM*/
|
| 842 |
+
void CUBLASWINAPI cublasStrmm(char side,
|
| 843 |
+
char uplo,
|
| 844 |
+
char transa,
|
| 845 |
+
char diag,
|
| 846 |
+
int m,
|
| 847 |
+
int n,
|
| 848 |
+
float alpha,
|
| 849 |
+
const float* A,
|
| 850 |
+
int lda,
|
| 851 |
+
float* B,
|
| 852 |
+
int ldb);
|
| 853 |
+
void CUBLASWINAPI cublasDtrmm(char side,
|
| 854 |
+
char uplo,
|
| 855 |
+
char transa,
|
| 856 |
+
char diag,
|
| 857 |
+
int m,
|
| 858 |
+
int n,
|
| 859 |
+
double alpha,
|
| 860 |
+
const double* A,
|
| 861 |
+
int lda,
|
| 862 |
+
double* B,
|
| 863 |
+
int ldb);
|
| 864 |
+
void CUBLASWINAPI cublasCtrmm(char side,
|
| 865 |
+
char uplo,
|
| 866 |
+
char transa,
|
| 867 |
+
char diag,
|
| 868 |
+
int m,
|
| 869 |
+
int n,
|
| 870 |
+
cuComplex alpha,
|
| 871 |
+
const cuComplex* A,
|
| 872 |
+
int lda,
|
| 873 |
+
cuComplex* B,
|
| 874 |
+
int ldb);
|
| 875 |
+
void CUBLASWINAPI cublasZtrmm(char side,
|
| 876 |
+
char uplo,
|
| 877 |
+
char transa,
|
| 878 |
+
char diag,
|
| 879 |
+
int m,
|
| 880 |
+
int n,
|
| 881 |
+
cuDoubleComplex alpha,
|
| 882 |
+
const cuDoubleComplex* A,
|
| 883 |
+
int lda,
|
| 884 |
+
cuDoubleComplex* B,
|
| 885 |
+
int ldb);
|
| 886 |
+
|
| 887 |
+
#if defined(__cplusplus)
|
| 888 |
+
}
|
| 889 |
+
#endif /* __cplusplus */
|
| 890 |
+
|
| 891 |
+
#endif /* !defined(CUBLAS_H_) */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h
ADDED
|
@@ -0,0 +1,693 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/* cublasXt : Host API, Out of Core and Multi-GPU BLAS Library
|
| 51 |
+
|
| 52 |
+
*/
|
| 53 |
+
|
| 54 |
+
#if !defined(CUBLAS_XT_H_)
|
| 55 |
+
#define CUBLAS_XT_H_
|
| 56 |
+
|
| 57 |
+
#include "driver_types.h"
|
| 58 |
+
#include "cuComplex.h" /* import complex data type */
|
| 59 |
+
|
| 60 |
+
#include "cublas_v2.h"
|
| 61 |
+
|
| 62 |
+
#if defined(__cplusplus)
|
| 63 |
+
extern "C" {
|
| 64 |
+
#endif /* __cplusplus */
|
| 65 |
+
|
| 66 |
+
struct cublasXtContext;
|
| 67 |
+
typedef struct cublasXtContext* cublasXtHandle_t;
|
| 68 |
+
|
| 69 |
+
cublasStatus_t CUBLASWINAPI cublasXtCreate(cublasXtHandle_t* handle);
|
| 70 |
+
cublasStatus_t CUBLASWINAPI cublasXtDestroy(cublasXtHandle_t handle);
|
| 71 |
+
cublasStatus_t CUBLASWINAPI cublasXtGetNumBoards(int nbDevices, int deviceId[], int* nbBoards);
|
| 72 |
+
cublasStatus_t CUBLASWINAPI cublasXtMaxBoards(int* nbGpuBoards);
|
| 73 |
+
/* This routine selects the Gpus that the user want to use for CUBLAS-XT */
|
| 74 |
+
cublasStatus_t CUBLASWINAPI cublasXtDeviceSelect(cublasXtHandle_t handle, int nbDevices, int deviceId[]);
|
| 75 |
+
|
| 76 |
+
/* This routine allows to change the dimension of the tiles ( blockDim x blockDim ) */
|
| 77 |
+
cublasStatus_t CUBLASWINAPI cublasXtSetBlockDim(cublasXtHandle_t handle, int blockDim);
|
| 78 |
+
cublasStatus_t CUBLASWINAPI cublasXtGetBlockDim(cublasXtHandle_t handle, int* blockDim);
|
| 79 |
+
|
| 80 |
+
typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t;
|
| 81 |
+
/* This routine allows to CUBLAS-XT to pin the Host memory if it find out that some of the matrix passed
|
| 82 |
+
are not pinned : Pinning/Unpinning the Host memory is still a costly operation
|
| 83 |
+
It is better if the user controls the memory on its own (by pinning/unpinning oly when necessary)
|
| 84 |
+
*/
|
| 85 |
+
cublasStatus_t CUBLASWINAPI cublasXtGetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t* mode);
|
| 86 |
+
cublasStatus_t CUBLASWINAPI cublasXtSetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t mode);
|
| 87 |
+
|
| 88 |
+
/* This routines is to provide a CPU Blas routines, used for too small sizes or hybrid computation */
|
| 89 |
+
typedef enum {
|
| 90 |
+
CUBLASXT_FLOAT = 0,
|
| 91 |
+
CUBLASXT_DOUBLE = 1,
|
| 92 |
+
CUBLASXT_COMPLEX = 2,
|
| 93 |
+
CUBLASXT_DOUBLECOMPLEX = 3,
|
| 94 |
+
} cublasXtOpType_t;
|
| 95 |
+
|
| 96 |
+
typedef enum {
|
| 97 |
+
CUBLASXT_GEMM = 0,
|
| 98 |
+
CUBLASXT_SYRK = 1,
|
| 99 |
+
CUBLASXT_HERK = 2,
|
| 100 |
+
CUBLASXT_SYMM = 3,
|
| 101 |
+
CUBLASXT_HEMM = 4,
|
| 102 |
+
CUBLASXT_TRSM = 5,
|
| 103 |
+
CUBLASXT_SYR2K = 6,
|
| 104 |
+
CUBLASXT_HER2K = 7,
|
| 105 |
+
|
| 106 |
+
CUBLASXT_SPMM = 8,
|
| 107 |
+
CUBLASXT_SYRKX = 9,
|
| 108 |
+
CUBLASXT_HERKX = 10,
|
| 109 |
+
CUBLASXT_TRMM = 11,
|
| 110 |
+
CUBLASXT_ROUTINE_MAX = 12,
|
| 111 |
+
} cublasXtBlasOp_t;
|
| 112 |
+
|
| 113 |
+
/* Currently only 32-bit integer BLAS routines are supported */
|
| 114 |
+
cublasStatus_t CUBLASWINAPI cublasXtSetCpuRoutine(cublasXtHandle_t handle,
|
| 115 |
+
cublasXtBlasOp_t blasOp,
|
| 116 |
+
cublasXtOpType_t type,
|
| 117 |
+
void* blasFunctor);
|
| 118 |
+
|
| 119 |
+
/* Specified the percentage of work that should done by the CPU, default is 0 (no work) */
|
| 120 |
+
cublasStatus_t CUBLASWINAPI cublasXtSetCpuRatio(cublasXtHandle_t handle,
|
| 121 |
+
cublasXtBlasOp_t blasOp,
|
| 122 |
+
cublasXtOpType_t type,
|
| 123 |
+
float ratio);
|
| 124 |
+
|
| 125 |
+
/* GEMM */
|
| 126 |
+
cublasStatus_t CUBLASWINAPI cublasXtSgemm(cublasXtHandle_t handle,
|
| 127 |
+
cublasOperation_t transa,
|
| 128 |
+
cublasOperation_t transb,
|
| 129 |
+
size_t m,
|
| 130 |
+
size_t n,
|
| 131 |
+
size_t k,
|
| 132 |
+
const float* alpha,
|
| 133 |
+
const float* A,
|
| 134 |
+
size_t lda,
|
| 135 |
+
const float* B,
|
| 136 |
+
size_t ldb,
|
| 137 |
+
const float* beta,
|
| 138 |
+
float* C,
|
| 139 |
+
size_t ldc);
|
| 140 |
+
|
| 141 |
+
cublasStatus_t CUBLASWINAPI cublasXtDgemm(cublasXtHandle_t handle,
|
| 142 |
+
cublasOperation_t transa,
|
| 143 |
+
cublasOperation_t transb,
|
| 144 |
+
size_t m,
|
| 145 |
+
size_t n,
|
| 146 |
+
size_t k,
|
| 147 |
+
const double* alpha,
|
| 148 |
+
const double* A,
|
| 149 |
+
size_t lda,
|
| 150 |
+
const double* B,
|
| 151 |
+
size_t ldb,
|
| 152 |
+
const double* beta,
|
| 153 |
+
double* C,
|
| 154 |
+
size_t ldc);
|
| 155 |
+
|
| 156 |
+
cublasStatus_t CUBLASWINAPI cublasXtCgemm(cublasXtHandle_t handle,
|
| 157 |
+
cublasOperation_t transa,
|
| 158 |
+
cublasOperation_t transb,
|
| 159 |
+
size_t m,
|
| 160 |
+
size_t n,
|
| 161 |
+
size_t k,
|
| 162 |
+
const cuComplex* alpha,
|
| 163 |
+
const cuComplex* A,
|
| 164 |
+
size_t lda,
|
| 165 |
+
const cuComplex* B,
|
| 166 |
+
size_t ldb,
|
| 167 |
+
const cuComplex* beta,
|
| 168 |
+
cuComplex* C,
|
| 169 |
+
size_t ldc);
|
| 170 |
+
|
| 171 |
+
cublasStatus_t CUBLASWINAPI cublasXtZgemm(cublasXtHandle_t handle,
|
| 172 |
+
cublasOperation_t transa,
|
| 173 |
+
cublasOperation_t transb,
|
| 174 |
+
size_t m,
|
| 175 |
+
size_t n,
|
| 176 |
+
size_t k,
|
| 177 |
+
const cuDoubleComplex* alpha,
|
| 178 |
+
const cuDoubleComplex* A,
|
| 179 |
+
size_t lda,
|
| 180 |
+
const cuDoubleComplex* B,
|
| 181 |
+
size_t ldb,
|
| 182 |
+
const cuDoubleComplex* beta,
|
| 183 |
+
cuDoubleComplex* C,
|
| 184 |
+
size_t ldc);
|
| 185 |
+
/* ------------------------------------------------------- */
|
| 186 |
+
/* SYRK */
|
| 187 |
+
cublasStatus_t CUBLASWINAPI cublasXtSsyrk(cublasXtHandle_t handle,
|
| 188 |
+
cublasFillMode_t uplo,
|
| 189 |
+
cublasOperation_t trans,
|
| 190 |
+
size_t n,
|
| 191 |
+
size_t k,
|
| 192 |
+
const float* alpha,
|
| 193 |
+
const float* A,
|
| 194 |
+
size_t lda,
|
| 195 |
+
const float* beta,
|
| 196 |
+
float* C,
|
| 197 |
+
size_t ldc);
|
| 198 |
+
|
| 199 |
+
cublasStatus_t CUBLASWINAPI cublasXtDsyrk(cublasXtHandle_t handle,
|
| 200 |
+
cublasFillMode_t uplo,
|
| 201 |
+
cublasOperation_t trans,
|
| 202 |
+
size_t n,
|
| 203 |
+
size_t k,
|
| 204 |
+
const double* alpha,
|
| 205 |
+
const double* A,
|
| 206 |
+
size_t lda,
|
| 207 |
+
const double* beta,
|
| 208 |
+
double* C,
|
| 209 |
+
size_t ldc);
|
| 210 |
+
|
| 211 |
+
cublasStatus_t CUBLASWINAPI cublasXtCsyrk(cublasXtHandle_t handle,
|
| 212 |
+
cublasFillMode_t uplo,
|
| 213 |
+
cublasOperation_t trans,
|
| 214 |
+
size_t n,
|
| 215 |
+
size_t k,
|
| 216 |
+
const cuComplex* alpha,
|
| 217 |
+
const cuComplex* A,
|
| 218 |
+
size_t lda,
|
| 219 |
+
const cuComplex* beta,
|
| 220 |
+
cuComplex* C,
|
| 221 |
+
size_t ldc);
|
| 222 |
+
|
| 223 |
+
cublasStatus_t CUBLASWINAPI cublasXtZsyrk(cublasXtHandle_t handle,
|
| 224 |
+
cublasFillMode_t uplo,
|
| 225 |
+
cublasOperation_t trans,
|
| 226 |
+
size_t n,
|
| 227 |
+
size_t k,
|
| 228 |
+
const cuDoubleComplex* alpha,
|
| 229 |
+
const cuDoubleComplex* A,
|
| 230 |
+
size_t lda,
|
| 231 |
+
const cuDoubleComplex* beta,
|
| 232 |
+
cuDoubleComplex* C,
|
| 233 |
+
size_t ldc);
|
| 234 |
+
/* -------------------------------------------------------------------- */
|
| 235 |
+
/* HERK */
|
| 236 |
+
cublasStatus_t CUBLASWINAPI cublasXtCherk(cublasXtHandle_t handle,
|
| 237 |
+
cublasFillMode_t uplo,
|
| 238 |
+
cublasOperation_t trans,
|
| 239 |
+
size_t n,
|
| 240 |
+
size_t k,
|
| 241 |
+
const float* alpha,
|
| 242 |
+
const cuComplex* A,
|
| 243 |
+
size_t lda,
|
| 244 |
+
const float* beta,
|
| 245 |
+
cuComplex* C,
|
| 246 |
+
size_t ldc);
|
| 247 |
+
|
| 248 |
+
cublasStatus_t CUBLASWINAPI cublasXtZherk(cublasXtHandle_t handle,
|
| 249 |
+
cublasFillMode_t uplo,
|
| 250 |
+
cublasOperation_t trans,
|
| 251 |
+
size_t n,
|
| 252 |
+
size_t k,
|
| 253 |
+
const double* alpha,
|
| 254 |
+
const cuDoubleComplex* A,
|
| 255 |
+
size_t lda,
|
| 256 |
+
const double* beta,
|
| 257 |
+
cuDoubleComplex* C,
|
| 258 |
+
size_t ldc);
|
| 259 |
+
/* -------------------------------------------------------------------- */
|
| 260 |
+
/* SYR2K */
|
| 261 |
+
cublasStatus_t CUBLASWINAPI cublasXtSsyr2k(cublasXtHandle_t handle,
|
| 262 |
+
cublasFillMode_t uplo,
|
| 263 |
+
cublasOperation_t trans,
|
| 264 |
+
size_t n,
|
| 265 |
+
size_t k,
|
| 266 |
+
const float* alpha,
|
| 267 |
+
const float* A,
|
| 268 |
+
size_t lda,
|
| 269 |
+
const float* B,
|
| 270 |
+
size_t ldb,
|
| 271 |
+
const float* beta,
|
| 272 |
+
float* C,
|
| 273 |
+
size_t ldc);
|
| 274 |
+
|
| 275 |
+
cublasStatus_t CUBLASWINAPI cublasXtDsyr2k(cublasXtHandle_t handle,
|
| 276 |
+
cublasFillMode_t uplo,
|
| 277 |
+
cublasOperation_t trans,
|
| 278 |
+
size_t n,
|
| 279 |
+
size_t k,
|
| 280 |
+
const double* alpha,
|
| 281 |
+
const double* A,
|
| 282 |
+
size_t lda,
|
| 283 |
+
const double* B,
|
| 284 |
+
size_t ldb,
|
| 285 |
+
const double* beta,
|
| 286 |
+
double* C,
|
| 287 |
+
size_t ldc);
|
| 288 |
+
|
| 289 |
+
cublasStatus_t CUBLASWINAPI cublasXtCsyr2k(cublasXtHandle_t handle,
|
| 290 |
+
cublasFillMode_t uplo,
|
| 291 |
+
cublasOperation_t trans,
|
| 292 |
+
size_t n,
|
| 293 |
+
size_t k,
|
| 294 |
+
const cuComplex* alpha,
|
| 295 |
+
const cuComplex* A,
|
| 296 |
+
size_t lda,
|
| 297 |
+
const cuComplex* B,
|
| 298 |
+
size_t ldb,
|
| 299 |
+
const cuComplex* beta,
|
| 300 |
+
cuComplex* C,
|
| 301 |
+
size_t ldc);
|
| 302 |
+
|
| 303 |
+
cublasStatus_t CUBLASWINAPI cublasXtZsyr2k(cublasXtHandle_t handle,
|
| 304 |
+
cublasFillMode_t uplo,
|
| 305 |
+
cublasOperation_t trans,
|
| 306 |
+
size_t n,
|
| 307 |
+
size_t k,
|
| 308 |
+
const cuDoubleComplex* alpha,
|
| 309 |
+
const cuDoubleComplex* A,
|
| 310 |
+
size_t lda,
|
| 311 |
+
const cuDoubleComplex* B,
|
| 312 |
+
size_t ldb,
|
| 313 |
+
const cuDoubleComplex* beta,
|
| 314 |
+
cuDoubleComplex* C,
|
| 315 |
+
size_t ldc);
|
| 316 |
+
/* -------------------------------------------------------------------- */
|
| 317 |
+
/* HERKX : variant extension of HERK */
|
| 318 |
+
cublasStatus_t CUBLASWINAPI cublasXtCherkx(cublasXtHandle_t handle,
|
| 319 |
+
cublasFillMode_t uplo,
|
| 320 |
+
cublasOperation_t trans,
|
| 321 |
+
size_t n,
|
| 322 |
+
size_t k,
|
| 323 |
+
const cuComplex* alpha,
|
| 324 |
+
const cuComplex* A,
|
| 325 |
+
size_t lda,
|
| 326 |
+
const cuComplex* B,
|
| 327 |
+
size_t ldb,
|
| 328 |
+
const float* beta,
|
| 329 |
+
cuComplex* C,
|
| 330 |
+
size_t ldc);
|
| 331 |
+
|
| 332 |
+
cublasStatus_t CUBLASWINAPI cublasXtZherkx(cublasXtHandle_t handle,
|
| 333 |
+
cublasFillMode_t uplo,
|
| 334 |
+
cublasOperation_t trans,
|
| 335 |
+
size_t n,
|
| 336 |
+
size_t k,
|
| 337 |
+
const cuDoubleComplex* alpha,
|
| 338 |
+
const cuDoubleComplex* A,
|
| 339 |
+
size_t lda,
|
| 340 |
+
const cuDoubleComplex* B,
|
| 341 |
+
size_t ldb,
|
| 342 |
+
const double* beta,
|
| 343 |
+
cuDoubleComplex* C,
|
| 344 |
+
size_t ldc);
|
| 345 |
+
|
| 346 |
+
/* -------------------------------------------------------------------- */
|
| 347 |
+
/* TRSM */
|
| 348 |
+
cublasStatus_t CUBLASWINAPI cublasXtStrsm(cublasXtHandle_t handle,
|
| 349 |
+
cublasSideMode_t side,
|
| 350 |
+
cublasFillMode_t uplo,
|
| 351 |
+
cublasOperation_t trans,
|
| 352 |
+
cublasDiagType_t diag,
|
| 353 |
+
size_t m,
|
| 354 |
+
size_t n,
|
| 355 |
+
const float* alpha,
|
| 356 |
+
const float* A,
|
| 357 |
+
size_t lda,
|
| 358 |
+
float* B,
|
| 359 |
+
size_t ldb);
|
| 360 |
+
|
| 361 |
+
cublasStatus_t CUBLASWINAPI cublasXtDtrsm(cublasXtHandle_t handle,
|
| 362 |
+
cublasSideMode_t side,
|
| 363 |
+
cublasFillMode_t uplo,
|
| 364 |
+
cublasOperation_t trans,
|
| 365 |
+
cublasDiagType_t diag,
|
| 366 |
+
size_t m,
|
| 367 |
+
size_t n,
|
| 368 |
+
const double* alpha,
|
| 369 |
+
const double* A,
|
| 370 |
+
size_t lda,
|
| 371 |
+
double* B,
|
| 372 |
+
size_t ldb);
|
| 373 |
+
|
| 374 |
+
cublasStatus_t CUBLASWINAPI cublasXtCtrsm(cublasXtHandle_t handle,
|
| 375 |
+
cublasSideMode_t side,
|
| 376 |
+
cublasFillMode_t uplo,
|
| 377 |
+
cublasOperation_t trans,
|
| 378 |
+
cublasDiagType_t diag,
|
| 379 |
+
size_t m,
|
| 380 |
+
size_t n,
|
| 381 |
+
const cuComplex* alpha,
|
| 382 |
+
const cuComplex* A,
|
| 383 |
+
size_t lda,
|
| 384 |
+
cuComplex* B,
|
| 385 |
+
size_t ldb);
|
| 386 |
+
|
| 387 |
+
cublasStatus_t CUBLASWINAPI cublasXtZtrsm(cublasXtHandle_t handle,
|
| 388 |
+
cublasSideMode_t side,
|
| 389 |
+
cublasFillMode_t uplo,
|
| 390 |
+
cublasOperation_t trans,
|
| 391 |
+
cublasDiagType_t diag,
|
| 392 |
+
size_t m,
|
| 393 |
+
size_t n,
|
| 394 |
+
const cuDoubleComplex* alpha,
|
| 395 |
+
const cuDoubleComplex* A,
|
| 396 |
+
size_t lda,
|
| 397 |
+
cuDoubleComplex* B,
|
| 398 |
+
size_t ldb);
|
| 399 |
+
/* -------------------------------------------------------------------- */
|
| 400 |
+
/* SYMM : Symmetric Multiply Matrix*/
|
| 401 |
+
cublasStatus_t CUBLASWINAPI cublasXtSsymm(cublasXtHandle_t handle,
|
| 402 |
+
cublasSideMode_t side,
|
| 403 |
+
cublasFillMode_t uplo,
|
| 404 |
+
size_t m,
|
| 405 |
+
size_t n,
|
| 406 |
+
const float* alpha,
|
| 407 |
+
const float* A,
|
| 408 |
+
size_t lda,
|
| 409 |
+
const float* B,
|
| 410 |
+
size_t ldb,
|
| 411 |
+
const float* beta,
|
| 412 |
+
float* C,
|
| 413 |
+
size_t ldc);
|
| 414 |
+
|
| 415 |
+
cublasStatus_t CUBLASWINAPI cublasXtDsymm(cublasXtHandle_t handle,
|
| 416 |
+
cublasSideMode_t side,
|
| 417 |
+
cublasFillMode_t uplo,
|
| 418 |
+
size_t m,
|
| 419 |
+
size_t n,
|
| 420 |
+
const double* alpha,
|
| 421 |
+
const double* A,
|
| 422 |
+
size_t lda,
|
| 423 |
+
const double* B,
|
| 424 |
+
size_t ldb,
|
| 425 |
+
const double* beta,
|
| 426 |
+
double* C,
|
| 427 |
+
size_t ldc);
|
| 428 |
+
|
| 429 |
+
cublasStatus_t CUBLASWINAPI cublasXtCsymm(cublasXtHandle_t handle,
|
| 430 |
+
cublasSideMode_t side,
|
| 431 |
+
cublasFillMode_t uplo,
|
| 432 |
+
size_t m,
|
| 433 |
+
size_t n,
|
| 434 |
+
const cuComplex* alpha,
|
| 435 |
+
const cuComplex* A,
|
| 436 |
+
size_t lda,
|
| 437 |
+
const cuComplex* B,
|
| 438 |
+
size_t ldb,
|
| 439 |
+
const cuComplex* beta,
|
| 440 |
+
cuComplex* C,
|
| 441 |
+
size_t ldc);
|
| 442 |
+
|
| 443 |
+
cublasStatus_t CUBLASWINAPI cublasXtZsymm(cublasXtHandle_t handle,
|
| 444 |
+
cublasSideMode_t side,
|
| 445 |
+
cublasFillMode_t uplo,
|
| 446 |
+
size_t m,
|
| 447 |
+
size_t n,
|
| 448 |
+
const cuDoubleComplex* alpha,
|
| 449 |
+
const cuDoubleComplex* A,
|
| 450 |
+
size_t lda,
|
| 451 |
+
const cuDoubleComplex* B,
|
| 452 |
+
size_t ldb,
|
| 453 |
+
const cuDoubleComplex* beta,
|
| 454 |
+
cuDoubleComplex* C,
|
| 455 |
+
size_t ldc);
|
| 456 |
+
/* -------------------------------------------------------------------- */
|
| 457 |
+
/* HEMM : Hermitian Matrix Multiply */
|
| 458 |
+
cublasStatus_t CUBLASWINAPI cublasXtChemm(cublasXtHandle_t handle,
|
| 459 |
+
cublasSideMode_t side,
|
| 460 |
+
cublasFillMode_t uplo,
|
| 461 |
+
size_t m,
|
| 462 |
+
size_t n,
|
| 463 |
+
const cuComplex* alpha,
|
| 464 |
+
const cuComplex* A,
|
| 465 |
+
size_t lda,
|
| 466 |
+
const cuComplex* B,
|
| 467 |
+
size_t ldb,
|
| 468 |
+
const cuComplex* beta,
|
| 469 |
+
cuComplex* C,
|
| 470 |
+
size_t ldc);
|
| 471 |
+
|
| 472 |
+
cublasStatus_t CUBLASWINAPI cublasXtZhemm(cublasXtHandle_t handle,
|
| 473 |
+
cublasSideMode_t side,
|
| 474 |
+
cublasFillMode_t uplo,
|
| 475 |
+
size_t m,
|
| 476 |
+
size_t n,
|
| 477 |
+
const cuDoubleComplex* alpha,
|
| 478 |
+
const cuDoubleComplex* A,
|
| 479 |
+
size_t lda,
|
| 480 |
+
const cuDoubleComplex* B,
|
| 481 |
+
size_t ldb,
|
| 482 |
+
const cuDoubleComplex* beta,
|
| 483 |
+
cuDoubleComplex* C,
|
| 484 |
+
size_t ldc);
|
| 485 |
+
|
| 486 |
+
/* -------------------------------------------------------------------- */
|
| 487 |
+
/* SYRKX : variant extension of SYRK */
|
| 488 |
+
cublasStatus_t CUBLASWINAPI cublasXtSsyrkx(cublasXtHandle_t handle,
|
| 489 |
+
cublasFillMode_t uplo,
|
| 490 |
+
cublasOperation_t trans,
|
| 491 |
+
size_t n,
|
| 492 |
+
size_t k,
|
| 493 |
+
const float* alpha,
|
| 494 |
+
const float* A,
|
| 495 |
+
size_t lda,
|
| 496 |
+
const float* B,
|
| 497 |
+
size_t ldb,
|
| 498 |
+
const float* beta,
|
| 499 |
+
float* C,
|
| 500 |
+
size_t ldc);
|
| 501 |
+
|
| 502 |
+
cublasStatus_t CUBLASWINAPI cublasXtDsyrkx(cublasXtHandle_t handle,
|
| 503 |
+
cublasFillMode_t uplo,
|
| 504 |
+
cublasOperation_t trans,
|
| 505 |
+
size_t n,
|
| 506 |
+
size_t k,
|
| 507 |
+
const double* alpha,
|
| 508 |
+
const double* A,
|
| 509 |
+
size_t lda,
|
| 510 |
+
const double* B,
|
| 511 |
+
size_t ldb,
|
| 512 |
+
const double* beta,
|
| 513 |
+
double* C,
|
| 514 |
+
size_t ldc);
|
| 515 |
+
|
| 516 |
+
cublasStatus_t CUBLASWINAPI cublasXtCsyrkx(cublasXtHandle_t handle,
|
| 517 |
+
cublasFillMode_t uplo,
|
| 518 |
+
cublasOperation_t trans,
|
| 519 |
+
size_t n,
|
| 520 |
+
size_t k,
|
| 521 |
+
const cuComplex* alpha,
|
| 522 |
+
const cuComplex* A,
|
| 523 |
+
size_t lda,
|
| 524 |
+
const cuComplex* B,
|
| 525 |
+
size_t ldb,
|
| 526 |
+
const cuComplex* beta,
|
| 527 |
+
cuComplex* C,
|
| 528 |
+
size_t ldc);
|
| 529 |
+
|
| 530 |
+
cublasStatus_t CUBLASWINAPI cublasXtZsyrkx(cublasXtHandle_t handle,
|
| 531 |
+
cublasFillMode_t uplo,
|
| 532 |
+
cublasOperation_t trans,
|
| 533 |
+
size_t n,
|
| 534 |
+
size_t k,
|
| 535 |
+
const cuDoubleComplex* alpha,
|
| 536 |
+
const cuDoubleComplex* A,
|
| 537 |
+
size_t lda,
|
| 538 |
+
const cuDoubleComplex* B,
|
| 539 |
+
size_t ldb,
|
| 540 |
+
const cuDoubleComplex* beta,
|
| 541 |
+
cuDoubleComplex* C,
|
| 542 |
+
size_t ldc);
|
| 543 |
+
/* -------------------------------------------------------------------- */
|
| 544 |
+
/* HER2K : variant extension of HERK */
|
| 545 |
+
cublasStatus_t CUBLASWINAPI cublasXtCher2k(cublasXtHandle_t handle,
|
| 546 |
+
cublasFillMode_t uplo,
|
| 547 |
+
cublasOperation_t trans,
|
| 548 |
+
size_t n,
|
| 549 |
+
size_t k,
|
| 550 |
+
const cuComplex* alpha,
|
| 551 |
+
const cuComplex* A,
|
| 552 |
+
size_t lda,
|
| 553 |
+
const cuComplex* B,
|
| 554 |
+
size_t ldb,
|
| 555 |
+
const float* beta,
|
| 556 |
+
cuComplex* C,
|
| 557 |
+
size_t ldc);
|
| 558 |
+
|
| 559 |
+
cublasStatus_t CUBLASWINAPI cublasXtZher2k(cublasXtHandle_t handle,
|
| 560 |
+
cublasFillMode_t uplo,
|
| 561 |
+
cublasOperation_t trans,
|
| 562 |
+
size_t n,
|
| 563 |
+
size_t k,
|
| 564 |
+
const cuDoubleComplex* alpha,
|
| 565 |
+
const cuDoubleComplex* A,
|
| 566 |
+
size_t lda,
|
| 567 |
+
const cuDoubleComplex* B,
|
| 568 |
+
size_t ldb,
|
| 569 |
+
const double* beta,
|
| 570 |
+
cuDoubleComplex* C,
|
| 571 |
+
size_t ldc);
|
| 572 |
+
|
| 573 |
+
/* -------------------------------------------------------------------- */
|
| 574 |
+
/* SPMM : Symmetric Packed Multiply Matrix*/
|
| 575 |
+
cublasStatus_t CUBLASWINAPI cublasXtSspmm(cublasXtHandle_t handle,
|
| 576 |
+
cublasSideMode_t side,
|
| 577 |
+
cublasFillMode_t uplo,
|
| 578 |
+
size_t m,
|
| 579 |
+
size_t n,
|
| 580 |
+
const float* alpha,
|
| 581 |
+
const float* AP,
|
| 582 |
+
const float* B,
|
| 583 |
+
size_t ldb,
|
| 584 |
+
const float* beta,
|
| 585 |
+
float* C,
|
| 586 |
+
size_t ldc);
|
| 587 |
+
|
| 588 |
+
cublasStatus_t CUBLASWINAPI cublasXtDspmm(cublasXtHandle_t handle,
|
| 589 |
+
cublasSideMode_t side,
|
| 590 |
+
cublasFillMode_t uplo,
|
| 591 |
+
size_t m,
|
| 592 |
+
size_t n,
|
| 593 |
+
const double* alpha,
|
| 594 |
+
const double* AP,
|
| 595 |
+
const double* B,
|
| 596 |
+
size_t ldb,
|
| 597 |
+
const double* beta,
|
| 598 |
+
double* C,
|
| 599 |
+
size_t ldc);
|
| 600 |
+
|
| 601 |
+
cublasStatus_t CUBLASWINAPI cublasXtCspmm(cublasXtHandle_t handle,
|
| 602 |
+
cublasSideMode_t side,
|
| 603 |
+
cublasFillMode_t uplo,
|
| 604 |
+
size_t m,
|
| 605 |
+
size_t n,
|
| 606 |
+
const cuComplex* alpha,
|
| 607 |
+
const cuComplex* AP,
|
| 608 |
+
const cuComplex* B,
|
| 609 |
+
size_t ldb,
|
| 610 |
+
const cuComplex* beta,
|
| 611 |
+
cuComplex* C,
|
| 612 |
+
size_t ldc);
|
| 613 |
+
|
| 614 |
+
cublasStatus_t CUBLASWINAPI cublasXtZspmm(cublasXtHandle_t handle,
|
| 615 |
+
cublasSideMode_t side,
|
| 616 |
+
cublasFillMode_t uplo,
|
| 617 |
+
size_t m,
|
| 618 |
+
size_t n,
|
| 619 |
+
const cuDoubleComplex* alpha,
|
| 620 |
+
const cuDoubleComplex* AP,
|
| 621 |
+
const cuDoubleComplex* B,
|
| 622 |
+
size_t ldb,
|
| 623 |
+
const cuDoubleComplex* beta,
|
| 624 |
+
cuDoubleComplex* C,
|
| 625 |
+
size_t ldc);
|
| 626 |
+
|
| 627 |
+
/* -------------------------------------------------------------------- */
|
| 628 |
+
/* TRMM */
|
| 629 |
+
cublasStatus_t CUBLASWINAPI cublasXtStrmm(cublasXtHandle_t handle,
|
| 630 |
+
cublasSideMode_t side,
|
| 631 |
+
cublasFillMode_t uplo,
|
| 632 |
+
cublasOperation_t trans,
|
| 633 |
+
cublasDiagType_t diag,
|
| 634 |
+
size_t m,
|
| 635 |
+
size_t n,
|
| 636 |
+
const float* alpha,
|
| 637 |
+
const float* A,
|
| 638 |
+
size_t lda,
|
| 639 |
+
const float* B,
|
| 640 |
+
size_t ldb,
|
| 641 |
+
float* C,
|
| 642 |
+
size_t ldc);
|
| 643 |
+
|
| 644 |
+
cublasStatus_t CUBLASWINAPI cublasXtDtrmm(cublasXtHandle_t handle,
|
| 645 |
+
cublasSideMode_t side,
|
| 646 |
+
cublasFillMode_t uplo,
|
| 647 |
+
cublasOperation_t trans,
|
| 648 |
+
cublasDiagType_t diag,
|
| 649 |
+
size_t m,
|
| 650 |
+
size_t n,
|
| 651 |
+
const double* alpha,
|
| 652 |
+
const double* A,
|
| 653 |
+
size_t lda,
|
| 654 |
+
const double* B,
|
| 655 |
+
size_t ldb,
|
| 656 |
+
double* C,
|
| 657 |
+
size_t ldc);
|
| 658 |
+
|
| 659 |
+
cublasStatus_t CUBLASWINAPI cublasXtCtrmm(cublasXtHandle_t handle,
|
| 660 |
+
cublasSideMode_t side,
|
| 661 |
+
cublasFillMode_t uplo,
|
| 662 |
+
cublasOperation_t trans,
|
| 663 |
+
cublasDiagType_t diag,
|
| 664 |
+
size_t m,
|
| 665 |
+
size_t n,
|
| 666 |
+
const cuComplex* alpha,
|
| 667 |
+
const cuComplex* A,
|
| 668 |
+
size_t lda,
|
| 669 |
+
const cuComplex* B,
|
| 670 |
+
size_t ldb,
|
| 671 |
+
cuComplex* C,
|
| 672 |
+
size_t ldc);
|
| 673 |
+
|
| 674 |
+
cublasStatus_t CUBLASWINAPI cublasXtZtrmm(cublasXtHandle_t handle,
|
| 675 |
+
cublasSideMode_t side,
|
| 676 |
+
cublasFillMode_t uplo,
|
| 677 |
+
cublasOperation_t trans,
|
| 678 |
+
cublasDiagType_t diag,
|
| 679 |
+
size_t m,
|
| 680 |
+
size_t n,
|
| 681 |
+
const cuDoubleComplex* alpha,
|
| 682 |
+
const cuDoubleComplex* A,
|
| 683 |
+
size_t lda,
|
| 684 |
+
const cuDoubleComplex* B,
|
| 685 |
+
size_t ldb,
|
| 686 |
+
cuDoubleComplex* C,
|
| 687 |
+
size_t ldc);
|
| 688 |
+
|
| 689 |
+
#if defined(__cplusplus)
|
| 690 |
+
}
|
| 691 |
+
#endif /* __cplusplus */
|
| 692 |
+
|
| 693 |
+
#endif /* !defined(CUBLAS_XT_H_) */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h
ADDED
|
@@ -0,0 +1,824 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(NVBLAS_H_)
|
| 51 |
+
#define NVBLAS_H_
|
| 52 |
+
|
| 53 |
+
#include "driver_types.h"
|
| 54 |
+
#include "cuComplex.h" /* import complex data type */
|
| 55 |
+
|
| 56 |
+
#if defined(__cplusplus)
|
| 57 |
+
extern "C" {
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
/* GEMM */
|
| 61 |
+
void sgemm_(const char* transa,
|
| 62 |
+
const char* transb,
|
| 63 |
+
const int* m,
|
| 64 |
+
const int* n,
|
| 65 |
+
const int* k,
|
| 66 |
+
const float* alpha,
|
| 67 |
+
const float* a,
|
| 68 |
+
const int* lda,
|
| 69 |
+
const float* b,
|
| 70 |
+
const int* ldb,
|
| 71 |
+
const float* beta,
|
| 72 |
+
float* c,
|
| 73 |
+
const int* ldc);
|
| 74 |
+
|
| 75 |
+
void dgemm_(const char* transa,
|
| 76 |
+
const char* transb,
|
| 77 |
+
const int* m,
|
| 78 |
+
const int* n,
|
| 79 |
+
const int* k,
|
| 80 |
+
const double* alpha,
|
| 81 |
+
const double* a,
|
| 82 |
+
const int* lda,
|
| 83 |
+
const double* b,
|
| 84 |
+
const int* ldb,
|
| 85 |
+
const double* beta,
|
| 86 |
+
double* c,
|
| 87 |
+
const int* ldc);
|
| 88 |
+
|
| 89 |
+
void cgemm_(const char* transa,
|
| 90 |
+
const char* transb,
|
| 91 |
+
const int* m,
|
| 92 |
+
const int* n,
|
| 93 |
+
const int* k,
|
| 94 |
+
const cuComplex* alpha,
|
| 95 |
+
const cuComplex* a,
|
| 96 |
+
const int* lda,
|
| 97 |
+
const cuComplex* b,
|
| 98 |
+
const int* ldb,
|
| 99 |
+
const cuComplex* beta,
|
| 100 |
+
cuComplex* c,
|
| 101 |
+
const int* ldc);
|
| 102 |
+
|
| 103 |
+
void zgemm_(const char* transa,
|
| 104 |
+
const char* transb,
|
| 105 |
+
const int* m,
|
| 106 |
+
const int* n,
|
| 107 |
+
const int* k,
|
| 108 |
+
const cuDoubleComplex* alpha,
|
| 109 |
+
const cuDoubleComplex* a,
|
| 110 |
+
const int* lda,
|
| 111 |
+
const cuDoubleComplex* b,
|
| 112 |
+
const int* ldb,
|
| 113 |
+
const cuDoubleComplex* beta,
|
| 114 |
+
cuDoubleComplex* c,
|
| 115 |
+
const int* ldc);
|
| 116 |
+
|
| 117 |
+
void sgemm(const char* transa,
|
| 118 |
+
const char* transb,
|
| 119 |
+
const int* m,
|
| 120 |
+
const int* n,
|
| 121 |
+
const int* k,
|
| 122 |
+
const float* alpha,
|
| 123 |
+
const float* a,
|
| 124 |
+
const int* lda,
|
| 125 |
+
const float* b,
|
| 126 |
+
const int* ldb,
|
| 127 |
+
const float* beta,
|
| 128 |
+
float* c,
|
| 129 |
+
const int* ldc);
|
| 130 |
+
|
| 131 |
+
void dgemm(const char* transa,
|
| 132 |
+
const char* transb,
|
| 133 |
+
const int* m,
|
| 134 |
+
const int* n,
|
| 135 |
+
const int* k,
|
| 136 |
+
const double* alpha,
|
| 137 |
+
const double* a,
|
| 138 |
+
const int* lda,
|
| 139 |
+
const double* b,
|
| 140 |
+
const int* ldb,
|
| 141 |
+
const double* beta,
|
| 142 |
+
double* c,
|
| 143 |
+
const int* ldc);
|
| 144 |
+
|
| 145 |
+
void cgemm(const char* transa,
|
| 146 |
+
const char* transb,
|
| 147 |
+
const int* m,
|
| 148 |
+
const int* n,
|
| 149 |
+
const int* k,
|
| 150 |
+
const cuComplex* alpha,
|
| 151 |
+
const cuComplex* a,
|
| 152 |
+
const int* lda,
|
| 153 |
+
const cuComplex* b,
|
| 154 |
+
const int* ldb,
|
| 155 |
+
const cuComplex* beta,
|
| 156 |
+
cuComplex* c,
|
| 157 |
+
const int* ldc);
|
| 158 |
+
|
| 159 |
+
void zgemm(const char* transa,
|
| 160 |
+
const char* transb,
|
| 161 |
+
const int* m,
|
| 162 |
+
const int* n,
|
| 163 |
+
const int* k,
|
| 164 |
+
const cuDoubleComplex* alpha,
|
| 165 |
+
const cuDoubleComplex* a,
|
| 166 |
+
const int* lda,
|
| 167 |
+
const cuDoubleComplex* b,
|
| 168 |
+
const int* ldb,
|
| 169 |
+
const cuDoubleComplex* beta,
|
| 170 |
+
cuDoubleComplex* c,
|
| 171 |
+
const int* ldc);
|
| 172 |
+
|
| 173 |
+
/* SYRK */
|
| 174 |
+
void ssyrk_(const char* uplo,
|
| 175 |
+
const char* trans,
|
| 176 |
+
const int* n,
|
| 177 |
+
const int* k,
|
| 178 |
+
const float* alpha,
|
| 179 |
+
const float* a,
|
| 180 |
+
const int* lda,
|
| 181 |
+
const float* beta,
|
| 182 |
+
float* c,
|
| 183 |
+
const int* ldc);
|
| 184 |
+
|
| 185 |
+
void dsyrk_(const char* uplo,
|
| 186 |
+
const char* trans,
|
| 187 |
+
const int* n,
|
| 188 |
+
const int* k,
|
| 189 |
+
const double* alpha,
|
| 190 |
+
const double* a,
|
| 191 |
+
const int* lda,
|
| 192 |
+
const double* beta,
|
| 193 |
+
double* c,
|
| 194 |
+
const int* ldc);
|
| 195 |
+
|
| 196 |
+
void csyrk_(const char* uplo,
|
| 197 |
+
const char* trans,
|
| 198 |
+
const int* n,
|
| 199 |
+
const int* k,
|
| 200 |
+
const cuComplex* alpha,
|
| 201 |
+
const cuComplex* a,
|
| 202 |
+
const int* lda,
|
| 203 |
+
const cuComplex* beta,
|
| 204 |
+
cuComplex* c,
|
| 205 |
+
const int* ldc);
|
| 206 |
+
|
| 207 |
+
void zsyrk_(const char* uplo,
|
| 208 |
+
const char* trans,
|
| 209 |
+
const int* n,
|
| 210 |
+
const int* k,
|
| 211 |
+
const cuDoubleComplex* alpha,
|
| 212 |
+
const cuDoubleComplex* a,
|
| 213 |
+
const int* lda,
|
| 214 |
+
const cuDoubleComplex* beta,
|
| 215 |
+
cuDoubleComplex* c,
|
| 216 |
+
const int* ldc);
|
| 217 |
+
|
| 218 |
+
void ssyrk(const char* uplo,
|
| 219 |
+
const char* trans,
|
| 220 |
+
const int* n,
|
| 221 |
+
const int* k,
|
| 222 |
+
const float* alpha,
|
| 223 |
+
const float* a,
|
| 224 |
+
const int* lda,
|
| 225 |
+
const float* beta,
|
| 226 |
+
float* c,
|
| 227 |
+
const int* ldc);
|
| 228 |
+
|
| 229 |
+
void dsyrk(const char* uplo,
|
| 230 |
+
const char* trans,
|
| 231 |
+
const int* n,
|
| 232 |
+
const int* k,
|
| 233 |
+
const double* alpha,
|
| 234 |
+
const double* a,
|
| 235 |
+
const int* lda,
|
| 236 |
+
const double* beta,
|
| 237 |
+
double* c,
|
| 238 |
+
const int* ldc);
|
| 239 |
+
|
| 240 |
+
void csyrk(const char* uplo,
|
| 241 |
+
const char* trans,
|
| 242 |
+
const int* n,
|
| 243 |
+
const int* k,
|
| 244 |
+
const cuComplex* alpha,
|
| 245 |
+
const cuComplex* a,
|
| 246 |
+
const int* lda,
|
| 247 |
+
const cuComplex* beta,
|
| 248 |
+
cuComplex* c,
|
| 249 |
+
const int* ldc);
|
| 250 |
+
|
| 251 |
+
void zsyrk(const char* uplo,
|
| 252 |
+
const char* trans,
|
| 253 |
+
const int* n,
|
| 254 |
+
const int* k,
|
| 255 |
+
const cuDoubleComplex* alpha,
|
| 256 |
+
const cuDoubleComplex* a,
|
| 257 |
+
const int* lda,
|
| 258 |
+
const cuDoubleComplex* beta,
|
| 259 |
+
cuDoubleComplex* c,
|
| 260 |
+
const int* ldc);
|
| 261 |
+
|
| 262 |
+
/* HERK */
|
| 263 |
+
void cherk_(const char* uplo,
|
| 264 |
+
const char* trans,
|
| 265 |
+
const int* n,
|
| 266 |
+
const int* k,
|
| 267 |
+
const float* alpha,
|
| 268 |
+
const cuComplex* a,
|
| 269 |
+
const int* lda,
|
| 270 |
+
const float* beta,
|
| 271 |
+
cuComplex* c,
|
| 272 |
+
const int* ldc);
|
| 273 |
+
|
| 274 |
+
void zherk_(const char* uplo,
|
| 275 |
+
const char* trans,
|
| 276 |
+
const int* n,
|
| 277 |
+
const int* k,
|
| 278 |
+
const double* alpha,
|
| 279 |
+
const cuDoubleComplex* a,
|
| 280 |
+
const int* lda,
|
| 281 |
+
const double* beta,
|
| 282 |
+
cuDoubleComplex* c,
|
| 283 |
+
const int* ldc);
|
| 284 |
+
|
| 285 |
+
void cherk(const char* uplo,
|
| 286 |
+
const char* trans,
|
| 287 |
+
const int* n,
|
| 288 |
+
const int* k,
|
| 289 |
+
const float* alpha,
|
| 290 |
+
const cuComplex* a,
|
| 291 |
+
const int* lda,
|
| 292 |
+
const float* beta,
|
| 293 |
+
cuComplex* c,
|
| 294 |
+
const int* ldc);
|
| 295 |
+
|
| 296 |
+
void zherk(const char* uplo,
|
| 297 |
+
const char* trans,
|
| 298 |
+
const int* n,
|
| 299 |
+
const int* k,
|
| 300 |
+
const double* alpha,
|
| 301 |
+
const cuDoubleComplex* a,
|
| 302 |
+
const int* lda,
|
| 303 |
+
const double* beta,
|
| 304 |
+
cuDoubleComplex* c,
|
| 305 |
+
const int* ldc);
|
| 306 |
+
|
| 307 |
+
/* TRSM */
|
| 308 |
+
void strsm_(const char* side,
|
| 309 |
+
const char* uplo,
|
| 310 |
+
const char* transa,
|
| 311 |
+
const char* diag,
|
| 312 |
+
const int* m,
|
| 313 |
+
const int* n,
|
| 314 |
+
const float* alpha,
|
| 315 |
+
const float* a,
|
| 316 |
+
const int* lda,
|
| 317 |
+
float* b,
|
| 318 |
+
const int* ldb);
|
| 319 |
+
|
| 320 |
+
void dtrsm_(const char* side,
|
| 321 |
+
const char* uplo,
|
| 322 |
+
const char* transa,
|
| 323 |
+
const char* diag,
|
| 324 |
+
const int* m,
|
| 325 |
+
const int* n,
|
| 326 |
+
const double* alpha,
|
| 327 |
+
const double* a,
|
| 328 |
+
const int* lda,
|
| 329 |
+
double* b,
|
| 330 |
+
const int* ldb);
|
| 331 |
+
|
| 332 |
+
void ctrsm_(const char* side,
|
| 333 |
+
const char* uplo,
|
| 334 |
+
const char* transa,
|
| 335 |
+
const char* diag,
|
| 336 |
+
const int* m,
|
| 337 |
+
const int* n,
|
| 338 |
+
const cuComplex* alpha,
|
| 339 |
+
const cuComplex* a,
|
| 340 |
+
const int* lda,
|
| 341 |
+
cuComplex* b,
|
| 342 |
+
const int* ldb);
|
| 343 |
+
|
| 344 |
+
void ztrsm_(const char* side,
|
| 345 |
+
const char* uplo,
|
| 346 |
+
const char* transa,
|
| 347 |
+
const char* diag,
|
| 348 |
+
const int* m,
|
| 349 |
+
const int* n,
|
| 350 |
+
const cuDoubleComplex* alpha,
|
| 351 |
+
const cuDoubleComplex* a,
|
| 352 |
+
const int* lda,
|
| 353 |
+
cuDoubleComplex* b,
|
| 354 |
+
const int* ldb);
|
| 355 |
+
|
| 356 |
+
void strsm(const char* side,
|
| 357 |
+
const char* uplo,
|
| 358 |
+
const char* transa,
|
| 359 |
+
const char* diag,
|
| 360 |
+
const int* m,
|
| 361 |
+
const int* n,
|
| 362 |
+
const float* alpha,
|
| 363 |
+
const float* a,
|
| 364 |
+
const int* lda,
|
| 365 |
+
float* b,
|
| 366 |
+
const int* ldb);
|
| 367 |
+
|
| 368 |
+
void dtrsm(const char* side,
|
| 369 |
+
const char* uplo,
|
| 370 |
+
const char* transa,
|
| 371 |
+
const char* diag,
|
| 372 |
+
const int* m,
|
| 373 |
+
const int* n,
|
| 374 |
+
const double* alpha,
|
| 375 |
+
const double* a,
|
| 376 |
+
const int* lda,
|
| 377 |
+
double* b,
|
| 378 |
+
const int* ldb);
|
| 379 |
+
|
| 380 |
+
void ctrsm(const char* side,
|
| 381 |
+
const char* uplo,
|
| 382 |
+
const char* transa,
|
| 383 |
+
const char* diag,
|
| 384 |
+
const int* m,
|
| 385 |
+
const int* n,
|
| 386 |
+
const cuComplex* alpha,
|
| 387 |
+
const cuComplex* a,
|
| 388 |
+
const int* lda,
|
| 389 |
+
cuComplex* b,
|
| 390 |
+
const int* ldb);
|
| 391 |
+
|
| 392 |
+
void ztrsm(const char* side,
|
| 393 |
+
const char* uplo,
|
| 394 |
+
const char* transa,
|
| 395 |
+
const char* diag,
|
| 396 |
+
const int* m,
|
| 397 |
+
const int* n,
|
| 398 |
+
const cuDoubleComplex* alpha,
|
| 399 |
+
const cuDoubleComplex* a,
|
| 400 |
+
const int* lda,
|
| 401 |
+
cuDoubleComplex* b,
|
| 402 |
+
const int* ldb);
|
| 403 |
+
|
| 404 |
+
/* SYMM */
|
| 405 |
+
void ssymm_(const char* side,
|
| 406 |
+
const char* uplo,
|
| 407 |
+
const int* m,
|
| 408 |
+
const int* n,
|
| 409 |
+
const float* alpha,
|
| 410 |
+
const float* a,
|
| 411 |
+
const int* lda,
|
| 412 |
+
const float* b,
|
| 413 |
+
const int* ldb,
|
| 414 |
+
const float* beta,
|
| 415 |
+
float* c,
|
| 416 |
+
const int* ldc);
|
| 417 |
+
|
| 418 |
+
void dsymm_(const char* side,
|
| 419 |
+
const char* uplo,
|
| 420 |
+
const int* m,
|
| 421 |
+
const int* n,
|
| 422 |
+
const double* alpha,
|
| 423 |
+
const double* a,
|
| 424 |
+
const int* lda,
|
| 425 |
+
const double* b,
|
| 426 |
+
const int* ldb,
|
| 427 |
+
const double* beta,
|
| 428 |
+
double* c,
|
| 429 |
+
const int* ldc);
|
| 430 |
+
|
| 431 |
+
void csymm_(const char* side,
|
| 432 |
+
const char* uplo,
|
| 433 |
+
const int* m,
|
| 434 |
+
const int* n,
|
| 435 |
+
const cuComplex* alpha,
|
| 436 |
+
const cuComplex* a,
|
| 437 |
+
const int* lda,
|
| 438 |
+
const cuComplex* b,
|
| 439 |
+
const int* ldb,
|
| 440 |
+
const cuComplex* beta,
|
| 441 |
+
cuComplex* c,
|
| 442 |
+
const int* ldc);
|
| 443 |
+
|
| 444 |
+
void zsymm_(const char* side,
|
| 445 |
+
const char* uplo,
|
| 446 |
+
const int* m,
|
| 447 |
+
const int* n,
|
| 448 |
+
const cuDoubleComplex* alpha,
|
| 449 |
+
const cuDoubleComplex* a,
|
| 450 |
+
const int* lda,
|
| 451 |
+
const cuDoubleComplex* b,
|
| 452 |
+
const int* ldb,
|
| 453 |
+
const cuDoubleComplex* beta,
|
| 454 |
+
cuDoubleComplex* c,
|
| 455 |
+
const int* ldc);
|
| 456 |
+
|
| 457 |
+
void ssymm(const char* side,
|
| 458 |
+
const char* uplo,
|
| 459 |
+
const int* m,
|
| 460 |
+
const int* n,
|
| 461 |
+
const float* alpha,
|
| 462 |
+
const float* a,
|
| 463 |
+
const int* lda,
|
| 464 |
+
const float* b,
|
| 465 |
+
const int* ldb,
|
| 466 |
+
const float* beta,
|
| 467 |
+
float* c,
|
| 468 |
+
const int* ldc);
|
| 469 |
+
|
| 470 |
+
void dsymm(const char* side,
|
| 471 |
+
const char* uplo,
|
| 472 |
+
const int* m,
|
| 473 |
+
const int* n,
|
| 474 |
+
const double* alpha,
|
| 475 |
+
const double* a,
|
| 476 |
+
const int* lda,
|
| 477 |
+
const double* b,
|
| 478 |
+
const int* ldb,
|
| 479 |
+
const double* beta,
|
| 480 |
+
double* c,
|
| 481 |
+
const int* ldc);
|
| 482 |
+
|
| 483 |
+
void csymm(const char* side,
|
| 484 |
+
const char* uplo,
|
| 485 |
+
const int* m,
|
| 486 |
+
const int* n,
|
| 487 |
+
const cuComplex* alpha,
|
| 488 |
+
const cuComplex* a,
|
| 489 |
+
const int* lda,
|
| 490 |
+
const cuComplex* b,
|
| 491 |
+
const int* ldb,
|
| 492 |
+
const cuComplex* beta,
|
| 493 |
+
cuComplex* c,
|
| 494 |
+
const int* ldc);
|
| 495 |
+
|
| 496 |
+
void zsymm(const char* side,
|
| 497 |
+
const char* uplo,
|
| 498 |
+
const int* m,
|
| 499 |
+
const int* n,
|
| 500 |
+
const cuDoubleComplex* alpha,
|
| 501 |
+
const cuDoubleComplex* a,
|
| 502 |
+
const int* lda,
|
| 503 |
+
const cuDoubleComplex* b,
|
| 504 |
+
const int* ldb,
|
| 505 |
+
const cuDoubleComplex* beta,
|
| 506 |
+
cuDoubleComplex* c,
|
| 507 |
+
const int* ldc);
|
| 508 |
+
|
| 509 |
+
/* HEMM */
|
| 510 |
+
void chemm_(const char* side,
|
| 511 |
+
const char* uplo,
|
| 512 |
+
const int* m,
|
| 513 |
+
const int* n,
|
| 514 |
+
const cuComplex* alpha,
|
| 515 |
+
const cuComplex* a,
|
| 516 |
+
const int* lda,
|
| 517 |
+
const cuComplex* b,
|
| 518 |
+
const int* ldb,
|
| 519 |
+
const cuComplex* beta,
|
| 520 |
+
cuComplex* c,
|
| 521 |
+
const int* ldc);
|
| 522 |
+
|
| 523 |
+
void zhemm_(const char* side,
|
| 524 |
+
const char* uplo,
|
| 525 |
+
const int* m,
|
| 526 |
+
const int* n,
|
| 527 |
+
const cuDoubleComplex* alpha,
|
| 528 |
+
const cuDoubleComplex* a,
|
| 529 |
+
const int* lda,
|
| 530 |
+
const cuDoubleComplex* b,
|
| 531 |
+
const int* ldb,
|
| 532 |
+
const cuDoubleComplex* beta,
|
| 533 |
+
cuDoubleComplex* c,
|
| 534 |
+
const int* ldc);
|
| 535 |
+
|
| 536 |
+
/* HEMM with no underscore*/
|
| 537 |
+
void chemm(const char* side,
|
| 538 |
+
const char* uplo,
|
| 539 |
+
const int* m,
|
| 540 |
+
const int* n,
|
| 541 |
+
const cuComplex* alpha,
|
| 542 |
+
const cuComplex* a,
|
| 543 |
+
const int* lda,
|
| 544 |
+
const cuComplex* b,
|
| 545 |
+
const int* ldb,
|
| 546 |
+
const cuComplex* beta,
|
| 547 |
+
cuComplex* c,
|
| 548 |
+
const int* ldc);
|
| 549 |
+
|
| 550 |
+
void zhemm(const char* side,
|
| 551 |
+
const char* uplo,
|
| 552 |
+
const int* m,
|
| 553 |
+
const int* n,
|
| 554 |
+
const cuDoubleComplex* alpha,
|
| 555 |
+
const cuDoubleComplex* a,
|
| 556 |
+
const int* lda,
|
| 557 |
+
const cuDoubleComplex* b,
|
| 558 |
+
const int* ldb,
|
| 559 |
+
const cuDoubleComplex* beta,
|
| 560 |
+
cuDoubleComplex* c,
|
| 561 |
+
const int* ldc);
|
| 562 |
+
|
| 563 |
+
/* SYR2K */
|
| 564 |
+
void ssyr2k_(const char* uplo,
|
| 565 |
+
const char* trans,
|
| 566 |
+
const int* n,
|
| 567 |
+
const int* k,
|
| 568 |
+
const float* alpha,
|
| 569 |
+
const float* a,
|
| 570 |
+
const int* lda,
|
| 571 |
+
const float* b,
|
| 572 |
+
const int* ldb,
|
| 573 |
+
const float* beta,
|
| 574 |
+
float* c,
|
| 575 |
+
const int* ldc);
|
| 576 |
+
|
| 577 |
+
void dsyr2k_(const char* uplo,
|
| 578 |
+
const char* trans,
|
| 579 |
+
const int* n,
|
| 580 |
+
const int* k,
|
| 581 |
+
const double* alpha,
|
| 582 |
+
const double* a,
|
| 583 |
+
const int* lda,
|
| 584 |
+
const double* b,
|
| 585 |
+
const int* ldb,
|
| 586 |
+
const double* beta,
|
| 587 |
+
double* c,
|
| 588 |
+
const int* ldc);
|
| 589 |
+
|
| 590 |
+
void csyr2k_(const char* uplo,
|
| 591 |
+
const char* trans,
|
| 592 |
+
const int* n,
|
| 593 |
+
const int* k,
|
| 594 |
+
const cuComplex* alpha,
|
| 595 |
+
const cuComplex* a,
|
| 596 |
+
const int* lda,
|
| 597 |
+
const cuComplex* b,
|
| 598 |
+
const int* ldb,
|
| 599 |
+
const cuComplex* beta,
|
| 600 |
+
cuComplex* c,
|
| 601 |
+
const int* ldc);
|
| 602 |
+
|
| 603 |
+
void zsyr2k_(const char* uplo,
|
| 604 |
+
const char* trans,
|
| 605 |
+
const int* n,
|
| 606 |
+
const int* k,
|
| 607 |
+
const cuDoubleComplex* alpha,
|
| 608 |
+
const cuDoubleComplex* a,
|
| 609 |
+
const int* lda,
|
| 610 |
+
const cuDoubleComplex* b,
|
| 611 |
+
const int* ldb,
|
| 612 |
+
const cuDoubleComplex* beta,
|
| 613 |
+
cuDoubleComplex* c,
|
| 614 |
+
const int* ldc);
|
| 615 |
+
|
| 616 |
+
/* SYR2K no_underscore*/
|
| 617 |
+
void ssyr2k(const char* uplo,
|
| 618 |
+
const char* trans,
|
| 619 |
+
const int* n,
|
| 620 |
+
const int* k,
|
| 621 |
+
const float* alpha,
|
| 622 |
+
const float* a,
|
| 623 |
+
const int* lda,
|
| 624 |
+
const float* b,
|
| 625 |
+
const int* ldb,
|
| 626 |
+
const float* beta,
|
| 627 |
+
float* c,
|
| 628 |
+
const int* ldc);
|
| 629 |
+
|
| 630 |
+
void dsyr2k(const char* uplo,
|
| 631 |
+
const char* trans,
|
| 632 |
+
const int* n,
|
| 633 |
+
const int* k,
|
| 634 |
+
const double* alpha,
|
| 635 |
+
const double* a,
|
| 636 |
+
const int* lda,
|
| 637 |
+
const double* b,
|
| 638 |
+
const int* ldb,
|
| 639 |
+
const double* beta,
|
| 640 |
+
double* c,
|
| 641 |
+
const int* ldc);
|
| 642 |
+
|
| 643 |
+
void csyr2k(const char* uplo,
|
| 644 |
+
const char* trans,
|
| 645 |
+
const int* n,
|
| 646 |
+
const int* k,
|
| 647 |
+
const cuComplex* alpha,
|
| 648 |
+
const cuComplex* a,
|
| 649 |
+
const int* lda,
|
| 650 |
+
const cuComplex* b,
|
| 651 |
+
const int* ldb,
|
| 652 |
+
const cuComplex* beta,
|
| 653 |
+
cuComplex* c,
|
| 654 |
+
const int* ldc);
|
| 655 |
+
|
| 656 |
+
void zsyr2k(const char* uplo,
|
| 657 |
+
const char* trans,
|
| 658 |
+
const int* n,
|
| 659 |
+
const int* k,
|
| 660 |
+
const cuDoubleComplex* alpha,
|
| 661 |
+
const cuDoubleComplex* a,
|
| 662 |
+
const int* lda,
|
| 663 |
+
const cuDoubleComplex* b,
|
| 664 |
+
const int* ldb,
|
| 665 |
+
const cuDoubleComplex* beta,
|
| 666 |
+
cuDoubleComplex* c,
|
| 667 |
+
const int* ldc);
|
| 668 |
+
|
| 669 |
+
/* HERK */
|
| 670 |
+
void cher2k_(const char* uplo,
|
| 671 |
+
const char* trans,
|
| 672 |
+
const int* n,
|
| 673 |
+
const int* k,
|
| 674 |
+
const cuComplex* alpha,
|
| 675 |
+
const cuComplex* a,
|
| 676 |
+
const int* lda,
|
| 677 |
+
const cuComplex* b,
|
| 678 |
+
const int* ldb,
|
| 679 |
+
const float* beta,
|
| 680 |
+
cuComplex* c,
|
| 681 |
+
const int* ldc);
|
| 682 |
+
|
| 683 |
+
void zher2k_(const char* uplo,
|
| 684 |
+
const char* trans,
|
| 685 |
+
const int* n,
|
| 686 |
+
const int* k,
|
| 687 |
+
const cuDoubleComplex* alpha,
|
| 688 |
+
const cuDoubleComplex* a,
|
| 689 |
+
const int* lda,
|
| 690 |
+
const cuDoubleComplex* b,
|
| 691 |
+
const int* ldb,
|
| 692 |
+
const double* beta,
|
| 693 |
+
cuDoubleComplex* c,
|
| 694 |
+
const int* ldc);
|
| 695 |
+
|
| 696 |
+
/* HER2K with no underscore */
|
| 697 |
+
void cher2k(const char* uplo,
|
| 698 |
+
const char* trans,
|
| 699 |
+
const int* n,
|
| 700 |
+
const int* k,
|
| 701 |
+
const cuComplex* alpha,
|
| 702 |
+
const cuComplex* a,
|
| 703 |
+
const int* lda,
|
| 704 |
+
const cuComplex* b,
|
| 705 |
+
const int* ldb,
|
| 706 |
+
const float* beta,
|
| 707 |
+
cuComplex* c,
|
| 708 |
+
const int* ldc);
|
| 709 |
+
|
| 710 |
+
void zher2k(const char* uplo,
|
| 711 |
+
const char* trans,
|
| 712 |
+
const int* n,
|
| 713 |
+
const int* k,
|
| 714 |
+
const cuDoubleComplex* alpha,
|
| 715 |
+
const cuDoubleComplex* a,
|
| 716 |
+
const int* lda,
|
| 717 |
+
const cuDoubleComplex* b,
|
| 718 |
+
const int* ldb,
|
| 719 |
+
const double* beta,
|
| 720 |
+
cuDoubleComplex* c,
|
| 721 |
+
const int* ldc);
|
| 722 |
+
|
| 723 |
+
/* TRMM */
|
| 724 |
+
void strmm_(const char* side,
|
| 725 |
+
const char* uplo,
|
| 726 |
+
const char* transa,
|
| 727 |
+
const char* diag,
|
| 728 |
+
const int* m,
|
| 729 |
+
const int* n,
|
| 730 |
+
const float* alpha,
|
| 731 |
+
const float* a,
|
| 732 |
+
const int* lda,
|
| 733 |
+
float* b,
|
| 734 |
+
const int* ldb);
|
| 735 |
+
|
| 736 |
+
void dtrmm_(const char* side,
|
| 737 |
+
const char* uplo,
|
| 738 |
+
const char* transa,
|
| 739 |
+
const char* diag,
|
| 740 |
+
const int* m,
|
| 741 |
+
const int* n,
|
| 742 |
+
const double* alpha,
|
| 743 |
+
const double* a,
|
| 744 |
+
const int* lda,
|
| 745 |
+
double* b,
|
| 746 |
+
const int* ldb);
|
| 747 |
+
|
| 748 |
+
void ctrmm_(const char* side,
|
| 749 |
+
const char* uplo,
|
| 750 |
+
const char* transa,
|
| 751 |
+
const char* diag,
|
| 752 |
+
const int* m,
|
| 753 |
+
const int* n,
|
| 754 |
+
const cuComplex* alpha,
|
| 755 |
+
const cuComplex* a,
|
| 756 |
+
const int* lda,
|
| 757 |
+
cuComplex* b,
|
| 758 |
+
const int* ldb);
|
| 759 |
+
|
| 760 |
+
void ztrmm_(const char* side,
|
| 761 |
+
const char* uplo,
|
| 762 |
+
const char* transa,
|
| 763 |
+
const char* diag,
|
| 764 |
+
const int* m,
|
| 765 |
+
const int* n,
|
| 766 |
+
const cuDoubleComplex* alpha,
|
| 767 |
+
const cuDoubleComplex* a,
|
| 768 |
+
const int* lda,
|
| 769 |
+
cuDoubleComplex* b,
|
| 770 |
+
const int* ldb);
|
| 771 |
+
|
| 772 |
+
void strmm(const char* side,
|
| 773 |
+
const char* uplo,
|
| 774 |
+
const char* transa,
|
| 775 |
+
const char* diag,
|
| 776 |
+
const int* m,
|
| 777 |
+
const int* n,
|
| 778 |
+
const float* alpha,
|
| 779 |
+
const float* a,
|
| 780 |
+
const int* lda,
|
| 781 |
+
float* b,
|
| 782 |
+
const int* ldb);
|
| 783 |
+
|
| 784 |
+
void dtrmm(const char* side,
|
| 785 |
+
const char* uplo,
|
| 786 |
+
const char* transa,
|
| 787 |
+
const char* diag,
|
| 788 |
+
const int* m,
|
| 789 |
+
const int* n,
|
| 790 |
+
const double* alpha,
|
| 791 |
+
const double* a,
|
| 792 |
+
const int* lda,
|
| 793 |
+
double* b,
|
| 794 |
+
const int* ldb);
|
| 795 |
+
|
| 796 |
+
void ctrmm(const char* side,
|
| 797 |
+
const char* uplo,
|
| 798 |
+
const char* transa,
|
| 799 |
+
const char* diag,
|
| 800 |
+
const int* m,
|
| 801 |
+
const int* n,
|
| 802 |
+
const cuComplex* alpha,
|
| 803 |
+
const cuComplex* a,
|
| 804 |
+
const int* lda,
|
| 805 |
+
cuComplex* b,
|
| 806 |
+
const int* ldb);
|
| 807 |
+
|
| 808 |
+
void ztrmm(const char* side,
|
| 809 |
+
const char* uplo,
|
| 810 |
+
const char* transa,
|
| 811 |
+
const char* diag,
|
| 812 |
+
const int* m,
|
| 813 |
+
const int* n,
|
| 814 |
+
const cuDoubleComplex* alpha,
|
| 815 |
+
const cuDoubleComplex* a,
|
| 816 |
+
const int* lda,
|
| 817 |
+
cuDoubleComplex* b,
|
| 818 |
+
const int* ldb);
|
| 819 |
+
|
| 820 |
+
#if defined(__cplusplus)
|
| 821 |
+
}
|
| 822 |
+
#endif /* __cplusplus */
|
| 823 |
+
|
| 824 |
+
#endif /* !defined(NVBLAS_H_) */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (218 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py
ADDED
|
File without changes
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (218 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py
ADDED
|
File without changes
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (226 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h
ADDED
|
@@ -0,0 +1,758 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// NVIDIA_COPYRIGHT_BEGIN
|
| 3 |
+
//
|
| 4 |
+
// Copyright (c) 2014-2022, NVIDIA CORPORATION. All rights reserved.
|
| 5 |
+
//
|
| 6 |
+
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 7 |
+
// and proprietary rights in and to this software, related documentation
|
| 8 |
+
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 9 |
+
// distribution of this software and related documentation without an express
|
| 10 |
+
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 11 |
+
//
|
| 12 |
+
// NVIDIA_COPYRIGHT_END
|
| 13 |
+
//
|
| 14 |
+
|
| 15 |
+
#ifndef __NVRTC_H__
|
| 16 |
+
#define __NVRTC_H__
|
| 17 |
+
|
| 18 |
+
#ifdef __cplusplus
|
| 19 |
+
extern "C" {
|
| 20 |
+
#endif /* __cplusplus */
|
| 21 |
+
|
| 22 |
+
#include <stdlib.h>
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
/*************************************************************************//**
|
| 26 |
+
*
|
| 27 |
+
* \defgroup error Error Handling
|
| 28 |
+
*
|
| 29 |
+
* NVRTC defines the following enumeration type and function for API call
|
| 30 |
+
* error handling.
|
| 31 |
+
*
|
| 32 |
+
****************************************************************************/
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
/**
|
| 36 |
+
* \ingroup error
|
| 37 |
+
* \brief The enumerated type nvrtcResult defines API call result codes.
|
| 38 |
+
* NVRTC API functions return nvrtcResult to indicate the call
|
| 39 |
+
* result.
|
| 40 |
+
*/
|
| 41 |
+
typedef enum {
|
| 42 |
+
NVRTC_SUCCESS = 0,
|
| 43 |
+
NVRTC_ERROR_OUT_OF_MEMORY = 1,
|
| 44 |
+
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
|
| 45 |
+
NVRTC_ERROR_INVALID_INPUT = 3,
|
| 46 |
+
NVRTC_ERROR_INVALID_PROGRAM = 4,
|
| 47 |
+
NVRTC_ERROR_INVALID_OPTION = 5,
|
| 48 |
+
NVRTC_ERROR_COMPILATION = 6,
|
| 49 |
+
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
|
| 50 |
+
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
|
| 51 |
+
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
|
| 52 |
+
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
|
| 53 |
+
NVRTC_ERROR_INTERNAL_ERROR = 11
|
| 54 |
+
} nvrtcResult;
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
/**
|
| 58 |
+
* \ingroup error
|
| 59 |
+
* \brief nvrtcGetErrorString is a helper function that returns a string
|
| 60 |
+
* describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
|
| 61 |
+
* \c "NVRTC_SUCCESS".
|
| 62 |
+
* For unrecognized enumeration values, it returns
|
| 63 |
+
* \c "NVRTC_ERROR unknown".
|
| 64 |
+
*
|
| 65 |
+
* \param [in] result CUDA Runtime Compilation API result code.
|
| 66 |
+
* \return Message string for the given #nvrtcResult code.
|
| 67 |
+
*/
|
| 68 |
+
const char *nvrtcGetErrorString(nvrtcResult result);
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
/*************************************************************************//**
|
| 72 |
+
*
|
| 73 |
+
* \defgroup query General Information Query
|
| 74 |
+
*
|
| 75 |
+
* NVRTC defines the following function for general information query.
|
| 76 |
+
*
|
| 77 |
+
****************************************************************************/
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
/**
|
| 81 |
+
* \ingroup query
|
| 82 |
+
* \brief nvrtcVersion sets the output parameters \p major and \p minor
|
| 83 |
+
* with the CUDA Runtime Compilation version number.
|
| 84 |
+
*
|
| 85 |
+
* \param [out] major CUDA Runtime Compilation major version number.
|
| 86 |
+
* \param [out] minor CUDA Runtime Compilation minor version number.
|
| 87 |
+
* \return
|
| 88 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 89 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 90 |
+
*
|
| 91 |
+
*/
|
| 92 |
+
nvrtcResult nvrtcVersion(int *major, int *minor);
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
/**
|
| 96 |
+
* \ingroup query
|
| 97 |
+
* \brief nvrtcGetNumSupportedArchs sets the output parameter \p numArchs
|
| 98 |
+
* with the number of architectures supported by NVRTC. This can
|
| 99 |
+
* then be used to pass an array to ::nvrtcGetSupportedArchs to
|
| 100 |
+
* get the supported architectures.
|
| 101 |
+
*
|
| 102 |
+
* \param [out] numArchs number of supported architectures.
|
| 103 |
+
* \return
|
| 104 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 105 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 106 |
+
*
|
| 107 |
+
* see ::nvrtcGetSupportedArchs
|
| 108 |
+
*/
|
| 109 |
+
nvrtcResult nvrtcGetNumSupportedArchs(int* numArchs);
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
/**
|
| 113 |
+
* \ingroup query
|
| 114 |
+
* \brief nvrtcGetSupportedArchs populates the array passed via the output parameter
|
| 115 |
+
* \p supportedArchs with the architectures supported by NVRTC. The array is
|
| 116 |
+
* sorted in the ascending order. The size of the array to be passed can be
|
| 117 |
+
* determined using ::nvrtcGetNumSupportedArchs.
|
| 118 |
+
*
|
| 119 |
+
* \param [out] supportedArchs sorted array of supported architectures.
|
| 120 |
+
* \return
|
| 121 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 122 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 123 |
+
*
|
| 124 |
+
* see ::nvrtcGetNumSupportedArchs
|
| 125 |
+
*/
|
| 126 |
+
nvrtcResult nvrtcGetSupportedArchs(int* supportedArchs);
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
/*************************************************************************//**
|
| 130 |
+
*
|
| 131 |
+
* \defgroup compilation Compilation
|
| 132 |
+
*
|
| 133 |
+
* NVRTC defines the following type and functions for actual compilation.
|
| 134 |
+
*
|
| 135 |
+
****************************************************************************/
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
/**
|
| 139 |
+
* \ingroup compilation
|
| 140 |
+
* \brief nvrtcProgram is the unit of compilation, and an opaque handle for
|
| 141 |
+
* a program.
|
| 142 |
+
*
|
| 143 |
+
* To compile a CUDA program string, an instance of nvrtcProgram must be
|
| 144 |
+
* created first with ::nvrtcCreateProgram, then compiled with
|
| 145 |
+
* ::nvrtcCompileProgram.
|
| 146 |
+
*/
|
| 147 |
+
typedef struct _nvrtcProgram *nvrtcProgram;
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
/**
|
| 151 |
+
* \ingroup compilation
|
| 152 |
+
* \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the
|
| 153 |
+
* given input parameters, and sets the output parameter \p prog with
|
| 154 |
+
* it.
|
| 155 |
+
*
|
| 156 |
+
* \param [out] prog CUDA Runtime Compilation program.
|
| 157 |
+
* \param [in] src CUDA program source.
|
| 158 |
+
* \param [in] name CUDA program name.\n
|
| 159 |
+
* \p name can be \c NULL; \c "default_program" is
|
| 160 |
+
* used when \p name is \c NULL or "".
|
| 161 |
+
* \param [in] numHeaders Number of headers used.\n
|
| 162 |
+
* \p numHeaders must be greater than or equal to 0.
|
| 163 |
+
* \param [in] headers Sources of the headers.\n
|
| 164 |
+
* \p headers can be \c NULL when \p numHeaders is
|
| 165 |
+
* 0.
|
| 166 |
+
* \param [in] includeNames Name of each header by which they can be
|
| 167 |
+
* included in the CUDA program source.\n
|
| 168 |
+
* \p includeNames can be \c NULL when \p numHeaders
|
| 169 |
+
* is 0.
|
| 170 |
+
* \return
|
| 171 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 172 |
+
* - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
|
| 173 |
+
* - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
|
| 174 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 175 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 176 |
+
*
|
| 177 |
+
* \see ::nvrtcDestroyProgram
|
| 178 |
+
*/
|
| 179 |
+
nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
|
| 180 |
+
const char *src,
|
| 181 |
+
const char *name,
|
| 182 |
+
int numHeaders,
|
| 183 |
+
const char * const *headers,
|
| 184 |
+
const char * const *includeNames);
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
/**
|
| 188 |
+
* \ingroup compilation
|
| 189 |
+
* \brief nvrtcDestroyProgram destroys the given program.
|
| 190 |
+
*
|
| 191 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 192 |
+
* \return
|
| 193 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 194 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 195 |
+
*
|
| 196 |
+
* \see ::nvrtcCreateProgram
|
| 197 |
+
*/
|
| 198 |
+
nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
/**
|
| 202 |
+
* \ingroup compilation
|
| 203 |
+
* \brief nvrtcCompileProgram compiles the given program.
|
| 204 |
+
*
|
| 205 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 206 |
+
* \param [in] numOptions Number of compiler options passed.
|
| 207 |
+
* \param [in] options Compiler options in the form of C string array.\n
|
| 208 |
+
* \p options can be \c NULL when \p numOptions is 0.
|
| 209 |
+
*
|
| 210 |
+
* \return
|
| 211 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 212 |
+
* - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
|
| 213 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 214 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 215 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
|
| 216 |
+
* - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
|
| 217 |
+
* - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
|
| 218 |
+
*
|
| 219 |
+
* It supports compile options listed in \ref options.
|
| 220 |
+
*/
|
| 221 |
+
nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
|
| 222 |
+
int numOptions, const char * const *options);
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
/**
|
| 226 |
+
* \ingroup compilation
|
| 227 |
+
* \brief nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX
|
| 228 |
+
* generated by the previous compilation of \p prog (including the
|
| 229 |
+
* trailing \c NULL).
|
| 230 |
+
*
|
| 231 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 232 |
+
* \param [out] ptxSizeRet Size of the generated PTX (including the trailing
|
| 233 |
+
* \c NULL).
|
| 234 |
+
* \return
|
| 235 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 236 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 237 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 238 |
+
*
|
| 239 |
+
* \see ::nvrtcGetPTX
|
| 240 |
+
*/
|
| 241 |
+
nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
/**
|
| 245 |
+
* \ingroup compilation
|
| 246 |
+
* \brief nvrtcGetPTX stores the PTX generated by the previous compilation
|
| 247 |
+
* of \p prog in the memory pointed by \p ptx.
|
| 248 |
+
*
|
| 249 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 250 |
+
* \param [out] ptx Compiled result.
|
| 251 |
+
* \return
|
| 252 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 253 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 254 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 255 |
+
*
|
| 256 |
+
* \see ::nvrtcGetPTXSize
|
| 257 |
+
*/
|
| 258 |
+
nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
/**
|
| 262 |
+
* \ingroup compilation
|
| 263 |
+
* \brief nvrtcGetCUBINSize sets \p cubinSizeRet with the size of the cubin
|
| 264 |
+
* generated by the previous compilation of \p prog. The value of
|
| 265 |
+
* cubinSizeRet is set to 0 if the value specified to \c -arch is a
|
| 266 |
+
* virtual architecture instead of an actual architecture.
|
| 267 |
+
*
|
| 268 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 269 |
+
* \param [out] cubinSizeRet Size of the generated cubin.
|
| 270 |
+
* \return
|
| 271 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 272 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 273 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 274 |
+
*
|
| 275 |
+
* \see ::nvrtcGetCUBIN
|
| 276 |
+
*/
|
| 277 |
+
nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t *cubinSizeRet);
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
/**
|
| 281 |
+
* \ingroup compilation
|
| 282 |
+
* \brief nvrtcGetCUBIN stores the cubin generated by the previous compilation
|
| 283 |
+
* of \p prog in the memory pointed by \p cubin. No cubin is available
|
| 284 |
+
* if the value specified to \c -arch is a virtual architecture instead
|
| 285 |
+
* of an actual architecture.
|
| 286 |
+
*
|
| 287 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 288 |
+
* \param [out] cubin Compiled and assembled result.
|
| 289 |
+
* \return
|
| 290 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 291 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 292 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 293 |
+
*
|
| 294 |
+
* \see ::nvrtcGetCUBINSize
|
| 295 |
+
*/
|
| 296 |
+
nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
|
| 297 |
+
|
| 298 |
+
/**
|
| 299 |
+
* \ingroup compilation
|
| 300 |
+
* \brief nvrtcGetNVVMSize sets \p nvvmSizeRet with the size of the NVVM
|
| 301 |
+
* generated by the previous compilation of \p prog. The value of
|
| 302 |
+
* nvvmSizeRet is set to 0 if the program was not compiled with
|
| 303 |
+
* \c -dlto.
|
| 304 |
+
*
|
| 305 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 306 |
+
* \param [out] nvvmSizeRet Size of the generated NVVM.
|
| 307 |
+
* \return
|
| 308 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 309 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 310 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 311 |
+
*
|
| 312 |
+
* \see ::nvrtcGetNVVM
|
| 313 |
+
*/
|
| 314 |
+
nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t *nvvmSizeRet);
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
/**
|
| 318 |
+
* \ingroup compilation
|
| 319 |
+
* \brief nvrtcGetNVVM stores the NVVM generated by the previous compilation
|
| 320 |
+
* of \p prog in the memory pointed by \p nvvm.
|
| 321 |
+
* The program must have been compiled with -dlto,
|
| 322 |
+
* otherwise will return an error.
|
| 323 |
+
*
|
| 324 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 325 |
+
* \param [out] nvvm Compiled result.
|
| 326 |
+
* \return
|
| 327 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 328 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 329 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 330 |
+
*
|
| 331 |
+
* \see ::nvrtcGetNVVMSize
|
| 332 |
+
*/
|
| 333 |
+
nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char *nvvm);
|
| 334 |
+
|
| 335 |
+
/**
|
| 336 |
+
* \ingroup compilation
|
| 337 |
+
* \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
|
| 338 |
+
* log generated by the previous compilation of \p prog (including the
|
| 339 |
+
* trailing \c NULL).
|
| 340 |
+
*
|
| 341 |
+
* Note that compilation log may be generated with warnings and informative
|
| 342 |
+
* messages, even when the compilation of \p prog succeeds.
|
| 343 |
+
*
|
| 344 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 345 |
+
* \param [out] logSizeRet Size of the compilation log
|
| 346 |
+
* (including the trailing \c NULL).
|
| 347 |
+
* \return
|
| 348 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 349 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 350 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 351 |
+
*
|
| 352 |
+
* \see ::nvrtcGetProgramLog
|
| 353 |
+
*/
|
| 354 |
+
nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
/**
|
| 358 |
+
* \ingroup compilation
|
| 359 |
+
* \brief nvrtcGetProgramLog stores the log generated by the previous
|
| 360 |
+
* compilation of \p prog in the memory pointed by \p log.
|
| 361 |
+
*
|
| 362 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 363 |
+
* \param [out] log Compilation log.
|
| 364 |
+
* \return
|
| 365 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 366 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 367 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 368 |
+
*
|
| 369 |
+
* \see ::nvrtcGetProgramLogSize
|
| 370 |
+
*/
|
| 371 |
+
nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
/**
|
| 375 |
+
* \ingroup compilation
|
| 376 |
+
* \brief nvrtcAddNameExpression notes the given name expression
|
| 377 |
+
* denoting the address of a __global__ function
|
| 378 |
+
* or __device__/__constant__ variable.
|
| 379 |
+
*
|
| 380 |
+
* The identical name expression string must be provided on a subsequent
|
| 381 |
+
* call to nvrtcGetLoweredName to extract the lowered name.
|
| 382 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 383 |
+
* \param [in] name_expression constant expression denoting the address of
|
| 384 |
+
* a __global__ function or __device__/__constant__ variable.
|
| 385 |
+
* \return
|
| 386 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 387 |
+
* - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
|
| 388 |
+
*
|
| 389 |
+
* \see ::nvrtcGetLoweredName
|
| 390 |
+
*/
|
| 391 |
+
nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
|
| 392 |
+
const char * const name_expression);
|
| 393 |
+
|
| 394 |
+
/**
|
| 395 |
+
* \ingroup compilation
|
| 396 |
+
* \brief nvrtcGetLoweredName extracts the lowered (mangled) name
|
| 397 |
+
* for a __global__ function or __device__/__constant__ variable,
|
| 398 |
+
* and updates *lowered_name to point to it. The memory containing
|
| 399 |
+
* the name is released when the NVRTC program is destroyed by
|
| 400 |
+
* nvrtcDestroyProgram.
|
| 401 |
+
* The identical name expression must have been previously
|
| 402 |
+
* provided to nvrtcAddNameExpression.
|
| 403 |
+
*
|
| 404 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 405 |
+
* \param [in] name_expression constant expression denoting the address of
|
| 406 |
+
* a __global__ function or __device__/__constant__ variable.
|
| 407 |
+
* \param [out] lowered_name initialized by the function to point to a
|
| 408 |
+
* C string containing the lowered (mangled)
|
| 409 |
+
* name corresponding to the provided name expression.
|
| 410 |
+
* \return
|
| 411 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 412 |
+
* - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
|
| 413 |
+
* - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
|
| 414 |
+
*
|
| 415 |
+
* \see ::nvrtcAddNameExpression
|
| 416 |
+
*/
|
| 417 |
+
nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
|
| 418 |
+
const char *const name_expression,
|
| 419 |
+
const char** lowered_name);
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
/**
|
| 423 |
+
* \defgroup options Supported Compile Options
|
| 424 |
+
*
|
| 425 |
+
* NVRTC supports the compile options below.
|
| 426 |
+
* Option names with two preceding dashs (\c --) are long option names and
|
| 427 |
+
* option names with one preceding dash (\c -) are short option names.
|
| 428 |
+
* Short option names can be used instead of long option names.
|
| 429 |
+
* When a compile option takes an argument, an assignment operator (\c =)
|
| 430 |
+
* is used to separate the compile option argument from the compile option
|
| 431 |
+
* name, e.g., \c "--gpu-architecture=compute_60".
|
| 432 |
+
* Alternatively, the compile option name and the argument can be specified in
|
| 433 |
+
* separate strings without an assignment operator, .e.g,
|
| 434 |
+
* \c "--gpu-architecture" \c "compute_60".
|
| 435 |
+
* Single-character short option names, such as \c -D, \c -U, and \c -I, do
|
| 436 |
+
* not require an assignment operator, and the compile option name and the
|
| 437 |
+
* argument can be present in the same string with or without spaces between
|
| 438 |
+
* them.
|
| 439 |
+
* For instance, \c "-D=<def>", \c "-D<def>", and \c "-D <def>" are all
|
| 440 |
+
* supported.
|
| 441 |
+
*
|
| 442 |
+
* The valid compiler options are:
|
| 443 |
+
*
|
| 444 |
+
* - Compilation targets
|
| 445 |
+
* - \c --gpu-architecture=\<arch\> (\c -arch)\n
|
| 446 |
+
* Specify the name of the class of GPU architectures for which the
|
| 447 |
+
* input must be compiled.\n
|
| 448 |
+
* - Valid <c>\<arch\></c>s:
|
| 449 |
+
* - \c compute_35
|
| 450 |
+
* - \c compute_37
|
| 451 |
+
* - \c compute_50
|
| 452 |
+
* - \c compute_52
|
| 453 |
+
* - \c compute_53
|
| 454 |
+
* - \c compute_60
|
| 455 |
+
* - \c compute_61
|
| 456 |
+
* - \c compute_62
|
| 457 |
+
* - \c compute_70
|
| 458 |
+
* - \c compute_72
|
| 459 |
+
* - \c compute_75
|
| 460 |
+
* - \c compute_80
|
| 461 |
+
* - \c compute_87
|
| 462 |
+
* - \c compute_89
|
| 463 |
+
* - \c compute_90
|
| 464 |
+
* - \c sm_35
|
| 465 |
+
* - \c sm_37
|
| 466 |
+
* - \c sm_50
|
| 467 |
+
* - \c sm_52
|
| 468 |
+
* - \c sm_53
|
| 469 |
+
* - \c sm_60
|
| 470 |
+
* - \c sm_61
|
| 471 |
+
* - \c sm_62
|
| 472 |
+
* - \c sm_70
|
| 473 |
+
* - \c sm_72
|
| 474 |
+
* - \c sm_75
|
| 475 |
+
* - \c sm_80
|
| 476 |
+
* - \c sm_87
|
| 477 |
+
* - \c sm_89
|
| 478 |
+
* - \c sm_90
|
| 479 |
+
* - Default: \c compute_52
|
| 480 |
+
* - Separate compilation / whole-program compilation
|
| 481 |
+
* - \c --device-c (\c -dc)\n
|
| 482 |
+
* Generate relocatable code that can be linked with other relocatable
|
| 483 |
+
* device code. It is equivalent to --relocatable-device-code=true.
|
| 484 |
+
* - \c --device-w (\c -dw)\n
|
| 485 |
+
* Generate non-relocatable code. It is equivalent to
|
| 486 |
+
* \c --relocatable-device-code=false.
|
| 487 |
+
* - \c --relocatable-device-code={true|false} (\c -rdc)\n
|
| 488 |
+
* Enable (disable) the generation of relocatable device code.
|
| 489 |
+
* - Default: \c false
|
| 490 |
+
* - \c --extensible-whole-program (\c -ewp)\n
|
| 491 |
+
* Do extensible whole program compilation of device code.
|
| 492 |
+
* - Default: \c false
|
| 493 |
+
* - Debugging support
|
| 494 |
+
* - \c --device-debug (\c -G)\n
|
| 495 |
+
* Generate debug information. If --dopt is not specified,
|
| 496 |
+
* then turns off all optimizations.
|
| 497 |
+
* - \c --generate-line-info (\c -lineinfo)\n
|
| 498 |
+
* Generate line-number information.
|
| 499 |
+
* - Code generation
|
| 500 |
+
* - \c --dopt on (\c -dopt)\n
|
| 501 |
+
* - \c --dopt=on \n
|
| 502 |
+
* Enable device code optimization. When specified along with '-G', enables
|
| 503 |
+
* limited debug information generation for optimized device code (currently,
|
| 504 |
+
* only line number information).
|
| 505 |
+
* When '-G' is not specified, '-dopt=on' is implicit.
|
| 506 |
+
* - \c --ptxas-options \<options\> (\c -Xptxas)\n
|
| 507 |
+
* - \c --ptxas-options=\<options\> \n
|
| 508 |
+
* Specify options directly to ptxas, the PTX optimizing assembler.
|
| 509 |
+
* - \c --maxrregcount=\<N\> (\c -maxrregcount)\n
|
| 510 |
+
* Specify the maximum amount of registers that GPU functions can use.
|
| 511 |
+
* Until a function-specific limit, a higher value will generally
|
| 512 |
+
* increase the performance of individual GPU threads that execute this
|
| 513 |
+
* function. However, because thread registers are allocated from a
|
| 514 |
+
* global register pool on each GPU, a higher value of this option will
|
| 515 |
+
* also reduce the maximum thread block size, thereby reducing the amount
|
| 516 |
+
* of thread parallelism. Hence, a good maxrregcount value is the result
|
| 517 |
+
* of a trade-off. If this option is not specified, then no maximum is
|
| 518 |
+
* assumed. Value less than the minimum registers required by ABI will
|
| 519 |
+
* be bumped up by the compiler to ABI minimum limit.
|
| 520 |
+
* - \c --ftz={true|false} (\c -ftz)\n
|
| 521 |
+
* When performing single-precision floating-point operations, flush
|
| 522 |
+
* denormal values to zero or preserve denormal values.
|
| 523 |
+
* \c --use_fast_math implies \c --ftz=true.
|
| 524 |
+
* - Default: \c false
|
| 525 |
+
* - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n
|
| 526 |
+
* For single-precision floating-point square root, use IEEE
|
| 527 |
+
* round-to-nearest mode or use a faster approximation.
|
| 528 |
+
* \c --use_fast_math implies \c --prec-sqrt=false.
|
| 529 |
+
* - Default: \c true
|
| 530 |
+
* - \c --prec-div={true|false} (\c -prec-div)\n
|
| 531 |
+
* For single-precision floating-point division and reciprocals, use IEEE
|
| 532 |
+
* round-to-nearest mode or use a faster approximation.
|
| 533 |
+
* \c --use_fast_math implies \c --prec-div=false.
|
| 534 |
+
* - Default: \c true
|
| 535 |
+
* - \c --fmad={true|false} (\c -fmad)\n
|
| 536 |
+
* Enables (disables) the contraction of floating-point multiplies and
|
| 537 |
+
* adds/subtracts into floating-point multiply-add operations (FMAD,
|
| 538 |
+
* FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true.
|
| 539 |
+
* - Default: \c true
|
| 540 |
+
* - \c --use_fast_math (\c -use_fast_math)\n
|
| 541 |
+
* Make use of fast math operations.
|
| 542 |
+
* \c --use_fast_math implies \c --ftz=true \c --prec-div=false
|
| 543 |
+
* \c --prec-sqrt=false \c --fmad=true.
|
| 544 |
+
* - \c --extra-device-vectorization (\c -extra-device-vectorization)\n
|
| 545 |
+
* Enables more aggressive device code vectorization in the NVVM optimizer.
|
| 546 |
+
* - \c --modify-stack-limit={true|false} (\c -modify-stack-limit)\n
|
| 547 |
+
* On Linux, during compilation, use \c setrlimit() to increase stack size
|
| 548 |
+
* to maximum allowed. The limit is reset to the previous value at the
|
| 549 |
+
* end of compilation.
|
| 550 |
+
* Note: \c setrlimit() changes the value for the entire process.
|
| 551 |
+
* - Default: \c true
|
| 552 |
+
* - \c --dlink-time-opt (\c -dlto)\n
|
| 553 |
+
* Generate intermediate code for later link-time optimization.
|
| 554 |
+
* It implies \c -rdc=true.
|
| 555 |
+
* Note: when this is used the nvrtcGetNVVM API should be used,
|
| 556 |
+
* as PTX or Cubin will not be generated.
|
| 557 |
+
* - Preprocessing
|
| 558 |
+
* - \c --define-macro=\<def\> (\c -D)\n
|
| 559 |
+
* \c \<def\> can be either \c \<name\> or \c \<name=definitions\>.
|
| 560 |
+
* - \c \<name\> \n
|
| 561 |
+
* Predefine \c \<name\> as a macro with definition \c 1.
|
| 562 |
+
* - \c \<name\>=\<definition\> \n
|
| 563 |
+
* The contents of \c \<definition\> are tokenized and preprocessed
|
| 564 |
+
* as if they appeared during translation phase three in a \c \#define
|
| 565 |
+
* directive. In particular, the definition will be truncated by
|
| 566 |
+
* embedded new line characters.
|
| 567 |
+
* - \c --undefine-macro=\<def\> (\c -U)\n
|
| 568 |
+
* Cancel any previous definition of \c \<def\>.
|
| 569 |
+
* - \c --include-path=\<dir\> (\c -I)\n
|
| 570 |
+
* Add the directory \c \<dir\> to the list of directories to be
|
| 571 |
+
* searched for headers. These paths are searched after the list of
|
| 572 |
+
* headers given to ::nvrtcCreateProgram.
|
| 573 |
+
* - \c --pre-include=\<header\> (\c -include)\n
|
| 574 |
+
* Preinclude \c \<header\> during preprocessing.
|
| 575 |
+
* - \c --no-source-include (\c -no-source-include)
|
| 576 |
+
* The preprocessor by default adds the directory of each input sources
|
| 577 |
+
* to the include path. This option disables this feature and only
|
| 578 |
+
* considers the path specified explicitly.
|
| 579 |
+
* - Language Dialect
|
| 580 |
+
* - \c --std={c++03|c++11|c++14|c++17}
|
| 581 |
+
* (\c -std={c++11|c++14|c++17})\n
|
| 582 |
+
* Set language dialect to C++03, C++11, C++14 or C++17
|
| 583 |
+
* - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n
|
| 584 |
+
* Provide builtin definitions of \c std::move and \c std::forward,
|
| 585 |
+
* when C++11 language dialect is selected.
|
| 586 |
+
* - Default: \c true
|
| 587 |
+
* - \c --builtin-initializer-list={true|false}
|
| 588 |
+
* (\c -builtin-initializer-list)\n
|
| 589 |
+
* Provide builtin definitions of \c std::initializer_list class and
|
| 590 |
+
* member functions when C++11 language dialect is selected.
|
| 591 |
+
* - Default: \c true
|
| 592 |
+
* - Misc.
|
| 593 |
+
* - \c --disable-warnings (\c -w)\n
|
| 594 |
+
* Inhibit all warning messages.
|
| 595 |
+
* - \c --restrict (\c -restrict)\n
|
| 596 |
+
* Programmer assertion that all kernel pointer parameters are restrict
|
| 597 |
+
* pointers.
|
| 598 |
+
* - \c --device-as-default-execution-space
|
| 599 |
+
* (\c -default-device)\n
|
| 600 |
+
* Treat entities with no execution space annotation as \c __device__
|
| 601 |
+
* entities.
|
| 602 |
+
* - \c --device-int128 (\c -device-int128)\n
|
| 603 |
+
* Allow the \c __int128 type in device code. Also causes the macro \c __CUDACC_RTC_INT128__
|
| 604 |
+
* to be defined.
|
| 605 |
+
* - \c --optimization-info=\<kind\> (\c -opt-info)\n
|
| 606 |
+
* Provide optimization reports for the specified kind of optimization.
|
| 607 |
+
* The following kind tags are supported:
|
| 608 |
+
* - \c inline : emit a remark when a function is inlined.
|
| 609 |
+
* - \c --version-ident={true|false} (\c -dQ)\n
|
| 610 |
+
* Embed used compiler's version info into generated PTX/CUBIN
|
| 611 |
+
* - Default: \c false
|
| 612 |
+
* - \c --display-error-number (\c -err-no)\n
|
| 613 |
+
* Display diagnostic number for warning messages. (Default)
|
| 614 |
+
* - \c --no-display-error-number (\c -no-err-no)\n
|
| 615 |
+
* Disables the display of a diagnostic number for warning messages.
|
| 616 |
+
* - \c --diag-error=<error-number>,... (\c -diag-error)\n
|
| 617 |
+
* Emit error for specified diagnostic message number(s). Message numbers can be separated by comma.
|
| 618 |
+
* - \c --diag-suppress=<error-number>,... (\c -diag-suppress)\n
|
| 619 |
+
* Suppress specified diagnostic message number(s). Message numbers can be separated by comma.
|
| 620 |
+
* - \c --diag-warn=<error-number>,... (\c -diag-warn)\n
|
| 621 |
+
* Emit warning for specified diagnostic message number(s). Message numbers can be separated by comma.
|
| 622 |
+
*
|
| 623 |
+
*/
|
| 624 |
+
|
| 625 |
+
|
| 626 |
+
#ifdef __cplusplus
|
| 627 |
+
}
|
| 628 |
+
#endif /* __cplusplus */
|
| 629 |
+
|
| 630 |
+
|
| 631 |
+
/* The utility function 'nvrtcGetTypeName' is not available by default. Define
|
| 632 |
+
the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
|
| 633 |
+
*/
|
| 634 |
+
|
| 635 |
+
#if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
|
| 636 |
+
|
| 637 |
+
#if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
|
| 638 |
+
#include <cxxabi.h>
|
| 639 |
+
#include <cstdlib>
|
| 640 |
+
|
| 641 |
+
#elif defined(_WIN32)
|
| 642 |
+
#include <Windows.h>
|
| 643 |
+
#include <DbgHelp.h>
|
| 644 |
+
#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
#include <string>
|
| 648 |
+
#include <typeinfo>
|
| 649 |
+
|
| 650 |
+
template <typename T> struct __nvrtcGetTypeName_helper_t { };
|
| 651 |
+
|
| 652 |
+
/*************************************************************************//**
|
| 653 |
+
*
|
| 654 |
+
* \defgroup hosthelper Host Helper
|
| 655 |
+
*
|
| 656 |
+
* NVRTC defines the following functions for easier interaction with host code.
|
| 657 |
+
*
|
| 658 |
+
****************************************************************************/
|
| 659 |
+
|
| 660 |
+
/**
|
| 661 |
+
* \ingroup hosthelper
|
| 662 |
+
* \brief nvrtcGetTypeName stores the source level name of a type in the given
|
| 663 |
+
* std::string location.
|
| 664 |
+
*
|
| 665 |
+
* This function is only provided when the macro NVRTC_GET_TYPE_NAME is
|
| 666 |
+
* defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
|
| 667 |
+
* function calls to extract the type name, when using gcc/clang or cl.exe compilers,
|
| 668 |
+
* respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
|
| 669 |
+
* otherwise *result is initialized with the extracted name.
|
| 670 |
+
*
|
| 671 |
+
* Windows-specific notes:
|
| 672 |
+
* - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
|
| 673 |
+
* which is not multi-thread safe.
|
| 674 |
+
* - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
|
| 675 |
+
*
|
| 676 |
+
* \param [in] tinfo: reference to object of type std::type_info for a given type.
|
| 677 |
+
* \param [in] result: pointer to std::string in which to store the type name.
|
| 678 |
+
* \return
|
| 679 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 680 |
+
* - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
|
| 681 |
+
*
|
| 682 |
+
*/
|
| 683 |
+
inline nvrtcResult nvrtcGetTypeName(const std::type_info &tinfo, std::string *result)
|
| 684 |
+
{
|
| 685 |
+
#if USE_CXXABI || __clang__ || __GNUC__
|
| 686 |
+
const char *name = tinfo.name();
|
| 687 |
+
int status;
|
| 688 |
+
char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
|
| 689 |
+
if (status == 0) {
|
| 690 |
+
*result = undecorated_name;
|
| 691 |
+
free(undecorated_name);
|
| 692 |
+
return NVRTC_SUCCESS;
|
| 693 |
+
}
|
| 694 |
+
#elif defined(_WIN32)
|
| 695 |
+
const char *name = tinfo.raw_name();
|
| 696 |
+
if (!name || *name != '.') {
|
| 697 |
+
return NVRTC_ERROR_INTERNAL_ERROR;
|
| 698 |
+
}
|
| 699 |
+
char undecorated_name[4096];
|
| 700 |
+
//name+1 skips over the '.' prefix
|
| 701 |
+
if(UnDecorateSymbolName(name+1, undecorated_name,
|
| 702 |
+
sizeof(undecorated_name) / sizeof(*undecorated_name),
|
| 703 |
+
//note: doesn't seem to work correctly without UNDNAME_NO_ARGUMENTS.
|
| 704 |
+
UNDNAME_NO_ARGUMENTS | UNDNAME_NAME_ONLY ) ) {
|
| 705 |
+
*result = undecorated_name;
|
| 706 |
+
return NVRTC_SUCCESS;
|
| 707 |
+
}
|
| 708 |
+
#endif /* USE_CXXABI || __clang__ || __GNUC__ */
|
| 709 |
+
|
| 710 |
+
return NVRTC_ERROR_INTERNAL_ERROR;
|
| 711 |
+
}
|
| 712 |
+
|
| 713 |
+
/**
|
| 714 |
+
* \ingroup hosthelper
|
| 715 |
+
* \brief nvrtcGetTypeName stores the source level name of the template type argument
|
| 716 |
+
* T in the given std::string location.
|
| 717 |
+
*
|
| 718 |
+
* This function is only provided when the macro NVRTC_GET_TYPE_NAME is
|
| 719 |
+
* defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
|
| 720 |
+
* function calls to extract the type name, when using gcc/clang or cl.exe compilers,
|
| 721 |
+
* respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
|
| 722 |
+
* otherwise *result is initialized with the extracted name.
|
| 723 |
+
*
|
| 724 |
+
* Windows-specific notes:
|
| 725 |
+
* - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
|
| 726 |
+
* which is not multi-thread safe.
|
| 727 |
+
* - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
|
| 728 |
+
*
|
| 729 |
+
* \param [in] result: pointer to std::string in which to store the type name.
|
| 730 |
+
* \return
|
| 731 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 732 |
+
* - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
|
| 733 |
+
*
|
| 734 |
+
*/
|
| 735 |
+
|
| 736 |
+
template <typename T>
|
| 737 |
+
nvrtcResult nvrtcGetTypeName(std::string *result)
|
| 738 |
+
{
|
| 739 |
+
nvrtcResult res = nvrtcGetTypeName(typeid(__nvrtcGetTypeName_helper_t<T>),
|
| 740 |
+
result);
|
| 741 |
+
if (res != NVRTC_SUCCESS)
|
| 742 |
+
return res;
|
| 743 |
+
|
| 744 |
+
std::string repr = *result;
|
| 745 |
+
std::size_t idx = repr.find("__nvrtcGetTypeName_helper_t");
|
| 746 |
+
idx = (idx != std::string::npos) ? repr.find("<", idx) : idx;
|
| 747 |
+
std::size_t last_idx = repr.find_last_of('>');
|
| 748 |
+
if (idx == std::string::npos || last_idx == std::string::npos) {
|
| 749 |
+
return NVRTC_ERROR_INTERNAL_ERROR;
|
| 750 |
+
}
|
| 751 |
+
++idx;
|
| 752 |
+
*result = repr.substr(idx, last_idx - idx);
|
| 753 |
+
return NVRTC_SUCCESS;
|
| 754 |
+
}
|
| 755 |
+
|
| 756 |
+
#endif /* NVRTC_GET_TYPE_NAME */
|
| 757 |
+
|
| 758 |
+
#endif /* __NVRTC_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (222 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(CU_COMPLEX_H_)
|
| 51 |
+
#define CU_COMPLEX_H_
|
| 52 |
+
|
| 53 |
+
#if !defined(__CUDACC_RTC__)
|
| 54 |
+
#if defined(__GNUC__)
|
| 55 |
+
#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
|
| 56 |
+
#pragma GCC diagnostic ignored "-Wunused-function"
|
| 57 |
+
#endif
|
| 58 |
+
#endif
|
| 59 |
+
#endif
|
| 60 |
+
|
| 61 |
+
/* When trying to include C header file in C++ Code extern "C" is required
|
| 62 |
+
* But the Standard QNX headers already have ifdef extern in them when compiling C++ Code
|
| 63 |
+
* extern "C" cannot be nested
|
| 64 |
+
* Hence keep the header out of extern "C" block
|
| 65 |
+
*/
|
| 66 |
+
|
| 67 |
+
#if !defined(__CUDACC__)
|
| 68 |
+
#include <math.h> /* import fabsf, sqrt */
|
| 69 |
+
#endif /* !defined(__CUDACC__) */
|
| 70 |
+
|
| 71 |
+
#if defined(__cplusplus)
|
| 72 |
+
extern "C" {
|
| 73 |
+
#endif /* __cplusplus */
|
| 74 |
+
|
| 75 |
+
#include "vector_types.h"
|
| 76 |
+
|
| 77 |
+
typedef float2 cuFloatComplex;
|
| 78 |
+
|
| 79 |
+
/* Accessors and constructor for the single-precision complex type.
 * cuFloatComplex is a float2: .x holds the real part, .y the imaginary part.
 */

/* Real part of x. */
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
{
    return x.x;
}

/* Imaginary part of x. */
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
{
    return x.y;
}

/* Build a complex number from real part r and imaginary part i. */
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
                                                             (float r, float i)
{
    cuFloatComplex c;
    c.x = r;
    c.y = i;
    return c;
}

/* Complex conjugate: negate the imaginary part, keep the real part. */
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
{
    return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
}
|
| 102 |
+
/* Componentwise complex addition: x + y. */
__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),
                                cuCimagf(x) + cuCimagf(y));
}

/* Componentwise complex subtraction: x - y. */
__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),
                                cuCimagf(x) - cuCimagf(y));
}

/* Complex multiplication via the textbook formula
 *   (a + bi)(c + di) = (ac - bd) + (ad + bc)i.
 * This implementation could suffer from intermediate overflow even though
 * the final result would be in range. Various implementations do not guard
 * against this (presumably to avoid losing performance), so we don't
 * either, to stay competitive.
 */
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    return make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) -
                                (cuCimagf(x) * cuCimagf(y)),
                                (cuCrealf(x) * cuCimagf(y)) +
                                (cuCimagf(x) * cuCrealf(y)));
}
|
| 131 |
+
|
| 132 |
+
/* Complex division x / y, guarded against intermediate underflow and
 * overflow: both operands are first scaled by 1 / (|Re(y)| + |Im(y)|),
 * then the usual (x * conj(y)) / |y|^2 quotient is formed. Guarded
 * implementations like this are usually the default in complex libraries,
 * with some also offering an unguarded, faster version.
 */
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    float scale = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
    float inv   = 1.0f / scale;
    float xr = cuCrealf(x) * inv;
    float xi = cuCimagf(x) * inv;
    float yr = cuCrealf(y) * inv;
    float yi = cuCimagf(y) * inv;
    float denom = (yr * yr) + (yi * yi);
    inv = 1.0f / denom;
    return make_cuFloatComplex (((xr * yr) + (xi * yi)) * inv,
                                ((xi * yr) - (xr * yi)) * inv);
}
|
| 153 |
+
|
| 154 |
+
/*
 * Complex magnitude |x|. We would like to call hypotf(), but it is not
 * available on all platforms. This discrete implementation guards against
 * intermediate underflow and overflow by scaling by the larger component;
 * otherwise half the exponent range would be lost. This is the simplest
 * and fastest of the guarded formulations, but may suffer inaccuracies if
 * sqrt and division are not IEEE compliant.
 */
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
    float hi = fabsf(cuCrealf(x));
    float lo = fabsf(cuCimagf(x));
    float t;
    if (hi < lo) {              /* ensure hi >= lo */
        t  = hi;
        hi = lo;
        lo = t;
    }
    t = lo / hi;                /* NaN when hi == 0; repaired below */
    t = hi * sqrtf(1.0f + t * t);
    /* Zero input and (near-)infinite components fall back to hi + lo,
     * which yields 0, the correct infinity, or NaN propagation. */
    if ((hi == 0.0f) || (hi > 3.402823466e38f) || (lo > 3.402823466e38f)) {
        t = hi + lo;
    }
    return t;
}
|
| 184 |
+
|
| 185 |
+
/* Double precision complex type and its accessors/constructor.
 * cuDoubleComplex is a double2: .x is the real part, .y the imaginary part.
 */
typedef double2 cuDoubleComplex;

/* Real part of x. */
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
{
    return x.x;
}

/* Imaginary part of x. */
__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
{
    return x.y;
}

/* Build a double-precision complex from real part r and imaginary part i. */
__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
                                                           (double r, double i)
{
    cuDoubleComplex c;
    c.x = r;
    c.y = i;
    return c;
}

/* Complex conjugate: negate the imaginary part, keep the real part. */
__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
{
    return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
}
|
| 211 |
+
|
| 212 |
+
/* Componentwise complex addition: x + y. */
__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),
                                 cuCimag(x) + cuCimag(y));
}

/* Componentwise complex subtraction: x - y. */
__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),
                                 cuCimag(x) - cuCimag(y));
}

/* Complex multiplication via the textbook formula
 *   (a + bi)(c + di) = (ac - bd) + (ad + bc)i.
 * This implementation could suffer from intermediate overflow even though
 * the final result would be in range. Various implementations do not guard
 * against this (presumably to avoid losing performance), so we don't
 * either, to stay competitive.
 */
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    return make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -
                                 (cuCimag(x) * cuCimag(y)),
                                 (cuCreal(x) * cuCimag(y)) +
                                 (cuCimag(x) * cuCreal(y)));
}
|
| 241 |
+
|
| 242 |
+
/* Complex division x / y, guarded against intermediate underflow and
 * overflow: both operands are first scaled by 1 / (|Re(y)| + |Im(y)|),
 * then the usual (x * conj(y)) / |y|^2 quotient is formed. Guarded
 * implementations like this are usually the default in complex libraries,
 * with some also offering an unguarded, faster version.
 */
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    double scale = fabs(cuCreal(y)) + fabs(cuCimag(y));
    double inv   = 1.0 / scale;
    double xr = cuCreal(x) * inv;
    double xi = cuCimag(x) * inv;
    double yr = cuCreal(y) * inv;
    double yi = cuCimag(y) * inv;
    double denom = (yr * yr) + (yi * yi);
    inv = 1.0 / denom;
    return make_cuDoubleComplex (((xr * yr) + (xi * yi)) * inv,
                                 ((xi * yr) - (xr * yi)) * inv);
}
|
| 263 |
+
|
| 264 |
+
/* Complex magnitude |x|. This implementation guards against intermediate
 * underflow and overflow by scaling by the larger component; otherwise
 * half the exponent range would be lost. This is the simplest and fastest
 * of the guarded formulations, but may suffer inaccuracies if sqrt and
 * division are not IEEE compliant.
 */
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
{
    double hi = fabs(cuCreal(x));
    double lo = fabs(cuCimag(x));
    double t;
    if (hi < lo) {              /* ensure hi >= lo */
        t  = hi;
        hi = lo;
        lo = t;
    }
    t = lo / hi;                /* NaN when hi == 0; repaired below */
    t = hi * sqrt(1.0 + t * t);
    /* Zero input and (near-)infinite components fall back to hi + lo,
     * which yields 0, the correct infinity, or NaN propagation. */
    if ((hi == 0.0) ||
        (hi > 1.79769313486231570e+308) || (lo > 1.79769313486231570e+308)) {
        t = hi + lo;
    }
    return t;
}
|
| 293 |
+
|
| 294 |
+
#if defined(__cplusplus)
|
| 295 |
+
}
|
| 296 |
+
#endif /* __cplusplus */
|
| 297 |
+
|
| 298 |
+
/* aliases */

/* cuComplex is the single-precision complex type. */
typedef cuFloatComplex cuComplex;

/* Build a cuComplex from real part x and imaginary part y. */
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
                                                                float y)
{
    return make_cuFloatComplex (x, y);
}

/* Widen a single-precision complex to double precision (exact). */
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
                                                      (cuFloatComplex c)
{
    return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
}

/* Narrow a double-precision complex to single precision
 * (each component is rounded to float). */
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
                                                     (cuDoubleComplex c)
{
    return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
}
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
/* Complex fused multiply-add: returns x * y + d.
 * The partial products are accumulated pairwise so that each step can be
 * mapped onto a hardware fused multiply-add.
 */
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
{
    float rr = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
    float ri = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);

    rr = -(cuCimagf(x) * cuCimagf(y)) + rr;
    ri =  (cuCimagf(x) * cuCrealf(y)) + ri;

    return make_cuComplex(rr, ri);
}
|
| 333 |
+
|
| 334 |
+
/* Complex fused multiply-add (double precision): returns x * y + d.
 * The partial products are accumulated pairwise so that each step can be
 * mapped onto a hardware fused multiply-add.
 */
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
{
    double rr = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
    double ri = (cuCreal(x) * cuCimag(y)) + cuCimag(d);

    rr = -(cuCimag(x) * cuCimag(y)) + rr;
    ri =  (cuCimag(x) * cuCreal(y)) + ri;

    return make_cuDoubleComplex(rr, ri);
}
|
| 347 |
+
|
| 348 |
+
#endif /* !defined(CU_COMPLEX_H_) */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDA_AWBARRIER_H_
|
| 51 |
+
# define _CUDA_AWBARRIER_H_
|
| 52 |
+
|
| 53 |
+
# include "cuda_awbarrier_primitives.h"
|
| 54 |
+
|
| 55 |
+
# if !defined(_CUDA_AWBARRIER_SM_TARGET)
|
| 56 |
+
# error This file requires compute capability 7.0 or greater.
|
| 57 |
+
# endif
|
| 58 |
+
|
| 59 |
+
# if !defined(_CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER)
|
| 60 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 61 |
+
-std=c++11 compiler option.
|
| 62 |
+
# endif
|
| 63 |
+
|
| 64 |
+
_CUDA_AWBARRIER_BEGIN_NAMESPACE
|
| 65 |
+
|
| 66 |
+
/* Arrive/wait barrier backed by a single 64-bit word.
 * Non-copyable. It is set up via the friend function init() and torn down
 * via the friend function inval() (both declared below).
 */
class awbarrier {
public:
    /* Opaque value returned by arrive()/arrive_and_drop(); passed back to
     * the wait calls to identify the barrier phase the caller arrived on. */
    class arrival_token {
    public:
        arrival_token() = default;
        ~arrival_token() = default;
        /* Number of arrivals that were still outstanding when this token
         * was produced. */
        _CUDA_AWBARRIER_QUALIFIER uint32_t pending_count() const;
    private:
        /* Wrap a raw 64-bit token value; only awbarrier may construct. */
        _CUDA_AWBARRIER_QUALIFIER arrival_token(uint64_t token);
        uint64_t token;  /* raw token value */
        friend awbarrier;
    };
    awbarrier() = default;
    awbarrier(const awbarrier&) = delete;
    awbarrier& operator=(const awbarrier&) = delete;
    ~awbarrier() = default;

    _CUDA_AWBARRIER_QUALIFIER arrival_token arrive();
    _CUDA_AWBARRIER_QUALIFIER arrival_token arrive_and_drop();
    _CUDA_AWBARRIER_QUALIFIER bool timed_wait(arrival_token token, uint32_t hint_cycles);
    _CUDA_AWBARRIER_QUALIFIER void wait(arrival_token token);
    _CUDA_AWBARRIER_QUALIFIER void arrive_and_wait();
    /* Largest expected_count accepted by init(). */
    _CUDA_AWBARRIER_STATIC_QUALIFIER __host__ constexpr uint32_t max();
private:
    uint64_t barrier;  /* underlying 64-bit barrier word */
    friend _CUDA_AWBARRIER_QUALIFIER void init(awbarrier* barrier, uint32_t expected_count);
    friend _CUDA_AWBARRIER_QUALIFIER void inval(awbarrier* barrier);
    friend class pipeline;
};
|
| 95 |
+
|
| 96 |
+
/* Number of arrivals still outstanding when this token was produced. */
_CUDA_AWBARRIER_QUALIFIER
uint32_t awbarrier::arrival_token::pending_count() const
{
    const uint32_t raw =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(this->token);
#if (__CUDA_ARCH__ >= 900)
    return raw;
#else
    /* Pre-SM90 tokens carry the per-thread count shifted left by 15. */
    return raw >> 15;
#endif
}

/* Wrap a raw 64-bit token value. */
_CUDA_AWBARRIER_QUALIFIER
awbarrier::arrival_token::arrival_token(uint64_t token)
    : token(token)
{
}
|
| 112 |
+
|
| 113 |
+
/* Initialize *barrier to expect expected_count arrivals per phase.
 * barrier must reside in shared memory (asserted below);
 * expected_count must be in [1, _CUDA_AWBARRIER_MAX_COUNT]. */
_CUDA_AWBARRIER_QUALIFIER
void init(awbarrier* barrier, uint32_t expected_count)
{
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
    _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT);

#if (__CUDA_ARCH__ >= 900)
    const uint32_t encoded = expected_count;
#else
    /* Pre-SM90 encoding keeps the count duplicated in two bit fields. */
    const uint32_t encoded = (expected_count << 15) + expected_count;
#endif

    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(&barrier->barrier, encoded);
}

/* Invalidate *barrier; it must not be used again until re-initialized.
 * barrier must reside in shared memory (asserted below). */
_CUDA_AWBARRIER_QUALIFIER
void inval(awbarrier* barrier)
{
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));

    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(&barrier->barrier);
}
|
| 135 |
+
|
| 136 |
+
/* Signal arrival at the barrier for the current phase and return a token
 * that the wait calls use to detect completion of that phase. */
_CUDA_AWBARRIER_QUALIFIER
awbarrier::arrival_token awbarrier::arrive()
{
    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

#if (__CUDA_ARCH__ < 900)
    /* Pre-SM90: take the token from a non-completing arrive of the
     * shifted per-thread count, then perform the real arrive. */
    const uint32_t arrive_count = 1 << 15;
    const uint64_t token =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<false>(&this->barrier, arrive_count);
    (void)_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
#else
    const uint64_t token =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
#endif

    return arrival_token(token);
}
|
| 152 |
+
|
| 153 |
+
/* Signal arrival and permanently drop this thread from the barrier's
 * expected count for subsequent phases; returns a token for this phase. */
_CUDA_AWBARRIER_QUALIFIER
awbarrier::arrival_token awbarrier::arrive_and_drop()
{
    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

#if (__CUDA_ARCH__ < 900)
    /* Pre-SM90: take the token from a non-completing arrive of the
     * shifted per-thread count, then perform the real arrive-and-drop. */
    const uint32_t arrive_count = 1 << 15;
    const uint64_t token =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<true>(&this->barrier, arrive_count);
    (void)_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
#else
    const uint64_t token =
        _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
#endif

    return arrival_token(token);
}
|
| 169 |
+
|
| 170 |
+
/* Wait for the phase identified by token, giving up after roughly
 * hint_cycles clock cycles. Returns true if the phase completed, false on
 * timeout. Busy-polls briefly, then backs off with exponentially growing
 * __nanosleep() calls to reduce contention. */
_CUDA_AWBARRIER_QUALIFIER
bool awbarrier::timed_wait(arrival_token token, uint32_t hint_cycles)
{
    constexpr uint64_t max_busy_wait_cycles = 1024;  /* spin before sleeping */
    constexpr uint32_t max_sleep_ns = 1 << 20;       /* backoff cap: ~1 ms */

    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

    /* Fast path: phase may already be complete. */
    if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
        return true;
    }

    const uint64_t start = clock64();
    uint64_t elapsed = 0;
    uint32_t backoff_ns = 32;
    while (elapsed < hint_cycles) {
        if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
            return true;
        }

        /* After the busy-wait budget is spent, sleep with doubling backoff. */
        if (elapsed > max_busy_wait_cycles) {
            __nanosleep(backoff_ns);
            if (backoff_ns < max_sleep_ns) {
                backoff_ns *= 2;
            }
        }

        elapsed = clock64() - start;
    }

    return false;
}
|
| 202 |
+
|
| 203 |
+
/* Block until the phase identified by token completes. */
_CUDA_AWBARRIER_QUALIFIER
void awbarrier::wait(arrival_token token)
{
    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

    /* Retry with the maximum cycle hint until the phase completes. */
    while (!timed_wait(token, ~0u)) {
    }
}

/* Arrive at the barrier and block until the current phase completes. */
_CUDA_AWBARRIER_QUALIFIER
void awbarrier::arrive_and_wait()
{
    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));

    this->wait(this->arrive());
}

/* Largest expected_count accepted by init(). */
_CUDA_AWBARRIER_QUALIFIER __host__
constexpr uint32_t awbarrier::max()
{
    return _CUDA_AWBARRIER_MAX_COUNT;
}
|
| 224 |
+
|
| 225 |
+
_CUDA_AWBARRIER_END_NAMESPACE
|
| 226 |
+
|
| 227 |
+
#endif /* !_CUDA_AWBARRIER_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDA_AWBARRIER_HELPERS_H_
|
| 51 |
+
#define _CUDA_AWBARRIER_HELPERS_H_
|
| 52 |
+
|
| 53 |
+
#define _CUDA_AWBARRIER_NAMESPACE nvcuda::experimental
|
| 54 |
+
#define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
|
| 55 |
+
#define _CUDA_AWBARRIER_END_NAMESPACE } }
|
| 56 |
+
|
| 57 |
+
#define _CUDA_AWBARRIER_INTERNAL_NAMESPACE _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal
|
| 58 |
+
#define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal {
|
| 59 |
+
#define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE } _CUDA_AWBARRIER_END_NAMESPACE
|
| 60 |
+
|
| 61 |
+
# if !defined(_CUDA_AWBARRIER_QUALIFIER)
|
| 62 |
+
# define _CUDA_AWBARRIER_QUALIFIER inline __device__
|
| 63 |
+
# endif
|
| 64 |
+
# if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER)
|
| 65 |
+
# define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__
|
| 66 |
+
#endif
|
| 67 |
+
|
| 68 |
+
#if defined(__CUDA_ARCH__)
|
| 69 |
+
#if (__CUDA_ARCH__ >= 800)
|
| 70 |
+
# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_80
|
| 71 |
+
#elif (__CUDA_ARCH__ >= 700)
|
| 72 |
+
# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
|
| 73 |
+
#endif // No support < 700
|
| 74 |
+
#else // !defined(__CUDA_ARCH__)
|
| 75 |
+
# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
|
| 76 |
+
#endif // defined(__CUDA_ARCH__)
|
| 77 |
+
|
| 78 |
+
#define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1)
|
| 79 |
+
|
| 80 |
+
#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
|
| 81 |
+
# define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER
|
| 82 |
+
#endif
|
| 83 |
+
|
| 84 |
+
#if !defined(_CUDA_AWBARRIER_DEBUG)
|
| 85 |
+
# if defined(__CUDACC_DEBUG__)
|
| 86 |
+
# define _CUDA_AWBARRIER_DEBUG 1
|
| 87 |
+
# else
|
| 88 |
+
# define _CUDA_AWBARRIER_DEBUG 0
|
| 89 |
+
# endif
|
| 90 |
+
#endif
|
| 91 |
+
|
| 92 |
+
#if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG)
|
| 93 |
+
# if !defined(__CUDACC_RTC__)
|
| 94 |
+
# include <cassert>
|
| 95 |
+
# endif
|
| 96 |
+
# define _CUDA_AWBARRIER_ASSERT(x) assert((x));
|
| 97 |
+
# define _CUDA_AWBARRIER_ABORT() assert(0);
|
| 98 |
+
#else
|
| 99 |
+
# define _CUDA_AWBARRIER_ASSERT(x)
|
| 100 |
+
# define _CUDA_AWBARRIER_ABORT() __trap();
|
| 101 |
+
#endif
|
| 102 |
+
|
| 103 |
+
#if defined(__CUDACC_RTC__)
|
| 104 |
+
typedef unsigned short uint16_t;
|
| 105 |
+
typedef unsigned int uint32_t;
|
| 106 |
+
typedef unsigned long long uint64_t;
|
| 107 |
+
typedef uint64_t uintptr_t;
|
| 108 |
+
#else
|
| 109 |
+
# include <stdint.h>
|
| 110 |
+
#endif
|
| 111 |
+
|
| 112 |
+
#if defined(_CUDA_AWBARRIER_SM_TARGET)
|
| 113 |
+
|
| 114 |
+
typedef uint64_t __mbarrier_t;
|
| 115 |
+
typedef uint64_t __mbarrier_token_t;
|
| 116 |
+
|
| 117 |
+
_CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE
|
| 118 |
+
|
| 119 |
+
extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
|
| 120 |
+
|
| 121 |
+
namespace _CUDA_AWBARRIER_SM_70 {
|
| 122 |
+
/* Software barrier state for SM 7.x, viewed either as two 32-bit halves
 * or as the raw 64-bit word stored in the user-visible barrier. */
union AWBarrier {
    struct {
        uint32_t expected;  /* encoded as 0x40000000 - expected_count */
        uint32_t pending;   /* encoded as 0x80000000 - pending arrivals */
    } split;
    uint64_t raw;
};

/* Initialize a shared-memory barrier word for expected_count arrivals.
 * expected_count must be in (0, 2^29). */
_CUDA_AWBARRIER_STATIC_QUALIFIER
void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
    _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));

    AWBarrier* state = reinterpret_cast<AWBarrier*>(barrier);

    state->split.expected = 0x40000000 - expected_count;
    state->split.pending = 0x80000000 - expected_count;
}

/* Invalidate a shared-memory barrier word. On SM 7.x this only checks the
 * address; there is no hardware state to tear down. */
_CUDA_AWBARRIER_STATIC_QUALIFIER
void __awbarrier_inval(uint64_t* barrier) {
    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
}

/* Decode the pending-arrival count carried in the upper half of a token. */
_CUDA_AWBARRIER_STATIC_QUALIFIER
uint32_t __awbarrier_token_pending_count(uint64_t token) {
    const uint32_t encoded = token >> 32;
    return 0x80000000 - (encoded & 0x7fffffff);
}
|
| 151 |
+
|
| 152 |
+
template<bool _Drop>
|
| 153 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 154 |
+
uint64_t __awbarrier_arrive_drop(uint64_t* barrier) {
|
| 155 |
+
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
|
| 156 |
+
|
| 157 |
+
AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
|
| 158 |
+
|
| 159 |
+
while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
|
| 160 |
+
|
| 161 |
+
if (_Drop) {
|
| 162 |
+
(void)atomicAdd_block(&awbarrier->split.expected, 1);
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
__threadfence_block();
|
| 166 |
+
|
| 167 |
+
const uint32_t old_pending = atomicAdd_block(&awbarrier->split.pending, 1);
|
| 168 |
+
const uint32_t new_pending = old_pending + 1;
|
| 169 |
+
const bool reset = (old_pending ^ new_pending) & 0x80000000;
|
| 170 |
+
|
| 171 |
+
if (reset) {
|
| 172 |
+
__threadfence_block();
|
| 173 |
+
|
| 174 |
+
uint32_t new_expected = *reinterpret_cast<volatile uint32_t*>(&awbarrier->split.expected);
|
| 175 |
+
new_expected &= ~0x40000000;
|
| 176 |
+
if (new_expected & 0x20000000) {
|
| 177 |
+
new_expected |= 0x40000000;
|
| 178 |
+
}
|
| 179 |
+
atomicAdd_block(&awbarrier->split.pending, new_expected);
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
return static_cast<uint64_t>(old_pending) << 32;
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
template<bool _Drop>
|
| 186 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 187 |
+
uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
|
| 188 |
+
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
|
| 189 |
+
_CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
|
| 190 |
+
|
| 191 |
+
AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
|
| 192 |
+
|
| 193 |
+
while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
|
| 194 |
+
|
| 195 |
+
if (_Drop) {
|
| 196 |
+
(void)atomicAdd_block(&awbarrier->split.expected, count);
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
return static_cast<uint64_t>(atomicAdd_block(&awbarrier->split.pending, count)) << 32;
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 203 |
+
bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
|
| 204 |
+
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
|
| 205 |
+
|
| 206 |
+
volatile AWBarrier* awbarrier = reinterpret_cast<volatile AWBarrier*>(barrier);
|
| 207 |
+
|
| 208 |
+
return ((token >> 32) ^ awbarrier->split.pending) & 0x80000000;
|
| 209 |
+
}
|
| 210 |
+
}; // namespace _CUDA_AWBARRIER_SM_70
|
| 211 |
+
|
| 212 |
+
namespace _CUDA_AWBARRIER_SM_80 {
|
| 213 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 214 |
+
void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
|
| 215 |
+
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
|
| 216 |
+
_CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
|
| 217 |
+
|
| 218 |
+
asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
|
| 219 |
+
:
|
| 220 |
+
: "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count)
|
| 221 |
+
: "memory");
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 225 |
+
void __awbarrier_inval(uint64_t* barrier) {
|
| 226 |
+
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
|
| 227 |
+
|
| 228 |
+
asm volatile ("mbarrier.inval.shared.b64 [%0];"
|
| 229 |
+
:
|
| 230 |
+
: "r"(__nvvm_get_smem_pointer(barrier))
|
| 231 |
+
: "memory");
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 235 |
+
uint32_t __awbarrier_token_pending_count(uint64_t token) {
|
| 236 |
+
uint32_t __pending_count;
|
| 237 |
+
|
| 238 |
+
asm ("mbarrier.pending_count.b64 %0, %1;"
|
| 239 |
+
: "=r"(__pending_count)
|
| 240 |
+
: "l"(token));
|
| 241 |
+
return __pending_count;
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
template<bool _Drop>
|
| 245 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 246 |
+
uint64_t __awbarrier_arrive_drop(uint64_t* barrier) {
|
| 247 |
+
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
|
| 248 |
+
|
| 249 |
+
uint64_t token;
|
| 250 |
+
|
| 251 |
+
if (_Drop) {
|
| 252 |
+
asm volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];"
|
| 253 |
+
: "=l"(token)
|
| 254 |
+
: "r"(__nvvm_get_smem_pointer(barrier))
|
| 255 |
+
: "memory");
|
| 256 |
+
} else {
|
| 257 |
+
asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
|
| 258 |
+
: "=l"(token)
|
| 259 |
+
: "r"(__nvvm_get_smem_pointer(barrier))
|
| 260 |
+
: "memory");
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
return token;
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
template<bool _Drop>
|
| 267 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 268 |
+
uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
|
| 269 |
+
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
|
| 270 |
+
_CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
|
| 271 |
+
|
| 272 |
+
uint64_t token;
|
| 273 |
+
|
| 274 |
+
if (_Drop) {
|
| 275 |
+
asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;"
|
| 276 |
+
: "=l"(token)
|
| 277 |
+
: "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
|
| 278 |
+
: "memory");
|
| 279 |
+
} else {
|
| 280 |
+
asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
|
| 281 |
+
: "=l"(token)
|
| 282 |
+
: "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
|
| 283 |
+
: "memory");
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
return token;
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 290 |
+
bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
|
| 291 |
+
_CUDA_AWBARRIER_ASSERT(__isShared(barrier));
|
| 292 |
+
|
| 293 |
+
uint16_t __wait_complete;
|
| 294 |
+
|
| 295 |
+
asm volatile ("{"
|
| 296 |
+
" .reg .pred %%p;"
|
| 297 |
+
" mbarrier.test_wait.shared.b64 %%p, [%1], %2;"
|
| 298 |
+
" selp.u16 %0, 1, 0, %%p;"
|
| 299 |
+
"}"
|
| 300 |
+
: "=h"(__wait_complete)
|
| 301 |
+
: "r"(__nvvm_get_smem_pointer(barrier)), "l"(token)
|
| 302 |
+
: "memory");
|
| 303 |
+
return bool(__wait_complete);
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
}; // namespace _CUDA_AWBARRIER_SM_80
|
| 307 |
+
|
| 308 |
+
_CUDA_AWBARRIER_QUALIFIER
|
| 309 |
+
void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
|
| 310 |
+
{
|
| 311 |
+
_CUDA_AWBARRIER_SM_TARGET::__awbarrier_init(barrier, expected_count);
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
_CUDA_AWBARRIER_QUALIFIER
|
| 315 |
+
void awbarrier_inval(uint64_t* barrier)
|
| 316 |
+
{
|
| 317 |
+
_CUDA_AWBARRIER_SM_TARGET::__awbarrier_inval(barrier);
|
| 318 |
+
}
|
| 319 |
+
|
| 320 |
+
_CUDA_AWBARRIER_QUALIFIER
|
| 321 |
+
uint32_t awbarrier_token_pending_count(uint64_t token)
|
| 322 |
+
{
|
| 323 |
+
return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_token_pending_count(token);
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
template<bool _Drop>
|
| 327 |
+
_CUDA_AWBARRIER_QUALIFIER
|
| 328 |
+
uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t arrive_count)
|
| 329 |
+
{
|
| 330 |
+
return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop_no_complete<_Drop>(barrier, arrive_count);
|
| 331 |
+
}
|
| 332 |
+
|
| 333 |
+
template<bool _Drop>
|
| 334 |
+
_CUDA_AWBARRIER_QUALIFIER
|
| 335 |
+
uint64_t awbarrier_arrive_drop(uint64_t* barrier)
|
| 336 |
+
{
|
| 337 |
+
return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop<_Drop>(barrier);
|
| 338 |
+
}
|
| 339 |
+
|
| 340 |
+
_CUDA_AWBARRIER_QUALIFIER
|
| 341 |
+
bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
|
| 342 |
+
{
|
| 343 |
+
return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_test_wait(barrier, token);
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
_CUDA_AWBARRIER_END_INTERNAL_NAMESPACE
|
| 347 |
+
|
| 348 |
+
#endif /* defined(_CUDA_AWBARRIER_SM_TARGET) */
|
| 349 |
+
|
| 350 |
+
#endif /* !_CUDA_AWBARRIER_HELPERS_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDA_AWBARRIER_PRIMITIVES_H_
|
| 51 |
+
#define _CUDA_AWBARRIER_PRIMITIVES_H_
|
| 52 |
+
|
| 53 |
+
#include "cuda_awbarrier_helpers.h"
|
| 54 |
+
|
| 55 |
+
#if !defined(_CUDA_AWBARRIER_SM_TARGET)
|
| 56 |
+
# error This file requires compute capability 7.0 or greater.
|
| 57 |
+
#endif
|
| 58 |
+
|
| 59 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER __host__
|
| 60 |
+
uint32_t __mbarrier_maximum_count() {
|
| 61 |
+
return _CUDA_AWBARRIER_MAX_COUNT;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 65 |
+
void __mbarrier_init(__mbarrier_t* barrier, uint32_t expected_count) {
|
| 66 |
+
_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(barrier, expected_count);
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 70 |
+
void __mbarrier_inval(__mbarrier_t* barrier) {
|
| 71 |
+
_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(barrier);
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 75 |
+
__mbarrier_token_t __mbarrier_arrive(__mbarrier_t* barrier) {
|
| 76 |
+
return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(barrier);
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 80 |
+
__mbarrier_token_t __mbarrier_arrive_and_drop(__mbarrier_t* barrier) {
|
| 81 |
+
return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(barrier);
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 85 |
+
bool __mbarrier_test_wait(__mbarrier_t* barrier, __mbarrier_token_t token) {
|
| 86 |
+
return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(barrier, token);
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
_CUDA_AWBARRIER_STATIC_QUALIFIER
|
| 90 |
+
uint32_t __mbarrier_token_pending_count(__mbarrier_token_t token) {
|
| 91 |
+
return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(token);
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
#endif /* !_CUDA_AWBARRIER_PRIMITIVES_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h
ADDED
|
@@ -0,0 +1,1958 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/**
|
| 51 |
+
* CUDA Occupancy Calculator
|
| 52 |
+
*
|
| 53 |
+
* NAME
|
| 54 |
+
*
|
| 55 |
+
* cudaOccMaxActiveBlocksPerMultiprocessor,
|
| 56 |
+
* cudaOccMaxPotentialOccupancyBlockSize,
|
| 57 |
+
* cudaOccMaxPotentialOccupancyBlockSizeVariableSMem
|
| 58 |
+
* cudaOccAvailableDynamicSMemPerBlock
|
| 59 |
+
*
|
| 60 |
+
* DESCRIPTION
|
| 61 |
+
*
|
| 62 |
+
* The CUDA occupancy calculator provides a standalone, programmatical
|
| 63 |
+
* interface to compute the occupancy of a function on a device. It can also
|
| 64 |
+
* provide occupancy-oriented launch configuration suggestions.
|
| 65 |
+
*
|
| 66 |
+
* The function and device are defined by the user through
|
| 67 |
+
* cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState
|
| 68 |
+
* structures. All APIs require all 3 of them.
|
| 69 |
+
*
|
| 70 |
+
* See the structure definition for more details about the device / function
|
| 71 |
+
* descriptors.
|
| 72 |
+
*
|
| 73 |
+
* See each API's prototype for API usage.
|
| 74 |
+
*
|
| 75 |
+
* COMPATIBILITY
|
| 76 |
+
*
|
| 77 |
+
* The occupancy calculator will be updated on each major CUDA toolkit
|
| 78 |
+
* release. It does not provide forward compatibility, i.e. new hardwares
|
| 79 |
+
* released after this implementation's release will not be supported.
|
| 80 |
+
*
|
| 81 |
+
* NOTE
|
| 82 |
+
*
|
| 83 |
+
* If there is access to CUDA runtime, and the sole intent is to calculate
|
| 84 |
+
* occupancy related values on one of the accessible CUDA devices, using CUDA
|
| 85 |
+
* runtime's occupancy calculation APIs is recommended.
|
| 86 |
+
*
|
| 87 |
+
*/
|
| 88 |
+
|
| 89 |
+
#ifndef __cuda_occupancy_h__
|
| 90 |
+
#define __cuda_occupancy_h__
|
| 91 |
+
|
| 92 |
+
#include <stddef.h>
|
| 93 |
+
#include <limits.h>
|
| 94 |
+
#include <string.h>
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
// __OCC_INLINE will be undefined at the end of this header
|
| 98 |
+
//
|
| 99 |
+
#ifdef __CUDACC__
|
| 100 |
+
#define __OCC_INLINE inline __host__ __device__
|
| 101 |
+
#elif defined _MSC_VER
|
| 102 |
+
#define __OCC_INLINE __inline
|
| 103 |
+
#else // GNUCC assumed
|
| 104 |
+
#define __OCC_INLINE inline
|
| 105 |
+
#endif
|
| 106 |
+
|
| 107 |
+
enum cudaOccError_enum {
|
| 108 |
+
CUDA_OCC_SUCCESS = 0, // no error encountered
|
| 109 |
+
CUDA_OCC_ERROR_INVALID_INPUT = 1, // input parameter is invalid
|
| 110 |
+
CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2, // requested device is not supported in
|
| 111 |
+
// current implementation or device is
|
| 112 |
+
// invalid
|
| 113 |
+
};
|
| 114 |
+
typedef enum cudaOccError_enum cudaOccError;
|
| 115 |
+
|
| 116 |
+
typedef struct cudaOccResult cudaOccResult;
|
| 117 |
+
typedef struct cudaOccDeviceProp cudaOccDeviceProp;
|
| 118 |
+
typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
|
| 119 |
+
typedef struct cudaOccDeviceState cudaOccDeviceState;
|
| 120 |
+
|
| 121 |
+
/**
|
| 122 |
+
* The CUDA occupancy calculator computes the occupancy of the function
|
| 123 |
+
* described by attributes with the given block size (blockSize), static device
|
| 124 |
+
* properties (properties), dynamic device states (states) and per-block dynamic
|
| 125 |
+
* shared memory allocation (dynamicSMemSize) in bytes, and output it through
|
| 126 |
+
* result along with other useful information. The occupancy is computed in
|
| 127 |
+
* terms of the maximum number of active blocks per multiprocessor. The user can
|
| 128 |
+
* then convert it to other metrics, such as number of active warps.
|
| 129 |
+
*
|
| 130 |
+
* RETURN VALUE
|
| 131 |
+
*
|
| 132 |
+
* The occupancy and related information is returned through result.
|
| 133 |
+
*
|
| 134 |
+
* If result->activeBlocksPerMultiprocessor is 0, then the given parameter
|
| 135 |
+
* combination cannot run on the device.
|
| 136 |
+
*
|
| 137 |
+
* ERRORS
|
| 138 |
+
*
|
| 139 |
+
* CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
|
| 140 |
+
* CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
|
| 141 |
+
* current implementation or device is invalid
|
| 142 |
+
*/
|
| 143 |
+
static __OCC_INLINE
|
| 144 |
+
cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
|
| 145 |
+
cudaOccResult *result, // out
|
| 146 |
+
const cudaOccDeviceProp *properties, // in
|
| 147 |
+
const cudaOccFuncAttributes *attributes, // in
|
| 148 |
+
const cudaOccDeviceState *state, // in
|
| 149 |
+
int blockSize, // in
|
| 150 |
+
size_t dynamicSmemSize); // in
|
| 151 |
+
|
| 152 |
+
/**
|
| 153 |
+
* The CUDA launch configurator C API suggests a grid / block size pair (in
|
| 154 |
+
* minGridSize and blockSize) that achieves the best potential occupancy
|
| 155 |
+
* (i.e. maximum number of active warps with the smallest number of blocks) for
|
| 156 |
+
* the given function described by attributes, on a device described by
|
| 157 |
+
* properties with settings in state.
|
| 158 |
+
*
|
| 159 |
+
* If per-block dynamic shared memory allocation is not needed, the user should
|
| 160 |
+
* leave both blockSizeToDynamicSMemSize and dynamicSMemSize as 0.
|
| 161 |
+
*
|
| 162 |
+
* If per-block dynamic shared memory allocation is needed, then if the dynamic
|
| 163 |
+
* shared memory size is constant regardless of block size, the size should be
|
| 164 |
+
* passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be
|
| 165 |
+
* NULL.
|
| 166 |
+
*
|
| 167 |
+
* Otherwise, if the per-block dynamic shared memory size varies with different
|
| 168 |
+
* block sizes, the user needs to provide a pointer to an unary function through
|
| 169 |
+
* blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by
|
| 170 |
+
* a block of the function for any given block size. dynamicSMemSize is
|
| 171 |
+
* ignored. An example signature is:
|
| 172 |
+
*
|
| 173 |
+
* // Take block size, returns dynamic shared memory needed
|
| 174 |
+
* size_t blockToSmem(int blockSize);
|
| 175 |
+
*
|
| 176 |
+
* RETURN VALUE
|
| 177 |
+
*
|
| 178 |
+
* The suggested block size and the minimum number of blocks needed to achieve
|
| 179 |
+
* the maximum occupancy are returned through blockSize and minGridSize.
|
| 180 |
+
*
|
| 181 |
+
* If *blockSize is 0, then the given combination cannot run on the device.
|
| 182 |
+
*
|
| 183 |
+
* ERRORS
|
| 184 |
+
*
|
| 185 |
+
* CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
|
| 186 |
+
* CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
|
| 187 |
+
* current implementation or device is invalid
|
| 188 |
+
*
|
| 189 |
+
*/
|
| 190 |
+
static __OCC_INLINE
|
| 191 |
+
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
|
| 192 |
+
int *minGridSize, // out
|
| 193 |
+
int *blockSize, // out
|
| 194 |
+
const cudaOccDeviceProp *properties, // in
|
| 195 |
+
const cudaOccFuncAttributes *attributes, // in
|
| 196 |
+
const cudaOccDeviceState *state, // in
|
| 197 |
+
size_t (*blockSizeToDynamicSMemSize)(int), // in
|
| 198 |
+
size_t dynamicSMemSize); // in
|
| 199 |
+
|
| 200 |
+
/**
|
| 201 |
+
* The CUDA launch configurator C++ API suggests a grid / block size pair (in
|
| 202 |
+
* minGridSize and blockSize) that achieves the best potential occupancy
|
| 203 |
+
* (i.e. the maximum number of active warps with the smallest number of blocks)
|
| 204 |
+
* for the given function described by attributes, on a device described by
|
| 205 |
+
* properties with settings in state.
|
| 206 |
+
*
|
| 207 |
+
* If per-block dynamic shared memory allocation is 0 or constant regardless of
|
| 208 |
+
* block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to
|
| 209 |
+
* configure the launch. A constant dynamic shared memory allocation size in
|
| 210 |
+
* bytes can be passed through dynamicSMemSize.
|
| 211 |
+
*
|
| 212 |
+
* Otherwise, if the per-block dynamic shared memory size varies with different
|
| 213 |
+
* block sizes, the user needs to use
|
| 214 |
+
* cudaOccMaxPotentialOccupancyBlockSizeVariableSmem instead, and provide a
|
| 215 |
+
* functor / pointer to an unary function (blockSizeToDynamicSMemSize) that
|
| 216 |
+
* computes the dynamic shared memory needed by func for any given block
|
| 217 |
+
* size. An example signature is:
|
| 218 |
+
*
|
| 219 |
+
* // Take block size, returns per-block dynamic shared memory needed
|
| 220 |
+
* size_t blockToSmem(int blockSize);
|
| 221 |
+
*
|
| 222 |
+
* RETURN VALUE
|
| 223 |
+
*
|
| 224 |
+
* The suggested block size and the minimum number of blocks needed to achieve
|
| 225 |
+
* the maximum occupancy are returned through blockSize and minGridSize.
|
| 226 |
+
*
|
| 227 |
+
* If *blockSize is 0, then the given combination cannot run on the device.
|
| 228 |
+
*
|
| 229 |
+
* ERRORS
|
| 230 |
+
*
|
| 231 |
+
* CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
|
| 232 |
+
* CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
|
| 233 |
+
* current implementation or device is invalid
|
| 234 |
+
*
|
| 235 |
+
*/
|
| 236 |
+
|
| 237 |
+
#if defined(__cplusplus)
|
| 238 |
+
namespace {
|
| 239 |
+
|
| 240 |
+
__OCC_INLINE
|
| 241 |
+
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
|
| 242 |
+
int *minGridSize, // out
|
| 243 |
+
int *blockSize, // out
|
| 244 |
+
const cudaOccDeviceProp *properties, // in
|
| 245 |
+
const cudaOccFuncAttributes *attributes, // in
|
| 246 |
+
const cudaOccDeviceState *state, // in
|
| 247 |
+
size_t dynamicSMemSize = 0); // in
|
| 248 |
+
|
| 249 |
+
template <typename UnaryFunction>
|
| 250 |
+
__OCC_INLINE
|
| 251 |
+
cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
|
| 252 |
+
int *minGridSize, // out
|
| 253 |
+
int *blockSize, // out
|
| 254 |
+
const cudaOccDeviceProp *properties, // in
|
| 255 |
+
const cudaOccFuncAttributes *attributes, // in
|
| 256 |
+
const cudaOccDeviceState *state, // in
|
| 257 |
+
UnaryFunction blockSizeToDynamicSMemSize); // in
|
| 258 |
+
|
| 259 |
+
} // namespace anonymous
|
| 260 |
+
#endif // defined(__cplusplus)
|
| 261 |
+
|
| 262 |
+
/**
|
| 263 |
+
*
|
| 264 |
+
* The CUDA dynamic shared memory calculator computes the maximum size of
|
| 265 |
+
* per-block dynamic shared memory if we want to place numBlocks blocks
|
| 266 |
+
* on an SM.
|
| 267 |
+
*
|
| 268 |
+
* RETURN VALUE
|
| 269 |
+
*
|
| 270 |
+
* Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow
|
| 271 |
+
* numBlocks blocks per SM.
|
| 272 |
+
*
|
| 273 |
+
* ERRORS
|
| 274 |
+
*
|
| 275 |
+
* CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
|
| 276 |
+
* CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
|
| 277 |
+
* current implementation or device is invalid
|
| 278 |
+
*
|
| 279 |
+
*/
|
| 280 |
+
static __OCC_INLINE
|
| 281 |
+
cudaOccError cudaOccAvailableDynamicSMemPerBlock(
|
| 282 |
+
size_t *dynamicSmemSize,
|
| 283 |
+
const cudaOccDeviceProp *properties,
|
| 284 |
+
const cudaOccFuncAttributes *attributes,
|
| 285 |
+
const cudaOccDeviceState *state,
|
| 286 |
+
int numBlocks,
|
| 287 |
+
int blockSize);
|
| 288 |
+
|
| 289 |
+
/**
|
| 290 |
+
* Data structures
|
| 291 |
+
*
|
| 292 |
+
* These structures are subject to change for future architecture and CUDA
|
| 293 |
+
* releases. C users should initialize the structure as {0}.
|
| 294 |
+
*
|
| 295 |
+
*/
|
| 296 |
+
|
| 297 |
+
/**
|
| 298 |
+
* Device descriptor
|
| 299 |
+
*
|
| 300 |
+
* This structure describes a device.
|
| 301 |
+
*/
|
| 302 |
+
struct cudaOccDeviceProp {
    int    computeMajor;                // Compute capability major version
    int    computeMinor;                // Compute capability minor
                                        // version. None supported minor version
                                        // may cause error
    int    maxThreadsPerBlock;          // Maximum number of threads per block
    int    maxThreadsPerMultiprocessor; // Maximum number of threads per SM
                                        // i.e. (Max. number of warps) x (warp
                                        // size)
    int    regsPerBlock;                // Maximum number of registers per block
    int    regsPerMultiprocessor;       // Maximum number of registers per SM
    int    warpSize;                    // Warp size
    size_t sharedMemPerBlock;           // Maximum shared memory size per block
    size_t sharedMemPerMultiprocessor;  // Maximum shared memory size per SM
    int    numSms;                      // Number of SMs available
    size_t sharedMemPerBlockOptin;      // Maximum optin shared memory size per block
    size_t reservedSharedMemPerBlock;   // Shared memory per block reserved by driver

#ifdef __cplusplus
    // This structure can be converted from a cudaDeviceProp structure for users
    // that use this header in their CUDA applications.
    //
    // If the application have access to the CUDA Runtime API, the application
    // can obtain the device properties of a CUDA device through
    // cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the
    // cudaDeviceProp structure.
    //
    // Example:
    /*
     {
         cudaDeviceProp prop;

         cudaGetDeviceProperties(&prop, ...);

         cudaOccDeviceProp occProp = prop;

         ...

         cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...);
     }
     */
    //
    // Templated so any type with the cudaDeviceProp field names works;
    // note that DeviceProp names the field maxThreadsPerMultiProcessor
    // (capital P) while this struct uses maxThreadsPerMultiprocessor.
    template<typename DeviceProp>
    __OCC_INLINE
    cudaOccDeviceProp(const DeviceProp &props)
    :   computeMajor                (props.major),
        computeMinor                (props.minor),
        maxThreadsPerBlock          (props.maxThreadsPerBlock),
        maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
        regsPerBlock                (props.regsPerBlock),
        regsPerMultiprocessor       (props.regsPerMultiprocessor),
        warpSize                    (props.warpSize),
        sharedMemPerBlock           (props.sharedMemPerBlock),
        sharedMemPerMultiprocessor  (props.sharedMemPerMultiprocessor),
        numSms                      (props.multiProcessorCount),
        sharedMemPerBlockOptin      (props.sharedMemPerBlockOptin),
        reservedSharedMemPerBlock   (props.reservedSharedMemPerBlock)
    {}

    // Default constructor: zero-initialize everything, matching the
    // recommended C initialization of {0}.
    __OCC_INLINE
    cudaOccDeviceProp()
    :   computeMajor                (0),
        computeMinor                (0),
        maxThreadsPerBlock          (0),
        maxThreadsPerMultiprocessor (0),
        regsPerBlock                (0),
        regsPerMultiprocessor       (0),
        warpSize                    (0),
        sharedMemPerBlock           (0),
        sharedMemPerMultiprocessor  (0),
        numSms                      (0),
        sharedMemPerBlockOptin      (0),
        reservedSharedMemPerBlock   (0)
    {}
#endif // __cplusplus
};
|
| 378 |
+
|
| 379 |
+
/**
|
| 380 |
+
* Partitioned global caching option
|
| 381 |
+
*/
|
| 382 |
+
// Partitioned global caching preference for a kernel launch.
typedef enum cudaOccPartitionedGCConfig_enum {
    PARTITIONED_GC_OFF,      // Disable partitioned global caching
    PARTITIONED_GC_ON,       // Prefer partitioned global caching
    PARTITIONED_GC_ON_STRICT // Force partitioned global caching
} cudaOccPartitionedGCConfig;
|
| 387 |
+
|
| 388 |
+
/**
|
| 389 |
+
* Per function opt in maximum dynamic shared memory limit
|
| 390 |
+
*/
|
| 391 |
+
// Per-function opt-in maximum dynamic shared memory limit.
typedef enum cudaOccFuncShmemConfig_enum {
    FUNC_SHMEM_LIMIT_DEFAULT,   // Default shmem limit
    FUNC_SHMEM_LIMIT_OPTIN,     // Use the optin shmem limit
} cudaOccFuncShmemConfig;
|
| 395 |
+
|
| 396 |
+
/**
|
| 397 |
+
* Function descriptor
|
| 398 |
+
*
|
| 399 |
+
* This structure describes a CUDA function.
|
| 400 |
+
*/
|
| 401 |
+
struct cudaOccFuncAttributes {
    int maxThreadsPerBlock; // Maximum block size the function can work with. If
                            // unlimited, use INT_MAX or any value greater than
                            // or equal to maxThreadsPerBlock of the device
    int numRegs;            // Number of registers used. When the function is
                            // launched on device, the register count may change
                            // due to internal tools requirements.
    size_t sharedSizeBytes; // Number of static shared memory used

    cudaOccPartitionedGCConfig partitionedGCConfig;
                            // Partitioned global caching is required to enable
                            // caching on certain chips, such as sm_52
                            // devices. Partitioned global caching can be
                            // automatically disabled if the occupancy
                            // requirement of the launch cannot support caching.
                            //
                            // To override this behavior with caching on and
                            // calculate occupancy strictly according to the
                            // preference, set partitionedGCConfig to
                            // PARTITIONED_GC_ON_STRICT. This is especially
                            // useful for experimenting and finding launch
                            // configurations (MaxPotentialOccupancyBlockSize)
                            // that allow global caching to take effect.
                            //
                            // This flag only affects the occupancy calculation.

    cudaOccFuncShmemConfig shmemLimitConfig;
                            // Certain chips like sm_70 allow a user to opt into
                            // a higher per block limit of dynamic shared memory
                            // This optin is performed on a per function basis
                            // using the cuFuncSetAttribute function

    size_t maxDynamicSharedSizeBytes;
                            // User set limit on maximum dynamic shared memory
                            // usable by the kernel
                            // This limit is set using the cuFuncSetAttribute
                            // function.

    int numBlockBarriers;   // Number of block barriers used (default to 1)
#ifdef __cplusplus
    // This structure can be converted from a cudaFuncAttributes structure for
    // users that use this header in their CUDA applications.
    //
    // If the application have access to the CUDA Runtime API, the application
    // can obtain the function attributes of a CUDA kernel function through
    // cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the
    // cudaFuncAttributes structure.
    //
    // Example:
    /*
      __global__ void foo() {...}

      ...

      {
          cudaFuncAttributes attr;

          cudaFuncGetAttributes(&attr, foo);

          cudaOccFuncAttributes occAttr = attr;

          ...

          cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...);
      }
     */
    //
    // NOTE: the converting constructor opts into the higher shmem limit
    // (FUNC_SHMEM_LIMIT_OPTIN) and assumes 1 block barrier, while the
    // default constructor uses FUNC_SHMEM_LIMIT_DEFAULT and 0 barriers.
    template<typename FuncAttributes>
    __OCC_INLINE
    cudaOccFuncAttributes(const FuncAttributes &attr)
    :   maxThreadsPerBlock        (attr.maxThreadsPerBlock),
        numRegs                   (attr.numRegs),
        sharedSizeBytes           (attr.sharedSizeBytes),
        partitionedGCConfig       (PARTITIONED_GC_OFF),
        shmemLimitConfig          (FUNC_SHMEM_LIMIT_OPTIN),
        maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
        numBlockBarriers          (1)
    {}

    __OCC_INLINE
    cudaOccFuncAttributes()
    :   maxThreadsPerBlock        (0),
        numRegs                   (0),
        sharedSizeBytes           (0),
        partitionedGCConfig       (PARTITIONED_GC_OFF),
        shmemLimitConfig          (FUNC_SHMEM_LIMIT_DEFAULT),
        maxDynamicSharedSizeBytes (0),
        numBlockBarriers          (0)
    {}
#endif
};
|
| 492 |
+
|
| 493 |
+
// Legacy L1 / shared memory split preference (pre-Volta style).
typedef enum cudaOccCacheConfig_enum {
    CACHE_PREFER_NONE   = 0x00, // no preference for shared memory or L1 (default)
    CACHE_PREFER_SHARED = 0x01, // prefer larger shared memory and smaller L1 cache
    CACHE_PREFER_L1     = 0x02, // prefer larger L1 cache and smaller shared memory
    CACHE_PREFER_EQUAL  = 0x03  // prefer equal sized L1 cache and shared memory
} cudaOccCacheConfig;
|
| 499 |
+
|
| 500 |
+
// Volta+ shared memory carveout preference, expressed as a percentage of the
// maximum available shared memory (values other than these named ones are
// also accepted in the 0..100 range).
typedef enum cudaOccCarveoutConfig_enum {
    SHAREDMEM_CARVEOUT_DEFAULT       = -1,  // no preference for shared memory or L1 (default)
    SHAREDMEM_CARVEOUT_MAX_SHARED    = 100, // prefer maximum available shared memory, minimum L1 cache
    SHAREDMEM_CARVEOUT_MAX_L1        = 0,   // prefer maximum available L1 cache, minimum shared memory
    SHAREDMEM_CARVEOUT_HALF          = 50   // prefer half of maximum available shared memory, with the rest as L1 cache
} cudaOccCarveoutConfig;
|
| 506 |
+
|
| 507 |
+
/**
|
| 508 |
+
* Device state descriptor
|
| 509 |
+
*
|
| 510 |
+
* This structure describes device settings that affect occupancy calculation.
|
| 511 |
+
*/
|
| 512 |
+
/**
 * Device state descriptor
 *
 * This structure describes device settings that affect occupancy calculation.
 */
struct cudaOccDeviceState
{
    // Cache / shared memory split preference. Deprecated on Volta
    cudaOccCacheConfig cacheConfig;
    // Shared memory / L1 split preference. Supported on only Volta
    int carveoutConfig;

#ifdef __cplusplus
    __OCC_INLINE
    cudaOccDeviceState()
    :   cacheConfig     (CACHE_PREFER_NONE),
        carveoutConfig  (SHAREDMEM_CARVEOUT_DEFAULT)
    {}
#endif
};
|
| 527 |
+
|
| 528 |
+
// Bit flags recorded in cudaOccResult::limitingFactors; several may be set
// at once when multiple resources are equally limiting.
typedef enum cudaOccLimitingFactor_enum {
    // Occupancy limited due to:
    OCC_LIMIT_WARPS         = 0x01, // - warps available
    OCC_LIMIT_REGISTERS     = 0x02, // - registers available
    OCC_LIMIT_SHARED_MEMORY = 0x04, // - shared memory available
    OCC_LIMIT_BLOCKS        = 0x08, // - blocks available
    OCC_LIMIT_BARRIERS      = 0x10  // - barrier available
} cudaOccLimitingFactor;
|
| 536 |
+
|
| 537 |
+
/**
|
| 538 |
+
* Occupancy output
|
| 539 |
+
*
|
| 540 |
+
* This structure contains occupancy calculator's output.
|
| 541 |
+
*/
|
| 542 |
+
/**
 * Occupancy output
 *
 * This structure contains occupancy calculator's output.
 */
struct cudaOccResult {
    int activeBlocksPerMultiprocessor; // Occupancy
    unsigned int limitingFactors;      // Factors that limited occupancy. A bit
                                       // field that counts the limiting
                                       // factors, see cudaOccLimitingFactor
    int blockLimitRegs;                // Occupancy due to register
                                       // usage, INT_MAX if the kernel does not
                                       // use any register.
    int blockLimitSharedMem;           // Occupancy due to shared memory
                                       // usage, INT_MAX if the kernel does not
                                       // use shared memory.
    int blockLimitWarps;               // Occupancy due to block size limit
    int blockLimitBlocks;              // Occupancy due to maximum number of blocks
                                       // managable per SM
    int blockLimitBarriers;            // Occupancy due to block barrier usage
    int allocatedRegistersPerBlock;    // Actual number of registers allocated per
                                       // block
    size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated
                                       // per block
    cudaOccPartitionedGCConfig partitionedGCConfig;
                                       // Report if partitioned global caching
                                       // is actually enabled.
};
|
| 565 |
+
|
| 566 |
+
/**
|
| 567 |
+
* Partitioned global caching support
|
| 568 |
+
*
|
| 569 |
+
* See cudaOccPartitionedGlobalCachingModeSupport
|
| 570 |
+
*/
|
| 571 |
+
/**
 * Partitioned global caching support
 *
 * See cudaOccPartitionedGlobalCachingModeSupport
 */
typedef enum cudaOccPartitionedGCSupport_enum {
    PARTITIONED_GC_NOT_SUPPORTED,  // Partitioned global caching is not supported
    PARTITIONED_GC_SUPPORTED,      // Partitioned global caching is supported
} cudaOccPartitionedGCSupport;
|
| 575 |
+
|
| 576 |
+
/**
|
| 577 |
+
* Implementation
|
| 578 |
+
*/
|
| 579 |
+
|
| 580 |
+
/**
|
| 581 |
+
* Max compute capability supported
|
| 582 |
+
*/
|
| 583 |
+
#define __CUDA_OCC_MAJOR__ 9
|
| 584 |
+
#define __CUDA_OCC_MINOR__ 0
|
| 585 |
+
|
| 586 |
+
//////////////////////////////////////////
|
| 587 |
+
// Mathematical Helper Functions //
|
| 588 |
+
//////////////////////////////////////////
|
| 589 |
+
|
| 590 |
+
// Return the smaller of two integers.
static __OCC_INLINE int __occMin(int lhs, int rhs)
{
    if (rhs < lhs) {
        return rhs;
    }
    return lhs;
}
|
| 594 |
+
|
| 595 |
+
// Integer division of x by y, rounding toward positive infinity.
// Assumes non-negative x and positive y (the only way it is used here).
static __OCC_INLINE int __occDivideRoundUp(int x, int y)
{
    int biased = x + y - 1;
    return biased / y;
}
|
| 599 |
+
|
| 600 |
+
// Round x up to the next multiple of y.
static __OCC_INLINE int __occRoundUp(int x, int y)
{
    int quotient = __occDivideRoundUp(x, y);
    return quotient * y;
}
|
| 604 |
+
|
| 605 |
+
//////////////////////////////////////////
|
| 606 |
+
// Architectural Properties //
|
| 607 |
+
//////////////////////////////////////////
|
| 608 |
+
|
| 609 |
+
/**
|
| 610 |
+
* Granularity of shared memory allocation
|
| 611 |
+
*/
|
| 612 |
+
/**
 * Granularity of shared memory allocation
 *
 * Writes into *limit the chunk size (in bytes) in which per-block shared
 * memory is allocated on the given architecture. Returns
 * CUDA_OCC_ERROR_UNKNOWN_DEVICE for compute majors this header does not know
 * about (including the never-shipped major 4).
 */
static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
{
    int major = properties->computeMajor;

    // Kepler through Volta/Turing allocate shared memory in 256-byte chunks.
    if (major == 3 || major == 5 || major == 6 || major == 7) {
        *limit = 256;
        return CUDA_OCC_SUCCESS;
    }

    // Ampere and Hopper use 128-byte chunks.
    if (major == 8 || major == 9) {
        *limit = 128;
        return CUDA_OCC_SUCCESS;
    }

    return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
}
|
| 635 |
+
|
| 636 |
+
/**
|
| 637 |
+
* Maximum number of registers per thread
|
| 638 |
+
*/
|
| 639 |
+
/**
 * Maximum number of registers per thread
 *
 * Writes the per-thread register ceiling for the given architecture into
 * *limit; returns CUDA_OCC_ERROR_UNKNOWN_DEVICE for unrecognized majors.
 */
static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties)
{
    int major = properties->computeMajor;

    // Kepler, Maxwell and Pascal cap a thread at 255 registers.
    if (major == 3 || major == 5 || major == 6) {
        *limit = 255;
        return CUDA_OCC_SUCCESS;
    }

    // Volta/Turing, Ampere and Hopper raise the cap to 256.
    if (major == 7 || major == 8 || major == 9) {
        *limit = 256;
        return CUDA_OCC_SUCCESS;
    }

    return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
}
|
| 662 |
+
|
| 663 |
+
/**
|
| 664 |
+
* Granularity of register allocation
|
| 665 |
+
*/
|
| 666 |
+
/**
 * Granularity of register allocation
 *
 * Writes into *limit the warp-register allocation granularity for the given
 * architecture. All supported majors (3, 5-9) use 256; anything else yields
 * CUDA_OCC_ERROR_UNKNOWN_DEVICE (note major 4 was never shipped).
 */
static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
{
    int major = properties->computeMajor;
    int supported = (major == 3) ||
                    (major >= 5 && major <= 9);

    if (!supported) {
        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
    }

    *limit = 256;
    return CUDA_OCC_SUCCESS;
}
|
| 687 |
+
|
| 688 |
+
/**
|
| 689 |
+
* Number of sub-partitions
|
| 690 |
+
*/
|
| 691 |
+
/**
 * Number of sub-partitions
 *
 * Writes into *limit the number of SM sub-partitions (warp schedulers) for
 * the given architecture. All supported majors use 4, except sm_60 (Pascal
 * GP100, computeMinor == 0) which has 2.
 */
static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties)
{
    int major = properties->computeMajor;

    if (major == 6) {
        // sm_60 has 2 sub-partitions; sm_61 / sm_62 have 4.
        *limit = (properties->computeMinor != 0) ? 4 : 2;
        return CUDA_OCC_SUCCESS;
    }

    if (major == 3 || major == 5 || major == 7 || major == 8 || major == 9) {
        *limit = 4;
        return CUDA_OCC_SUCCESS;
    }

    return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
}
|
| 714 |
+
|
| 715 |
+
|
| 716 |
+
/**
|
| 717 |
+
* Maximum number of blocks that can run simultaneously on a multiprocessor
|
| 718 |
+
*/
|
| 719 |
+
/**
 * Maximum number of blocks that can run simultaneously on a multiprocessor
 *
 * Writes the hardware block-slot limit per SM into *limit; returns
 * CUDA_OCC_ERROR_UNKNOWN_DEVICE for unrecognized compute majors.
 */
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties)
{
    int major = properties->computeMajor;
    int minor = properties->computeMinor;
    int slots;

    if (major == 3) {
        // Kepler
        slots = 16;
    }
    else if (major == 5 || major == 6) {
        // Maxwell / Pascal
        slots = 32;
    }
    else if (major == 7) {
        // Turing (sm_75) has half the block slots of Volta.
        slots = (minor == 5) ? 16 : 32;
    }
    else if (major == 8) {
        // Ampere family: GA100 (8.0) keeps 32, sm_89 (Ada) has 24,
        // the other sm_8x parts have 16.
        if (minor == 0) {
            slots = 32;
        }
        else if (minor == 9) {
            slots = 24;
        }
        else {
            slots = 16;
        }
    }
    else if (major == 9) {
        // Hopper
        slots = 32;
    }
    else {
        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
    }

    *limit = slots;
    return CUDA_OCC_SUCCESS;
}
|
| 758 |
+
|
| 759 |
+
/**
|
| 760 |
+
* Align up shared memory based on compute major configurations
|
| 761 |
+
*/
|
| 762 |
+
/**
 * Align up shared memory based on compute major configurations
 *
 * Rounds *shMemSize up, in place, to the next shared-memory carveout size the
 * architecture actually supports. Returns CUDA_OCC_ERROR_INVALID_INPUT when
 * the requested size exceeds the architecture's largest carveout, and
 * CUDA_OCC_ERROR_UNKNOWN_DEVICE for majors other than 7, 8 and 9.
 */
static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties)
{
    // Volta and Turing have shared L1 cache / shared memory, and support cache
    // configuration to trade one for the other. These values are needed to
    // map carveout config ratio to the next available architecture size
    size_t size = *shMemSize;

    switch (properties->computeMajor) {
    case 7: {
        // Turing supports 32KB and 64KB shared mem.
        int isTuring = properties->computeMinor == 5;
        if (isTuring) {
            if (size <= 32 * 1024) {
                *shMemSize = 32 * 1024;
            }
            else if (size <= 64 * 1024) {
                *shMemSize = 64 * 1024;
            }
            else {
                return CUDA_OCC_ERROR_INVALID_INPUT;
            }
        }
        // Volta supports 0KB, 8KB, 16KB, 32KB, 64KB, and 96KB shared mem.
        else {
            if (size == 0) {
                *shMemSize = 0;
            }
            else if (size <= 8 * 1024) {
                *shMemSize = 8 * 1024;
            }
            else if (size <= 16 * 1024) {
                *shMemSize = 16 * 1024;
            }
            else if (size <= 32 * 1024) {
                *shMemSize = 32 * 1024;
            }
            else if (size <= 64 * 1024) {
                *shMemSize = 64 * 1024;
            }
            else if (size <= 96 * 1024) {
                *shMemSize = 96 * 1024;
            }
            else {
                return CUDA_OCC_ERROR_INVALID_INPUT;
            }
        }
        break;
    }
    case 8:
        // sm_80 and sm_87 support carveouts up to 164KB; the other Ampere
        // parts (e.g. sm_86, sm_89) top out at 100KB.
        if (properties->computeMinor == 0 || properties->computeMinor == 7) {
            if (size == 0) {
                *shMemSize = 0;
            }
            else if (size <= 8 * 1024) {
                *shMemSize = 8 * 1024;
            }
            else if (size <= 16 * 1024) {
                *shMemSize = 16 * 1024;
            }
            else if (size <= 32 * 1024) {
                *shMemSize = 32 * 1024;
            }
            else if (size <= 64 * 1024) {
                *shMemSize = 64 * 1024;
            }
            else if (size <= 100 * 1024) {
                *shMemSize = 100 * 1024;
            }
            else if (size <= 132 * 1024) {
                *shMemSize = 132 * 1024;
            }
            else if (size <= 164 * 1024) {
                *shMemSize = 164 * 1024;
            }
            else {
                return CUDA_OCC_ERROR_INVALID_INPUT;
            }
        }
        else {
            if (size == 0) {
                *shMemSize = 0;
            }
            else if (size <= 8 * 1024) {
                *shMemSize = 8 * 1024;
            }
            else if (size <= 16 * 1024) {
                *shMemSize = 16 * 1024;
            }
            else if (size <= 32 * 1024) {
                *shMemSize = 32 * 1024;
            }
            else if (size <= 64 * 1024) {
                *shMemSize = 64 * 1024;
            }
            else if (size <= 100 * 1024) {
                *shMemSize = 100 * 1024;
            }
            else {
                return CUDA_OCC_ERROR_INVALID_INPUT;
            }
        }
        break;
    case 9: {
        // Hopper supports carveouts up to 228KB.
        if (size == 0) {
            *shMemSize = 0;
        }
        else if (size <= 8 * 1024) {
            *shMemSize = 8 * 1024;
        }
        else if (size <= 16 * 1024) {
            *shMemSize = 16 * 1024;
        }
        else if (size <= 32 * 1024) {
            *shMemSize = 32 * 1024;
        }
        else if (size <= 64 * 1024) {
            *shMemSize = 64 * 1024;
        }
        else if (size <= 100 * 1024) {
            *shMemSize = 100 * 1024;
        }
        else if (size <= 132 * 1024) {
            *shMemSize = 132 * 1024;
        }
        else if (size <= 164 * 1024) {
            *shMemSize = 164 * 1024;
        }
        else if (size <= 196 * 1024) {
            *shMemSize = 196 * 1024;
        }
        else if (size <= 228 * 1024) {
            *shMemSize = 228 * 1024;
        }
        else {
            return CUDA_OCC_ERROR_INVALID_INPUT;
        }
        break;
    }
    default:
        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
    }

    return CUDA_OCC_SUCCESS;
}
|
| 906 |
+
|
| 907 |
+
/**
|
| 908 |
+
* Shared memory based on the new carveoutConfig API introduced with Volta
|
| 909 |
+
*/
|
| 910 |
+
/**
 * Shared memory based on the new carveoutConfig API introduced with Volta
 *
 * Computes the per-SM shared memory budget implied by the carveout
 * preference (or, failing that, the legacy cacheConfig) and writes the
 * architecture-aligned result into *limit.
 */
static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    size_t preferenceShmemSize;

    // CUDA 9.0 introduces a new API to set shared memory - L1 configuration on supported
    // devices. This preference will take precedence over the older cacheConfig setting.
    // Map cacheConfig to its effective preference value.
    int effectivePreference = state->carveoutConfig;
    // Valid carveout values are -1 (default) through 100 (percent).
    if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    // No explicit carveout: fall back to the legacy cacheConfig preference.
    if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
        switch (state->cacheConfig)
        {
        case CACHE_PREFER_L1:
            effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1;
            break;
        case CACHE_PREFER_SHARED:
            effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED;
            break;
        case CACHE_PREFER_EQUAL:
            effectivePreference = SHAREDMEM_CARVEOUT_HALF;
            break;
        default:
            effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT;
            break;
        }
    }

    // Still default (no preference at all): grant the full per-SM shared
    // memory; otherwise treat the preference as a percentage.
    if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
        preferenceShmemSize = properties->sharedMemPerMultiprocessor;
    }
    else {
        preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100;
    }

    // Round up to a carveout size the architecture actually supports.
    status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties);
    *limit = preferenceShmemSize;
    return status;
}
|
| 952 |
+
|
| 953 |
+
/**
|
| 954 |
+
* Shared memory based on the cacheConfig
|
| 955 |
+
*/
|
| 956 |
+
/**
 * Shared memory based on the cacheConfig
 *
 * Pre-Volta path: computes the per-SM shared memory size implied by the
 * legacy cacheConfig preference and writes it into *limit. Only majors
 * 3, 5 and 6 are handled here; Volta+ uses cudaOccSMemPreferenceVoltaPlus.
 */
static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
{
    size_t bytes                          = 0;
    size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor;
    cudaOccCacheConfig cacheConfig        = state->cacheConfig;

    // Kepler has shared L1 cache / shared memory, and support cache
    // configuration to trade one for the other. These values are needed to
    // calculate the correct shared memory size for user requested cache
    // configuration.
    //
    size_t minCacheSize                   = 16384;
    size_t maxCacheSize                   = 49152;
    size_t cacheAndSharedTotal            = sharedMemPerMultiprocessorHigh + minCacheSize;
    size_t sharedMemPerMultiprocessorLow  = cacheAndSharedTotal - maxCacheSize;

    switch (properties->computeMajor) {
    case 3:
        // Kepler supports 16KB, 32KB, or 48KB partitions for L1. The rest
        // is shared memory.
        //
        switch (cacheConfig) {
        default :
        case CACHE_PREFER_NONE:
        case CACHE_PREFER_SHARED:
            bytes = sharedMemPerMultiprocessorHigh;
            break;
        case CACHE_PREFER_L1:
            bytes = sharedMemPerMultiprocessorLow;
            break;
        case CACHE_PREFER_EQUAL:
            // Equal is the mid-point between high and low. It should be
            // equivalent to low + 16KB.
            //
            bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2;
            break;
        }
        break;
    case 5:
    case 6:
        // Maxwell and Pascal have dedicated shared memory.
        //
        bytes = sharedMemPerMultiprocessorHigh;
        break;
    default:
        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
    }

    *limit = bytes;

    return CUDA_OCC_SUCCESS;
}
|
| 1008 |
+
|
| 1009 |
+
/**
|
| 1010 |
+
* Shared memory based on config requested by User
|
| 1011 |
+
*/
|
| 1012 |
+
static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
|
| 1013 |
+
{
|
| 1014 |
+
// Volta introduces a new API that allows for shared memory carveout preference. Because it is a shared memory preference,
|
| 1015 |
+
// it is handled separately from the cache config preference.
|
| 1016 |
+
if (properties->computeMajor >= 7) {
|
| 1017 |
+
return cudaOccSMemPreferenceVoltaPlus(limit, properties, state);
|
| 1018 |
+
}
|
| 1019 |
+
return cudaOccSMemPreference(limit, properties, state);
|
| 1020 |
+
}
|
| 1021 |
+
|
| 1022 |
+
/**
|
| 1023 |
+
* Return the per block shared memory limit based on function config
|
| 1024 |
+
*/
|
| 1025 |
+
static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta)
|
| 1026 |
+
{
|
| 1027 |
+
switch (properties->computeMajor) {
|
| 1028 |
+
case 2:
|
| 1029 |
+
case 3:
|
| 1030 |
+
case 4:
|
| 1031 |
+
case 5:
|
| 1032 |
+
case 6:
|
| 1033 |
+
*limit = properties->sharedMemPerBlock;
|
| 1034 |
+
break;
|
| 1035 |
+
case 7:
|
| 1036 |
+
case 8:
|
| 1037 |
+
case 9:
|
| 1038 |
+
switch (shmemLimitConfig) {
|
| 1039 |
+
default:
|
| 1040 |
+
case FUNC_SHMEM_LIMIT_DEFAULT:
|
| 1041 |
+
*limit = properties->sharedMemPerBlock;
|
| 1042 |
+
break;
|
| 1043 |
+
case FUNC_SHMEM_LIMIT_OPTIN:
|
| 1044 |
+
if (smemPerCta > properties->sharedMemPerBlock) {
|
| 1045 |
+
*limit = properties->sharedMemPerBlockOptin;
|
| 1046 |
+
}
|
| 1047 |
+
else {
|
| 1048 |
+
*limit = properties->sharedMemPerBlock;
|
| 1049 |
+
}
|
| 1050 |
+
break;
|
| 1051 |
+
}
|
| 1052 |
+
break;
|
| 1053 |
+
default:
|
| 1054 |
+
return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
|
| 1055 |
+
}
|
| 1056 |
+
|
| 1057 |
+
// Starting Ampere, CUDA driver reserves additional shared memory per block
|
| 1058 |
+
if (properties->computeMajor >= 8) {
|
| 1059 |
+
*limit += properties->reservedSharedMemPerBlock;
|
| 1060 |
+
}
|
| 1061 |
+
|
| 1062 |
+
return CUDA_OCC_SUCCESS;
|
| 1063 |
+
}
|
| 1064 |
+
|
| 1065 |
+
/**
|
| 1066 |
+
* Partitioned global caching mode support
|
| 1067 |
+
*/
|
| 1068 |
+
static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties)
|
| 1069 |
+
{
|
| 1070 |
+
*limit = PARTITIONED_GC_NOT_SUPPORTED;
|
| 1071 |
+
|
| 1072 |
+
if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) ||
|
| 1073 |
+
properties->computeMajor == 6) {
|
| 1074 |
+
*limit = PARTITIONED_GC_SUPPORTED;
|
| 1075 |
+
}
|
| 1076 |
+
|
| 1077 |
+
if (properties->computeMajor == 6 && properties->computeMinor == 0) {
|
| 1078 |
+
*limit = PARTITIONED_GC_NOT_SUPPORTED;
|
| 1079 |
+
}
|
| 1080 |
+
|
| 1081 |
+
return CUDA_OCC_SUCCESS;
|
| 1082 |
+
}
|
| 1083 |
+
|
| 1084 |
+
///////////////////////////////////////////////
|
| 1085 |
+
// User Input Sanity //
|
| 1086 |
+
///////////////////////////////////////////////
|
| 1087 |
+
|
| 1088 |
+
static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties)
|
| 1089 |
+
{
|
| 1090 |
+
// Verify device properties
|
| 1091 |
+
//
|
| 1092 |
+
// Each of these limits must be a positive number.
|
| 1093 |
+
//
|
| 1094 |
+
// Compute capacity is checked during the occupancy calculation
|
| 1095 |
+
//
|
| 1096 |
+
if (properties->maxThreadsPerBlock <= 0 ||
|
| 1097 |
+
properties->maxThreadsPerMultiprocessor <= 0 ||
|
| 1098 |
+
properties->regsPerBlock <= 0 ||
|
| 1099 |
+
properties->regsPerMultiprocessor <= 0 ||
|
| 1100 |
+
properties->warpSize <= 0 ||
|
| 1101 |
+
properties->sharedMemPerBlock <= 0 ||
|
| 1102 |
+
properties->sharedMemPerMultiprocessor <= 0 ||
|
| 1103 |
+
properties->numSms <= 0) {
|
| 1104 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 1105 |
+
}
|
| 1106 |
+
|
| 1107 |
+
return CUDA_OCC_SUCCESS;
|
| 1108 |
+
}
|
| 1109 |
+
|
| 1110 |
+
static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes)
|
| 1111 |
+
{
|
| 1112 |
+
// Verify function attributes
|
| 1113 |
+
//
|
| 1114 |
+
if (attributes->maxThreadsPerBlock <= 0 ||
|
| 1115 |
+
attributes->numRegs < 0) { // Compiler may choose not to use
|
| 1116 |
+
// any register (empty kernels,
|
| 1117 |
+
// etc.)
|
| 1118 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 1119 |
+
}
|
| 1120 |
+
|
| 1121 |
+
return CUDA_OCC_SUCCESS;
|
| 1122 |
+
}
|
| 1123 |
+
|
| 1124 |
+
static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state)
|
| 1125 |
+
{
|
| 1126 |
+
(void)state; // silence unused-variable warning
|
| 1127 |
+
// Placeholder
|
| 1128 |
+
//
|
| 1129 |
+
|
| 1130 |
+
return CUDA_OCC_SUCCESS;
|
| 1131 |
+
}
|
| 1132 |
+
|
| 1133 |
+
static __OCC_INLINE cudaOccError cudaOccInputCheck(
|
| 1134 |
+
const cudaOccDeviceProp *properties,
|
| 1135 |
+
const cudaOccFuncAttributes *attributes,
|
| 1136 |
+
const cudaOccDeviceState *state)
|
| 1137 |
+
{
|
| 1138 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1139 |
+
|
| 1140 |
+
status = cudaOccDevicePropCheck(properties);
|
| 1141 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1142 |
+
return status;
|
| 1143 |
+
}
|
| 1144 |
+
|
| 1145 |
+
status = cudaOccFuncAttributesCheck(attributes);
|
| 1146 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1147 |
+
return status;
|
| 1148 |
+
}
|
| 1149 |
+
|
| 1150 |
+
status = cudaOccDeviceStateCheck(state);
|
| 1151 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1152 |
+
return status;
|
| 1153 |
+
}
|
| 1154 |
+
|
| 1155 |
+
return status;
|
| 1156 |
+
}
|
| 1157 |
+
|
| 1158 |
+
///////////////////////////////////////////////
|
| 1159 |
+
// Occupancy calculation Functions //
|
| 1160 |
+
///////////////////////////////////////////////
|
| 1161 |
+
|
| 1162 |
+
static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
|
| 1163 |
+
const cudaOccDeviceProp *properties,
|
| 1164 |
+
const cudaOccFuncAttributes *attributes)
|
| 1165 |
+
{
|
| 1166 |
+
cudaOccPartitionedGCSupport gcSupport;
|
| 1167 |
+
cudaOccPartitionedGCConfig gcConfig;
|
| 1168 |
+
|
| 1169 |
+
cudaOccPartitionedGlobalCachingModeSupport(&gcSupport, properties);
|
| 1170 |
+
|
| 1171 |
+
gcConfig = attributes->partitionedGCConfig;
|
| 1172 |
+
|
| 1173 |
+
if (gcSupport == PARTITIONED_GC_NOT_SUPPORTED) {
|
| 1174 |
+
gcConfig = PARTITIONED_GC_OFF;
|
| 1175 |
+
}
|
| 1176 |
+
|
| 1177 |
+
return gcConfig;
|
| 1178 |
+
}
|
| 1179 |
+
|
| 1180 |
+
// Warp limit
|
| 1181 |
+
//
|
| 1182 |
+
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit(
|
| 1183 |
+
int *limit,
|
| 1184 |
+
cudaOccPartitionedGCConfig gcConfig,
|
| 1185 |
+
const cudaOccDeviceProp *properties,
|
| 1186 |
+
const cudaOccFuncAttributes *attributes,
|
| 1187 |
+
int blockSize)
|
| 1188 |
+
{
|
| 1189 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1190 |
+
int maxWarpsPerSm;
|
| 1191 |
+
int warpsAllocatedPerCTA;
|
| 1192 |
+
int maxBlocks;
|
| 1193 |
+
(void)attributes; // silence unused-variable warning
|
| 1194 |
+
|
| 1195 |
+
if (blockSize > properties->maxThreadsPerBlock) {
|
| 1196 |
+
maxBlocks = 0;
|
| 1197 |
+
}
|
| 1198 |
+
else {
|
| 1199 |
+
maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize;
|
| 1200 |
+
warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
|
| 1201 |
+
maxBlocks = 0;
|
| 1202 |
+
|
| 1203 |
+
if (gcConfig != PARTITIONED_GC_OFF) {
|
| 1204 |
+
int maxBlocksPerSmPartition;
|
| 1205 |
+
int maxWarpsPerSmPartition;
|
| 1206 |
+
|
| 1207 |
+
// If partitioned global caching is on, then a CTA can only use a SM
|
| 1208 |
+
// partition (a half SM), and thus a half of the warp slots
|
| 1209 |
+
// available per SM
|
| 1210 |
+
//
|
| 1211 |
+
maxWarpsPerSmPartition = maxWarpsPerSm / 2;
|
| 1212 |
+
maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA;
|
| 1213 |
+
maxBlocks = maxBlocksPerSmPartition * 2;
|
| 1214 |
+
}
|
| 1215 |
+
// On hardware that supports partitioned global caching, each half SM is
|
| 1216 |
+
// guaranteed to support at least 32 warps (maximum number of warps of a
|
| 1217 |
+
// CTA), so caching will not cause 0 occupancy due to insufficient warp
|
| 1218 |
+
// allocation slots.
|
| 1219 |
+
//
|
| 1220 |
+
else {
|
| 1221 |
+
maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA;
|
| 1222 |
+
}
|
| 1223 |
+
}
|
| 1224 |
+
|
| 1225 |
+
*limit = maxBlocks;
|
| 1226 |
+
|
| 1227 |
+
return status;
|
| 1228 |
+
}
|
| 1229 |
+
|
| 1230 |
+
// Shared memory limit
|
| 1231 |
+
//
|
| 1232 |
+
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
|
| 1233 |
+
int *limit,
|
| 1234 |
+
cudaOccResult *result,
|
| 1235 |
+
const cudaOccDeviceProp *properties,
|
| 1236 |
+
const cudaOccFuncAttributes *attributes,
|
| 1237 |
+
const cudaOccDeviceState *state,
|
| 1238 |
+
int blockSize,
|
| 1239 |
+
size_t dynamicSmemSize)
|
| 1240 |
+
{
|
| 1241 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1242 |
+
int allocationGranularity;
|
| 1243 |
+
size_t userSmemPreference = 0;
|
| 1244 |
+
size_t totalSmemUsagePerCTA;
|
| 1245 |
+
size_t maxSmemUsagePerCTA;
|
| 1246 |
+
size_t smemAllocatedPerCTA;
|
| 1247 |
+
size_t staticSmemSize;
|
| 1248 |
+
size_t sharedMemPerMultiprocessor;
|
| 1249 |
+
size_t smemLimitPerCTA;
|
| 1250 |
+
int maxBlocks;
|
| 1251 |
+
int dynamicSmemSizeExceeded = 0;
|
| 1252 |
+
int totalSmemSizeExceeded = 0;
|
| 1253 |
+
(void)blockSize; // silence unused-variable warning
|
| 1254 |
+
|
| 1255 |
+
status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
|
| 1256 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1257 |
+
return status;
|
| 1258 |
+
}
|
| 1259 |
+
|
| 1260 |
+
// Obtain the user preferred shared memory size. This setting is ignored if
|
| 1261 |
+
// user requests more shared memory than preferred.
|
| 1262 |
+
//
|
| 1263 |
+
status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
|
| 1264 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1265 |
+
return status;
|
| 1266 |
+
}
|
| 1267 |
+
|
| 1268 |
+
staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;
|
| 1269 |
+
totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
|
| 1270 |
+
smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);
|
| 1271 |
+
|
| 1272 |
+
maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;
|
| 1273 |
+
|
| 1274 |
+
dynamicSmemSizeExceeded = 0;
|
| 1275 |
+
totalSmemSizeExceeded = 0;
|
| 1276 |
+
|
| 1277 |
+
// Obtain the user set maximum dynamic size if it exists
|
| 1278 |
+
// If so, the current launch dynamic shared memory must not
|
| 1279 |
+
// exceed the set limit
|
| 1280 |
+
if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
|
| 1281 |
+
dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
|
| 1282 |
+
dynamicSmemSizeExceeded = 1;
|
| 1283 |
+
}
|
| 1284 |
+
|
| 1285 |
+
status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
|
| 1286 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1287 |
+
return status;
|
| 1288 |
+
}
|
| 1289 |
+
|
| 1290 |
+
if (smemAllocatedPerCTA > smemLimitPerCTA) {
|
| 1291 |
+
totalSmemSizeExceeded = 1;
|
| 1292 |
+
}
|
| 1293 |
+
|
| 1294 |
+
if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
|
| 1295 |
+
maxBlocks = 0;
|
| 1296 |
+
}
|
| 1297 |
+
else {
|
| 1298 |
+
// User requested shared memory limit is used as long as it is greater
|
| 1299 |
+
// than the total shared memory used per CTA, i.e. as long as at least
|
| 1300 |
+
// one CTA can be launched.
|
| 1301 |
+
if (userSmemPreference >= smemAllocatedPerCTA) {
|
| 1302 |
+
sharedMemPerMultiprocessor = userSmemPreference;
|
| 1303 |
+
}
|
| 1304 |
+
else {
|
| 1305 |
+
// On Volta+, user requested shared memory will limit occupancy
|
| 1306 |
+
// if it's less than shared memory per CTA. Otherwise, the
|
| 1307 |
+
// maximum shared memory limit is used.
|
| 1308 |
+
if (properties->computeMajor >= 7) {
|
| 1309 |
+
sharedMemPerMultiprocessor = smemAllocatedPerCTA;
|
| 1310 |
+
status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);
|
| 1311 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1312 |
+
return status;
|
| 1313 |
+
}
|
| 1314 |
+
}
|
| 1315 |
+
else {
|
| 1316 |
+
sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
|
| 1317 |
+
}
|
| 1318 |
+
}
|
| 1319 |
+
|
| 1320 |
+
if (smemAllocatedPerCTA > 0) {
|
| 1321 |
+
maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
|
| 1322 |
+
}
|
| 1323 |
+
else {
|
| 1324 |
+
maxBlocks = INT_MAX;
|
| 1325 |
+
}
|
| 1326 |
+
}
|
| 1327 |
+
|
| 1328 |
+
result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;
|
| 1329 |
+
|
| 1330 |
+
*limit = maxBlocks;
|
| 1331 |
+
|
| 1332 |
+
return status;
|
| 1333 |
+
}
|
| 1334 |
+
|
| 1335 |
+
static __OCC_INLINE
|
| 1336 |
+
cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
|
| 1337 |
+
int *limit,
|
| 1338 |
+
cudaOccPartitionedGCConfig *gcConfig,
|
| 1339 |
+
cudaOccResult *result,
|
| 1340 |
+
const cudaOccDeviceProp *properties,
|
| 1341 |
+
const cudaOccFuncAttributes *attributes,
|
| 1342 |
+
int blockSize)
|
| 1343 |
+
{
|
| 1344 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1345 |
+
int allocationGranularity;
|
| 1346 |
+
int warpsAllocatedPerCTA;
|
| 1347 |
+
int regsAllocatedPerCTA;
|
| 1348 |
+
int regsAssumedPerCTA;
|
| 1349 |
+
int regsPerWarp;
|
| 1350 |
+
int regsAllocatedPerWarp;
|
| 1351 |
+
int numSubPartitions;
|
| 1352 |
+
int numRegsPerSubPartition;
|
| 1353 |
+
int numWarpsPerSubPartition;
|
| 1354 |
+
int numWarpsPerSM;
|
| 1355 |
+
int maxBlocks;
|
| 1356 |
+
int maxRegsPerThread;
|
| 1357 |
+
|
| 1358 |
+
status = cudaOccRegAllocationGranularity(
|
| 1359 |
+
&allocationGranularity,
|
| 1360 |
+
properties);
|
| 1361 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1362 |
+
return status;
|
| 1363 |
+
}
|
| 1364 |
+
|
| 1365 |
+
status = cudaOccRegAllocationMaxPerThread(
|
| 1366 |
+
&maxRegsPerThread,
|
| 1367 |
+
properties);
|
| 1368 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1369 |
+
return status;
|
| 1370 |
+
}
|
| 1371 |
+
|
| 1372 |
+
status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
|
| 1373 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1374 |
+
return status;
|
| 1375 |
+
}
|
| 1376 |
+
|
| 1377 |
+
warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
|
| 1378 |
+
|
| 1379 |
+
// GPUs of compute capability 2.x and higher allocate registers to warps
|
| 1380 |
+
//
|
| 1381 |
+
// Number of regs per warp is regs per thread x warp size, rounded up to
|
| 1382 |
+
// register allocation granularity
|
| 1383 |
+
//
|
| 1384 |
+
regsPerWarp = attributes->numRegs * properties->warpSize;
|
| 1385 |
+
regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
|
| 1386 |
+
regsAllocatedPerCTA = regsAllocatedPerWarp * warpsAllocatedPerCTA;
|
| 1387 |
+
|
| 1388 |
+
// Hardware verifies if a launch fits the per-CTA register limit. For
|
| 1389 |
+
// historical reasons, the verification logic assumes register
|
| 1390 |
+
// allocations are made to all partitions simultaneously. Therefore, to
|
| 1391 |
+
// simulate the hardware check, the warp allocation needs to be rounded
|
| 1392 |
+
// up to the number of partitions.
|
| 1393 |
+
//
|
| 1394 |
+
regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);
|
| 1395 |
+
|
| 1396 |
+
if (properties->regsPerBlock < regsAssumedPerCTA || // Hardware check
|
| 1397 |
+
properties->regsPerBlock < regsAllocatedPerCTA || // Software check
|
| 1398 |
+
attributes->numRegs > maxRegsPerThread) { // Per thread limit check
|
| 1399 |
+
maxBlocks = 0;
|
| 1400 |
+
}
|
| 1401 |
+
else {
|
| 1402 |
+
if (regsAllocatedPerWarp > 0) {
|
| 1403 |
+
// Registers are allocated in each sub-partition. The max number
|
| 1404 |
+
// of warps that can fit on an SM is equal to the max number of
|
| 1405 |
+
// warps per sub-partition x number of sub-partitions.
|
| 1406 |
+
//
|
| 1407 |
+
numRegsPerSubPartition = properties->regsPerMultiprocessor / numSubPartitions;
|
| 1408 |
+
numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;
|
| 1409 |
+
|
| 1410 |
+
maxBlocks = 0;
|
| 1411 |
+
|
| 1412 |
+
if (*gcConfig != PARTITIONED_GC_OFF) {
|
| 1413 |
+
int numSubPartitionsPerSmPartition;
|
| 1414 |
+
int numWarpsPerSmPartition;
|
| 1415 |
+
int maxBlocksPerSmPartition;
|
| 1416 |
+
|
| 1417 |
+
// If partitioned global caching is on, then a CTA can only
|
| 1418 |
+
// use a half SM, and thus a half of the registers available
|
| 1419 |
+
// per SM
|
| 1420 |
+
//
|
| 1421 |
+
numSubPartitionsPerSmPartition = numSubPartitions / 2;
|
| 1422 |
+
numWarpsPerSmPartition = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
|
| 1423 |
+
maxBlocksPerSmPartition = numWarpsPerSmPartition / warpsAllocatedPerCTA;
|
| 1424 |
+
maxBlocks = maxBlocksPerSmPartition * 2;
|
| 1425 |
+
}
|
| 1426 |
+
|
| 1427 |
+
// Try again if partitioned global caching is not enabled, or if
|
| 1428 |
+
// the CTA cannot fit on the SM with caching on (maxBlocks == 0). In the latter
|
| 1429 |
+
// case, the device will automatically turn off caching, except
|
| 1430 |
+
// if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate
|
| 1431 |
+
// occupancy and launch configuration.
|
| 1432 |
+
//
|
| 1433 |
+
if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
|
| 1434 |
+
// In case *gcConfig was PARTITIONED_GC_ON flip it OFF since
|
| 1435 |
+
// this is what it will be if we spread CTA across partitions.
|
| 1436 |
+
//
|
| 1437 |
+
*gcConfig = PARTITIONED_GC_OFF;
|
| 1438 |
+
numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
|
| 1439 |
+
maxBlocks = numWarpsPerSM / warpsAllocatedPerCTA;
|
| 1440 |
+
}
|
| 1441 |
+
}
|
| 1442 |
+
else {
|
| 1443 |
+
maxBlocks = INT_MAX;
|
| 1444 |
+
}
|
| 1445 |
+
}
|
| 1446 |
+
|
| 1447 |
+
|
| 1448 |
+
result->allocatedRegistersPerBlock = regsAllocatedPerCTA;
|
| 1449 |
+
|
| 1450 |
+
*limit = maxBlocks;
|
| 1451 |
+
|
| 1452 |
+
return status;
|
| 1453 |
+
}
|
| 1454 |
+
|
| 1455 |
+
// Barrier limit
|
| 1456 |
+
//
|
| 1457 |
+
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit(
|
| 1458 |
+
int *limit,
|
| 1459 |
+
int ctaLimitBlocks,
|
| 1460 |
+
const cudaOccFuncAttributes *attributes)
|
| 1461 |
+
{
|
| 1462 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1463 |
+
int numBarriersAvailable = ctaLimitBlocks * 2;
|
| 1464 |
+
int numBarriersUsed = attributes->numBlockBarriers;
|
| 1465 |
+
int maxBlocks = INT_MAX;
|
| 1466 |
+
|
| 1467 |
+
if (numBarriersUsed) {
|
| 1468 |
+
maxBlocks = numBarriersAvailable / numBarriersUsed;
|
| 1469 |
+
}
|
| 1470 |
+
|
| 1471 |
+
*limit = maxBlocks;
|
| 1472 |
+
|
| 1473 |
+
return status;
|
| 1474 |
+
}
|
| 1475 |
+
|
| 1476 |
+
///////////////////////////////////
|
| 1477 |
+
// API Implementations //
|
| 1478 |
+
///////////////////////////////////
|
| 1479 |
+
|
| 1480 |
+
static __OCC_INLINE
|
| 1481 |
+
cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
|
| 1482 |
+
cudaOccResult *result,
|
| 1483 |
+
const cudaOccDeviceProp *properties,
|
| 1484 |
+
const cudaOccFuncAttributes *attributes,
|
| 1485 |
+
const cudaOccDeviceState *state,
|
| 1486 |
+
int blockSize,
|
| 1487 |
+
size_t dynamicSmemSize)
|
| 1488 |
+
{
|
| 1489 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1490 |
+
int ctaLimitWarps = 0;
|
| 1491 |
+
int ctaLimitBlocks = 0;
|
| 1492 |
+
int ctaLimitSMem = 0;
|
| 1493 |
+
int ctaLimitRegs = 0;
|
| 1494 |
+
int ctaLimitBars = 0;
|
| 1495 |
+
int ctaLimit = 0;
|
| 1496 |
+
unsigned int limitingFactors = 0;
|
| 1497 |
+
|
| 1498 |
+
cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;
|
| 1499 |
+
|
| 1500 |
+
if (!result || !properties || !attributes || !state || blockSize <= 0) {
|
| 1501 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 1502 |
+
}
|
| 1503 |
+
|
| 1504 |
+
///////////////////////////
|
| 1505 |
+
// Check user input
|
| 1506 |
+
///////////////////////////
|
| 1507 |
+
|
| 1508 |
+
status = cudaOccInputCheck(properties, attributes, state);
|
| 1509 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1510 |
+
return status;
|
| 1511 |
+
}
|
| 1512 |
+
|
| 1513 |
+
///////////////////////////
|
| 1514 |
+
// Initialization
|
| 1515 |
+
///////////////////////////
|
| 1516 |
+
|
| 1517 |
+
gcConfig = cudaOccPartitionedGCExpected(properties, attributes);
|
| 1518 |
+
|
| 1519 |
+
///////////////////////////
|
| 1520 |
+
// Compute occupancy
|
| 1521 |
+
///////////////////////////
|
| 1522 |
+
|
| 1523 |
+
// Limits due to registers/SM
|
| 1524 |
+
// Also compute if partitioned global caching has to be turned off
|
| 1525 |
+
//
|
| 1526 |
+
status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);
|
| 1527 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1528 |
+
return status;
|
| 1529 |
+
}
|
| 1530 |
+
|
| 1531 |
+
// SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4.
|
| 1532 |
+
// As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x.
|
| 1533 |
+
// For forward compatibility within Pascal family, if a function cannot run on GP10x (maxBlock == 0),
|
| 1534 |
+
// we do not let it run on any Pascal processor, even though it may be able to run on GP100.
|
| 1535 |
+
// Therefore, we check the occupancy on GP10x when it can run on GP100
|
| 1536 |
+
//
|
| 1537 |
+
if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
|
| 1538 |
+
cudaOccDeviceProp propertiesGP10x;
|
| 1539 |
+
cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;
|
| 1540 |
+
int ctaLimitRegsGP10x = 0;
|
| 1541 |
+
|
| 1542 |
+
// Set up properties for GP10x
|
| 1543 |
+
memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
|
| 1544 |
+
propertiesGP10x.computeMinor = 1;
|
| 1545 |
+
|
| 1546 |
+
status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);
|
| 1547 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1548 |
+
return status;
|
| 1549 |
+
}
|
| 1550 |
+
|
| 1551 |
+
if (ctaLimitRegsGP10x == 0) {
|
| 1552 |
+
ctaLimitRegs = 0;
|
| 1553 |
+
}
|
| 1554 |
+
}
|
| 1555 |
+
|
| 1556 |
+
// Limits due to warps/SM
|
| 1557 |
+
//
|
| 1558 |
+
status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
|
| 1559 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1560 |
+
return status;
|
| 1561 |
+
}
|
| 1562 |
+
|
| 1563 |
+
// Limits due to blocks/SM
|
| 1564 |
+
//
|
| 1565 |
+
status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
|
| 1566 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1567 |
+
return status;
|
| 1568 |
+
}
|
| 1569 |
+
|
| 1570 |
+
// Limits due to shared memory/SM
|
| 1571 |
+
//
|
| 1572 |
+
status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
|
| 1573 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1574 |
+
return status;
|
| 1575 |
+
}
|
| 1576 |
+
|
| 1577 |
+
///////////////////////////
|
| 1578 |
+
// Overall occupancy
|
| 1579 |
+
///////////////////////////
|
| 1580 |
+
|
| 1581 |
+
// Overall limit is min() of limits due to above reasons
|
| 1582 |
+
//
|
| 1583 |
+
ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));
|
| 1584 |
+
|
| 1585 |
+
// Determine occupancy limiting factors
|
| 1586 |
+
//
|
| 1587 |
+
if (ctaLimit == ctaLimitWarps) {
|
| 1588 |
+
limitingFactors |= OCC_LIMIT_WARPS;
|
| 1589 |
+
}
|
| 1590 |
+
if (ctaLimit == ctaLimitRegs) {
|
| 1591 |
+
limitingFactors |= OCC_LIMIT_REGISTERS;
|
| 1592 |
+
}
|
| 1593 |
+
if (ctaLimit == ctaLimitSMem) {
|
| 1594 |
+
limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
|
| 1595 |
+
}
|
| 1596 |
+
if (ctaLimit == ctaLimitBlocks) {
|
| 1597 |
+
limitingFactors |= OCC_LIMIT_BLOCKS;
|
| 1598 |
+
}
|
| 1599 |
+
|
| 1600 |
+
// For Hopper onwards compute the limits to occupancy based on block barrier count
|
| 1601 |
+
//
|
| 1602 |
+
if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
|
| 1603 |
+
// Limits due to barrier/SM
|
| 1604 |
+
//
|
| 1605 |
+
status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, attributes);
|
| 1606 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1607 |
+
return status;
|
| 1608 |
+
}
|
| 1609 |
+
|
| 1610 |
+
// Recompute overall limit based on barrier/SM
|
| 1611 |
+
//
|
| 1612 |
+
ctaLimit = __occMin(ctaLimitBars, ctaLimit);
|
| 1613 |
+
|
| 1614 |
+
// Determine if this is occupancy limiting factor
|
| 1615 |
+
//
|
| 1616 |
+
if (ctaLimit == ctaLimitBars) {
|
| 1617 |
+
limitingFactors |= OCC_LIMIT_BARRIERS;
|
| 1618 |
+
}
|
| 1619 |
+
}
|
| 1620 |
+
else {
|
| 1621 |
+
ctaLimitBars = INT_MAX;
|
| 1622 |
+
}
|
| 1623 |
+
|
| 1624 |
+
// Fill in the return values
|
| 1625 |
+
//
|
| 1626 |
+
result->limitingFactors = limitingFactors;
|
| 1627 |
+
|
| 1628 |
+
result->blockLimitRegs = ctaLimitRegs;
|
| 1629 |
+
result->blockLimitSharedMem = ctaLimitSMem;
|
| 1630 |
+
result->blockLimitWarps = ctaLimitWarps;
|
| 1631 |
+
result->blockLimitBlocks = ctaLimitBlocks;
|
| 1632 |
+
result->blockLimitBarriers = ctaLimitBars;
|
| 1633 |
+
result->partitionedGCConfig = gcConfig;
|
| 1634 |
+
|
| 1635 |
+
// Final occupancy
|
| 1636 |
+
result->activeBlocksPerMultiprocessor = ctaLimit;
|
| 1637 |
+
|
| 1638 |
+
return CUDA_OCC_SUCCESS;
|
| 1639 |
+
}
|
| 1640 |
+
|
| 1641 |
+
static __OCC_INLINE
|
| 1642 |
+
cudaOccError cudaOccAvailableDynamicSMemPerBlock(
|
| 1643 |
+
size_t *bytesAvailable,
|
| 1644 |
+
const cudaOccDeviceProp *properties,
|
| 1645 |
+
const cudaOccFuncAttributes *attributes,
|
| 1646 |
+
const cudaOccDeviceState *state,
|
| 1647 |
+
int numBlocks,
|
| 1648 |
+
int blockSize)
|
| 1649 |
+
{
|
| 1650 |
+
int allocationGranularity;
|
| 1651 |
+
size_t smemLimitPerBlock;
|
| 1652 |
+
size_t smemAvailableForDynamic;
|
| 1653 |
+
size_t userSmemPreference = 0;
|
| 1654 |
+
size_t sharedMemPerMultiprocessor;
|
| 1655 |
+
cudaOccResult result;
|
| 1656 |
+
cudaOccError status = CUDA_OCC_SUCCESS;
|
| 1657 |
+
|
| 1658 |
+
if (numBlocks <= 0)
|
| 1659 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 1660 |
+
|
| 1661 |
+
// First compute occupancy of potential kernel launch.
|
| 1662 |
+
//
|
| 1663 |
+
status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0);
|
| 1664 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1665 |
+
return status;
|
| 1666 |
+
}
|
| 1667 |
+
// Check if occupancy is achievable given user requested number of blocks.
|
| 1668 |
+
//
|
| 1669 |
+
if (result.activeBlocksPerMultiprocessor < numBlocks) {
|
| 1670 |
+
return CUDA_OCC_ERROR_INVALID_INPUT;
|
| 1671 |
+
}
|
| 1672 |
+
|
| 1673 |
+
status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
|
| 1674 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1675 |
+
return status;
|
| 1676 |
+
}
|
| 1677 |
+
|
| 1678 |
+
// Return the per block shared memory limit based on function config.
|
| 1679 |
+
//
|
| 1680 |
+
status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor);
|
| 1681 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1682 |
+
return status;
|
| 1683 |
+
}
|
| 1684 |
+
|
| 1685 |
+
// If there is only a single block needed per SM, then the user preference can be ignored and the fully SW
|
| 1686 |
+
// limit is allowed to be used as shared memory otherwise if more than one block is needed, then the user
|
| 1687 |
+
// preference sets the total limit of available shared memory.
|
| 1688 |
+
//
|
| 1689 |
+
cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
|
| 1690 |
+
if (numBlocks == 1) {
|
| 1691 |
+
sharedMemPerMultiprocessor = smemLimitPerBlock;
|
| 1692 |
+
}
|
| 1693 |
+
else {
|
| 1694 |
+
if (!userSmemPreference) {
|
| 1695 |
+
userSmemPreference = 1 ;
|
| 1696 |
+
status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties);
|
| 1697 |
+
if (status != CUDA_OCC_SUCCESS) {
|
| 1698 |
+
return status;
|
| 1699 |
+
}
|
| 1700 |
+
}
|
| 1701 |
+
sharedMemPerMultiprocessor = userSmemPreference;
|
| 1702 |
+
}
|
| 1703 |
+
|
| 1704 |
+
// Compute total shared memory available per SM
|
| 1705 |
+
//
|
| 1706 |
+
smemAvailableForDynamic = sharedMemPerMultiprocessor / numBlocks;
|
| 1707 |
+
smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity;
|
| 1708 |
+
|
| 1709 |
+
// Cap shared memory
|
| 1710 |
+
//
|
| 1711 |
+
if (smemAvailableForDynamic > smemLimitPerBlock) {
|
| 1712 |
+
smemAvailableForDynamic = smemLimitPerBlock;
|
| 1713 |
+
}
|
| 1714 |
+
|
| 1715 |
+
// Now compute dynamic shared memory size
|
| 1716 |
+
smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes;
|
| 1717 |
+
|
| 1718 |
+
// Cap computed dynamic SM by user requested limit specified via cuFuncSetAttribute()
|
| 1719 |
+
//
|
| 1720 |
+
if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes)
|
| 1721 |
+
smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes;
|
| 1722 |
+
|
| 1723 |
+
*bytesAvailable = smemAvailableForDynamic;
|
| 1724 |
+
return CUDA_OCC_SUCCESS;
|
| 1725 |
+
}
|
| 1726 |
+
|
| 1727 |
+
// Search all warp-aligned block sizes and report, via *blockSize, the block
// size that maximizes per-SM occupancy in threads, and via *minGridSize the
// grid size needed to put that many blocks on every SM of the device.
//
// minGridSize                - out: numBlocks * numSms for the best block size
// blockSize                  - out: block size achieving maximum occupancy
// properties                 - device properties (must be non-NULL and valid)
// attributes                 - kernel function attributes
// state                      - device state (e.g. cache config)
// blockSizeToDynamicSMemSize - optional callback mapping a candidate block
//                              size to its dynamic shared memory usage; when
//                              non-NULL it overrides dynamicSMemSize
// dynamicSMemSize            - fixed per-block dynamic shared memory, used
//                              only when the callback is NULL
//
// Returns CUDA_OCC_SUCCESS, or the first error from input validation or the
// per-block-size occupancy query.
static __OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int                         *minGridSize,
    int                         *blockSize,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state,
    size_t                     (*blockSizeToDynamicSMemSize)(int),
    size_t                       dynamicSMemSize)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    cudaOccResult result;

    // Limits
    int occupancyLimit;
    int granularity;
    int blockSizeLimit;

    // Recorded maximum
    int maxBlockSize = 0;
    int numBlocks = 0;
    int maxOccupancy = 0;

    // Temporary
    int blockSizeToTryAligned;
    int blockSizeToTry;
    int blockSizeLimitAligned;
    int occupancyInBlocks;
    int occupancyInThreads;

    ///////////////////////////
    // Check user input
    ///////////////////////////

    if (!minGridSize || !blockSize || !properties || !attributes || !state) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    status = cudaOccInputCheck(properties, attributes, state);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    /////////////////////////////////////////////////////////////////////////////////
    // Try each block size, and pick the block size with maximum occupancy
    /////////////////////////////////////////////////////////////////////////////////

    // Occupancy is capped by the SM's thread limit; candidates step one warp
    // at a time.
    occupancyLimit = properties->maxThreadsPerMultiprocessor;
    granularity = properties->warpSize;

    blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
    blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);

    // Scan from the largest warp-aligned candidate downward.
    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
        // The rounded-up first candidate may exceed the true limit; clamp it.
        blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);

        // Ignore dynamicSMemSize if the user provides a mapping
        //
        if (blockSizeToDynamicSMemSize) {
            dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry);
        }

        status = cudaOccMaxActiveBlocksPerMultiprocessor(
            &result,
            properties,
            attributes,
            state,
            blockSizeToTry,
            dynamicSMemSize);

        if (status != CUDA_OCC_SUCCESS) {
            return status;
        }

        occupancyInBlocks = result.activeBlocksPerMultiprocessor;
        occupancyInThreads = blockSizeToTry * occupancyInBlocks;

        if (occupancyInThreads > maxOccupancy) {
            maxBlockSize = blockSizeToTry;
            numBlocks = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // Early out if we have reached the maximum
        //
        if (occupancyLimit == maxOccupancy) {
            break;
        }
    }

    ///////////////////////////
    // Return best available
    ///////////////////////////

    // Suggested min grid size to achieve a full machine launch
    //
    *minGridSize = numBlocks * properties->numSms;
    *blockSize = maxBlockSize;

    return status;
}
|
| 1828 |
+
|
| 1829 |
+
|
| 1830 |
+
#if defined(__cplusplus)
|
| 1831 |
+
|
| 1832 |
+
namespace {
|
| 1833 |
+
|
| 1834 |
+
// Convenience overload for the common fixed-size case: the kernel's dynamic
// shared memory usage does not depend on the block size, so no per-block-size
// callback is needed. Forwards to the general version with a NULL mapping.
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int                         *minGridSize,
    int                         *blockSize,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state,
    size_t                       dynamicSMemSize)
{
    return cudaOccMaxPotentialOccupancyBlockSize(minGridSize,
                                                 blockSize,
                                                 properties,
                                                 attributes,
                                                 state,
                                                 NULL,  // no block-size -> smem mapping
                                                 dynamicSMemSize);
}
|
| 1852 |
+
|
| 1853 |
+
// Same block-size search as cudaOccMaxPotentialOccupancyBlockSize, but the
// per-block-size dynamic shared memory usage is supplied by an arbitrary
// callable (functor or function object) rather than a C function pointer.
//
// UnaryFunction must be callable as size_t(int blockSize).
template <typename UnaryFunction>
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
    int                         *minGridSize,
    int                         *blockSize,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state,
    UnaryFunction                blockSizeToDynamicSMemSize)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    cudaOccResult result;

    // Limits
    int occupancyLimit;
    int granularity;
    int blockSizeLimit;

    // Recorded maximum
    int maxBlockSize = 0;
    int numBlocks = 0;
    int maxOccupancy = 0;

    // Temporary
    int blockSizeToTryAligned;
    int blockSizeToTry;
    int blockSizeLimitAligned;
    int occupancyInBlocks;
    int occupancyInThreads;
    size_t dynamicSMemSize;

    ///////////////////////////
    // Check user input
    ///////////////////////////

    if (!minGridSize || !blockSize || !properties || !attributes || !state) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    status = cudaOccInputCheck(properties, attributes, state);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    /////////////////////////////////////////////////////////////////////////////////
    // Try each block size, and pick the block size with maximum occupancy
    /////////////////////////////////////////////////////////////////////////////////

    occupancyLimit = properties->maxThreadsPerMultiprocessor;
    granularity = properties->warpSize;
    blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
    blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);

    // Scan candidates from the largest warp-aligned block size downward, one
    // warp at a time.
    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
        // The rounded-up first candidate may exceed the true limit; clamp it.
        blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);

        // Ask the caller how much dynamic shared memory this block size needs.
        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);

        status = cudaOccMaxActiveBlocksPerMultiprocessor(
            &result,
            properties,
            attributes,
            state,
            blockSizeToTry,
            dynamicSMemSize);

        if (status != CUDA_OCC_SUCCESS) {
            return status;
        }

        occupancyInBlocks = result.activeBlocksPerMultiprocessor;

        occupancyInThreads = blockSizeToTry * occupancyInBlocks;

        if (occupancyInThreads > maxOccupancy) {
            maxBlockSize = blockSizeToTry;
            numBlocks = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // Early out if we have reached the maximum
        //
        if (occupancyLimit == maxOccupancy) {
            break;
        }
    }

    ///////////////////////////
    // Return best available
    ///////////////////////////

    // Suggested min grid size to achieve a full machine launch
    //
    *minGridSize = numBlocks * properties->numSms;
    *blockSize = maxBlockSize;

    return status;
}
|
| 1951 |
+
|
| 1952 |
+
} // namespace anonymous
|
| 1953 |
+
|
| 1954 |
+
#endif /*__cplusplus */
|
| 1955 |
+
|
| 1956 |
+
#undef __OCC_INLINE
|
| 1957 |
+
|
| 1958 |
+
#endif /*__cuda_occupancy_h__*/
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDA_PIPELINE_H_
|
| 51 |
+
# define _CUDA_PIPELINE_H_
|
| 52 |
+
|
| 53 |
+
# include "cuda_pipeline_primitives.h"
|
| 54 |
+
|
| 55 |
+
# if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
|
| 56 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 57 |
+
-std=c++11 compiler option.
|
| 58 |
+
# endif
|
| 59 |
+
|
| 60 |
+
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
|
| 61 |
+
# include "cuda_awbarrier.h"
|
| 62 |
+
# endif
|
| 63 |
+
|
| 64 |
+
// Integration with libcu++'s cuda::barrier<cuda::thread_scope_block>.
|
| 65 |
+
|
| 66 |
+
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
|
| 67 |
+
# if defined(_LIBCUDACXX_CUDA_ABI_VERSION)
|
| 68 |
+
# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION
|
| 69 |
+
# else
|
| 70 |
+
# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4
|
| 71 |
+
# endif
|
| 72 |
+
|
| 73 |
+
# define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y
|
| 74 |
+
# define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y)
|
| 75 |
+
# define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION)
|
| 76 |
+
|
| 77 |
+
namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE {
|
| 78 |
+
struct __block_scope_barrier_base;
|
| 79 |
+
}}
|
| 80 |
+
|
| 81 |
+
# endif
|
| 82 |
+
|
| 83 |
+
_CUDA_PIPELINE_BEGIN_NAMESPACE
|
| 84 |
+
|
| 85 |
+
template<size_t N, typename T>
|
| 86 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 87 |
+
auto segment(T* ptr) -> T(*)[N];
|
| 88 |
+
|
| 89 |
+
// Per-thread asynchronous copy pipeline. Tracks how many batches of
// memcpy_async operations have been committed so that wait(batch) can be
// translated into the hardware's "wait on all but the newest N batches"
// primitive.
class pipeline {
public:
    // Non-copyable and non-movable: duplicating the batch counter would
    // desynchronize it from the underlying hardware pipeline state.
    pipeline(const pipeline&) = delete;
    pipeline(pipeline&&) = delete;
    pipeline& operator=(const pipeline&) = delete;
    pipeline& operator=(pipeline&&) = delete;

    _CUDA_PIPELINE_QUALIFIER pipeline();
    // Commit the pending async copies as one batch; returns the batch handle.
    _CUDA_PIPELINE_QUALIFIER size_t commit();
    // Commit the pending copies and block until that batch completes.
    _CUDA_PIPELINE_QUALIFIER void commit_and_wait();
    // Block until the batch identified by the given handle has completed.
    _CUDA_PIPELINE_QUALIFIER void wait(size_t batch);
    // Block until at most N committed batches are still outstanding.
    template<unsigned N>
    _CUDA_PIPELINE_QUALIFIER void wait_prior();

# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
    // Make an arrive-wait barrier observe completion of the pipeline's copies.
    _CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier);
    _CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier);
# endif

private:
    // Number of batches committed so far; commit() returns the
    // pre-increment value as the batch handle.
    size_t current_batch;
};
|
| 111 |
+
|
| 112 |
+
template<class T>
|
| 113 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 114 |
+
void memcpy_async(T& dst, const T& src, pipeline& pipe);
|
| 115 |
+
|
| 116 |
+
template<class T, size_t DstN, size_t SrcN>
|
| 117 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 118 |
+
void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe);
|
| 119 |
+
|
| 120 |
+
template<size_t N, typename T>
|
| 121 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 122 |
+
auto segment(T* ptr) -> T(*)[N]
|
| 123 |
+
{
|
| 124 |
+
return (T(*)[N])ptr;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
// Construct an empty pipeline: no batches have been committed yet.
_CUDA_PIPELINE_QUALIFIER
pipeline::pipeline()
    : current_batch(0)
{
}
|
| 132 |
+
|
| 133 |
+
// Commit all memcpy_async operations issued since the previous commit as a
// single batch, and return the handle (sequence number) of that batch for a
// later wait().
_CUDA_PIPELINE_QUALIFIER
size_t pipeline::commit()
{
    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
    // Post-increment: the returned handle names the batch just committed.
    return this->current_batch++;
}
|
| 139 |
+
|
| 140 |
+
// Commit the pending operations and block until every committed batch has
// completed (wait_prior<0> leaves zero batches outstanding).
_CUDA_PIPELINE_QUALIFIER
void pipeline::commit_and_wait()
{
    (void)pipeline::commit();
    pipeline::wait_prior<0>();
}
|
| 146 |
+
|
| 147 |
+
// Block until the batch identified by the given handle has completed.
_CUDA_PIPELINE_QUALIFIER
void pipeline::wait(size_t batch)
{
    // Number of batches committed after 'batch'; waiting until only that many
    // remain outstanding guarantees 'batch' itself is done. The saturating
    // subtraction keeps a not-yet-issued handle from underflowing.
    const size_t prior = this->current_batch > batch ? this->current_batch - batch : 0;

    // pipeline_wait_prior takes its count as a template (compile-time)
    // parameter, so the runtime value must be dispatched through a switch.
    // Values above 8 all map to the <8> instantiation.
    switch (prior) {
    case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); break;
    case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); break;
    case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); break;
    case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); break;
    case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); break;
    case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); break;
    case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); break;
    case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); break;
    default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); break;
    }
}
|
| 164 |
+
|
| 165 |
+
// Block until at most N committed batches remain outstanding (N == 0 waits
// for everything). Thin wrapper over the internal primitive.
template<unsigned N>
_CUDA_PIPELINE_QUALIFIER
void pipeline::wait_prior()
{
    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<N>();
}
|
| 171 |
+
|
| 172 |
+
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
|
| 173 |
+
// Make an arrive-wait barrier observe completion of this pipeline's copies,
// by handing the internal primitive the barrier's underlying state word.
_CUDA_PIPELINE_QUALIFIER
void pipeline::arrive_on(awbarrier& barrier)
{
    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(&barrier.barrier);
}
|
| 178 |
+
|
| 179 |
+
// libcu++ integration: same as the awbarrier overload, but for
// cuda::barrier<cuda::thread_scope_block>. The cast assumes the barrier
// object begins with its 64-bit hardware state word (tied to the ABI version
// assumed at the top of this header).
_CUDA_PIPELINE_QUALIFIER
void pipeline::arrive_on(cuda::__block_scope_barrier_base & barrier)
{
    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(reinterpret_cast<uint64_t *>(&barrier));
}
|
| 184 |
+
# endif
|
| 185 |
+
|
| 186 |
+
// Asynchronously copy one object of type T into dst as part of the
// pipeline's current (uncommitted) batch.
//
// Trivially-copyable types go through the hardware-relaxed copy path; any
// other type falls back to a plain synchronous assignment. Note the pipe
// parameter is not referenced here; batching is tracked by commit()/wait().
template<class T>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T& dst, const T& src, pipeline& pipe)
{
    // Both endpoints must be naturally aligned for T.
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&src) & (alignof(T) - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&dst) & (alignof(T) - 1)));

    if (__is_trivially_copyable(T)) {
        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed<sizeof(T), alignof(T)>(
            reinterpret_cast<void*>(&dst), reinterpret_cast<const void*>(&src));
    } else {
        dst = src;
    }
}
|
| 200 |
+
|
| 201 |
+
// Asynchronously copy a fixed-size array segment (see segment<N>()) into a
// possibly larger destination segment as part of the current batch.
//
// The strict copy path requires a 4-, 8-, or 16-byte transfer; when the
// source is smaller than the destination, the hardware path zero-fills the
// tail, and the fallback loop mirrors that by value-initializing with T().
template<class T, size_t DstN, size_t SrcN>
_CUDA_PIPELINE_QUALIFIER
void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe)
{
    constexpr size_t dst_size = sizeof(*dst);
    constexpr size_t src_size = sizeof(*src);
    static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size.");
    static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size.");
    // Both pointers must be aligned to the full destination transfer size.
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (dst_size - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (dst_size - 1)));

    if (__is_trivially_copyable(T)) {
        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict<sizeof(*dst), sizeof(*src)>(
            reinterpret_cast<void*>(*dst), reinterpret_cast<const void*>(*src));
    } else {
        // Fallback: element-wise copy, value-initializing the tail beyond SrcN.
        for (size_t i = 0; i < DstN; ++i) {
            (*dst)[i] = (i < SrcN) ? (*src)[i] : T();
        }
    }
}
|
| 221 |
+
|
| 222 |
+
_CUDA_PIPELINE_END_NAMESPACE
|
| 223 |
+
|
| 224 |
+
#endif /* !_CUDA_PIPELINE_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_TEXTURE_TYPES_H__)
|
| 51 |
+
#define __CUDA_TEXTURE_TYPES_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 54 |
+
|
| 55 |
+
/*******************************************************************************
|
| 56 |
+
* *
|
| 57 |
+
* *
|
| 58 |
+
* *
|
| 59 |
+
*******************************************************************************/
|
| 60 |
+
|
| 61 |
+
#if !defined(__CUDACC_RTC__)
|
| 62 |
+
#define EXCLUDE_FROM_RTC
|
| 63 |
+
#include "channel_descriptor.h"
|
| 64 |
+
#undef EXCLUDE_FROM_RTC
|
| 65 |
+
#endif /* !__CUDACC_RTC__ */
|
| 66 |
+
#include "cuda_runtime_api.h"
|
| 67 |
+
|
| 68 |
+
/*******************************************************************************
|
| 69 |
+
* *
|
| 70 |
+
* *
|
| 71 |
+
* *
|
| 72 |
+
*******************************************************************************/
|
| 73 |
+
|
| 74 |
+
// Legacy texture reference type. Inherits the raw reference state from
// textureReference; the template parameters carry the element type, texture
// dimensionality, and read mode for the compiler's texture-fetch lowering.
template<class T, int texType = cudaTextureType1D, enum cudaTextureReadMode mode = cudaReadModeElementType>
struct __device_builtin_texture_type__ texture : public textureReference
{
#if !defined(__CUDACC_RTC__)
    // Host-side constructor: derives the channel descriptor from T and
    // applies the same addressing mode to all three dimensions.
    __host__ texture(int norm = 0,
                     enum cudaTextureFilterMode fMode = cudaFilterModePoint,
                     enum cudaTextureAddressMode aMode = cudaAddressModeClamp)
    {
        normalized = norm;
        filterMode = fMode;
        addressMode[0] = aMode;
        addressMode[1] = aMode;
        addressMode[2] = aMode;
        channelDesc = cudaCreateChannelDesc<T>();
        sRGB = 0;
    }

    // Host-side constructor with an explicit, caller-supplied channel
    // descriptor (overrides the one implied by T).
    __host__ texture(int norm,
                     enum cudaTextureFilterMode fMode,
                     enum cudaTextureAddressMode aMode,
                     struct cudaChannelFormatDesc desc)
    {
        normalized = norm;
        filterMode = fMode;
        addressMode[0] = aMode;
        addressMode[1] = aMode;
        addressMode[2] = aMode;
        channelDesc = desc;
        sRGB = 0;
    }
#endif /* !__CUDACC_RTC__ */
};
|
| 106 |
+
|
| 107 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 108 |
+
|
| 109 |
+
#endif /* !__CUDA_TEXTURE_TYPES_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
|
| 51 |
+
#define __DEVICE_ATOMIC_FUNCTIONS_HPP__
|
| 52 |
+
|
| 53 |
+
#if defined(__CUDACC_RTC__)
|
| 54 |
+
#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__
|
| 55 |
+
#else /* __CUDACC_RTC__ */
|
| 56 |
+
#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
|
| 57 |
+
#endif /* __CUDACC_RTC__ */
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 60 |
+
|
| 61 |
+
/*******************************************************************************
|
| 62 |
+
* *
|
| 63 |
+
* *
|
| 64 |
+
* *
|
| 65 |
+
*******************************************************************************/
|
| 66 |
+
|
| 67 |
+
#include "cuda_runtime_api.h"
|
| 68 |
+
|
| 69 |
+
/*******************************************************************************
|
| 70 |
+
* *
|
| 71 |
+
* *
|
| 72 |
+
* *
|
| 73 |
+
*******************************************************************************/
|
| 74 |
+
|
| 75 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val)
|
| 76 |
+
{
|
| 77 |
+
return __iAtomicAdd(address, val);
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val)
|
| 81 |
+
{
|
| 82 |
+
return __uAtomicAdd(address, val);
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val)
|
| 86 |
+
{
|
| 87 |
+
return __iAtomicAdd(address, (unsigned int)-(int)val);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val)
|
| 91 |
+
{
|
| 92 |
+
return __uAtomicAdd(address, (unsigned int)-(int)val);
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val)
|
| 96 |
+
{
|
| 97 |
+
return __iAtomicExch(address, val);
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val)
|
| 101 |
+
{
|
| 102 |
+
return __uAtomicExch(address, val);
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val)
|
| 106 |
+
{
|
| 107 |
+
return __fAtomicExch(address, val);
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val)
|
| 111 |
+
{
|
| 112 |
+
return __iAtomicMin(address, val);
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val)
|
| 116 |
+
{
|
| 117 |
+
return __uAtomicMin(address, val);
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val)
|
| 121 |
+
{
|
| 122 |
+
return __iAtomicMax(address, val);
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val)
|
| 126 |
+
{
|
| 127 |
+
return __uAtomicMax(address, val);
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val)
|
| 131 |
+
{
|
| 132 |
+
return __uAtomicInc(address, val);
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val)
|
| 136 |
+
{
|
| 137 |
+
return __uAtomicDec(address, val);
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val)
|
| 141 |
+
{
|
| 142 |
+
return __iAtomicAnd(address, val);
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val)
|
| 146 |
+
{
|
| 147 |
+
return __uAtomicAnd(address, val);
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val)
|
| 151 |
+
{
|
| 152 |
+
return __iAtomicOr(address, val);
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val)
|
| 156 |
+
{
|
| 157 |
+
return __uAtomicOr(address, val);
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val)
|
| 161 |
+
{
|
| 162 |
+
return __iAtomicXor(address, val);
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val)
|
| 166 |
+
{
|
| 167 |
+
return __uAtomicXor(address, val);
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val)
|
| 171 |
+
{
|
| 172 |
+
return __iAtomicCAS(address, compare, val);
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
|
| 176 |
+
{
|
| 177 |
+
return __uAtomicCAS(address, compare, val);
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
/*******************************************************************************
|
| 181 |
+
* *
|
| 182 |
+
* *
|
| 183 |
+
* *
|
| 184 |
+
*******************************************************************************/
|
| 185 |
+
|
| 186 |
+
#include "cuda_runtime_api.h"
|
| 187 |
+
|
| 188 |
+
/*******************************************************************************
|
| 189 |
+
* *
|
| 190 |
+
* *
|
| 191 |
+
* *
|
| 192 |
+
*******************************************************************************/
|
| 193 |
+
|
| 194 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val)
|
| 195 |
+
{
|
| 196 |
+
return __ullAtomicAdd(address, val);
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val)
|
| 200 |
+
{
|
| 201 |
+
return __ullAtomicExch(address, val);
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
|
| 205 |
+
{
|
| 206 |
+
return __ullAtomicCAS(address, compare, val);
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond)
|
| 210 |
+
{
|
| 211 |
+
return (bool)__any((int)cond);
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond)
|
| 215 |
+
{
|
| 216 |
+
return (bool)__all((int)cond);
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 220 |
+
|
| 221 |
+
#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
|
| 222 |
+
|
| 223 |
+
#endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */
|
| 224 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("device_double_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "device_double_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#include "crt/device_double_functions.h"
|
| 61 |
+
|
| 62 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__)
|
| 63 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 64 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
|
| 65 |
+
#endif
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__DEVICE_LAUNCH_PARAMETERS_H__)
|
| 51 |
+
#define __DEVICE_LAUNCH_PARAMETERS_H__
|
| 52 |
+
|
| 53 |
+
#include "vector_types.h"
|
| 54 |
+
|
| 55 |
+
#if !defined(__STORAGE__)
|
| 56 |
+
|
| 57 |
+
#if defined(__CUDACC_RTC__)
|
| 58 |
+
#define __STORAGE__ \
|
| 59 |
+
extern const __device__
|
| 60 |
+
#else /* !__CUDACC_RTC__ */
|
| 61 |
+
#define __STORAGE__ \
|
| 62 |
+
extern const
|
| 63 |
+
#endif /* __CUDACC_RTC__ */
|
| 64 |
+
|
| 65 |
+
#endif /* __STORAGE__ */
|
| 66 |
+
|
| 67 |
+
#if defined(__cplusplus)
|
| 68 |
+
extern "C" {
|
| 69 |
+
#endif /* __cplusplus */
|
| 70 |
+
|
| 71 |
+
uint3 __device_builtin__ __STORAGE__ threadIdx;
|
| 72 |
+
uint3 __device_builtin__ __STORAGE__ blockIdx;
|
| 73 |
+
dim3 __device_builtin__ __STORAGE__ blockDim;
|
| 74 |
+
dim3 __device_builtin__ __STORAGE__ gridDim;
|
| 75 |
+
int __device_builtin__ __STORAGE__ warpSize;
|
| 76 |
+
|
| 77 |
+
#undef __STORAGE__
|
| 78 |
+
|
| 79 |
+
#if defined(__cplusplus)
|
| 80 |
+
}
|
| 81 |
+
#endif /* __cplusplus */
|
| 82 |
+
|
| 83 |
+
#if !defined(__cudaGet_threadIdx)
|
| 84 |
+
|
| 85 |
+
#define __cudaGet_threadIdx() \
|
| 86 |
+
threadIdx
|
| 87 |
+
|
| 88 |
+
#endif /* __cudaGet_threadIdx */
|
| 89 |
+
|
| 90 |
+
#if !defined(__cudaGet_blockIdx)
|
| 91 |
+
|
| 92 |
+
#define __cudaGet_blockIdx() \
|
| 93 |
+
blockIdx
|
| 94 |
+
|
| 95 |
+
#endif /* __cudaGet_blockIdx */
|
| 96 |
+
|
| 97 |
+
#if !defined(__cudaGet_blockDim)
|
| 98 |
+
|
| 99 |
+
#define __cudaGet_blockDim() \
|
| 100 |
+
blockDim
|
| 101 |
+
|
| 102 |
+
#endif /* __cudaGet_blockDim */
|
| 103 |
+
|
| 104 |
+
#if !defined(__cudaGet_gridDim)
|
| 105 |
+
|
| 106 |
+
#define __cudaGet_gridDim() \
|
| 107 |
+
gridDim
|
| 108 |
+
|
| 109 |
+
#endif /* __cudaGet_gridDim */
|
| 110 |
+
|
| 111 |
+
#if !defined(__cudaGet_warpSize)
|
| 112 |
+
|
| 113 |
+
#define __cudaGet_warpSize() \
|
| 114 |
+
warpSize
|
| 115 |
+
|
| 116 |
+
#endif /* __cudaGet_warpSize */
|
| 117 |
+
|
| 118 |
+
#endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__DRIVER_FUNCTIONS_H__)
|
| 51 |
+
#define __DRIVER_FUNCTIONS_H__
|
| 52 |
+
|
| 53 |
+
#include "builtin_types.h"
|
| 54 |
+
#include "crt/host_defines.h"
|
| 55 |
+
#include "driver_types.h"
|
| 56 |
+
|
| 57 |
+
/**
|
| 58 |
+
* \addtogroup CUDART_MEMORY
|
| 59 |
+
*
|
| 60 |
+
* @{
|
| 61 |
+
*/
|
| 62 |
+
|
| 63 |
+
/**
|
| 64 |
+
* \brief Returns a cudaPitchedPtr based on input parameters
|
| 65 |
+
*
|
| 66 |
+
* Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
|
| 67 |
+
* \p p, \p xsz, and \p ysz.
|
| 68 |
+
*
|
| 69 |
+
* \param d - Pointer to allocated memory
|
| 70 |
+
* \param p - Pitch of allocated memory in bytes
|
| 71 |
+
* \param xsz - Logical width of allocation in elements
|
| 72 |
+
* \param ysz - Logical height of allocation in elements
|
| 73 |
+
*
|
| 74 |
+
* \return
|
| 75 |
+
* ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
|
| 76 |
+
*
|
| 77 |
+
* \sa make_cudaExtent, make_cudaPos
|
| 78 |
+
*/
|
| 79 |
+
static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
|
| 80 |
+
{
|
| 81 |
+
struct cudaPitchedPtr s;
|
| 82 |
+
|
| 83 |
+
s.ptr = d;
|
| 84 |
+
s.pitch = p;
|
| 85 |
+
s.xsize = xsz;
|
| 86 |
+
s.ysize = ysz;
|
| 87 |
+
|
| 88 |
+
return s;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
/**
|
| 92 |
+
* \brief Returns a cudaPos based on input parameters
|
| 93 |
+
*
|
| 94 |
+
* Returns a ::cudaPos based on the specified input parameters \p x,
|
| 95 |
+
* \p y, and \p z.
|
| 96 |
+
*
|
| 97 |
+
* \param x - X position
|
| 98 |
+
* \param y - Y position
|
| 99 |
+
* \param z - Z position
|
| 100 |
+
*
|
| 101 |
+
* \return
|
| 102 |
+
* ::cudaPos specified by \p x, \p y, and \p z
|
| 103 |
+
*
|
| 104 |
+
* \sa make_cudaExtent, make_cudaPitchedPtr
|
| 105 |
+
*/
|
| 106 |
+
static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
|
| 107 |
+
{
|
| 108 |
+
struct cudaPos p;
|
| 109 |
+
|
| 110 |
+
p.x = x;
|
| 111 |
+
p.y = y;
|
| 112 |
+
p.z = z;
|
| 113 |
+
|
| 114 |
+
return p;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
/**
|
| 118 |
+
* \brief Returns a cudaExtent based on input parameters
|
| 119 |
+
*
|
| 120 |
+
* Returns a ::cudaExtent based on the specified input parameters \p w,
|
| 121 |
+
* \p h, and \p d.
|
| 122 |
+
*
|
| 123 |
+
* \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
|
| 124 |
+
* \param h - Height in elements
|
| 125 |
+
* \param d - Depth in elements
|
| 126 |
+
*
|
| 127 |
+
* \return
|
| 128 |
+
* ::cudaExtent specified by \p w, \p h, and \p d
|
| 129 |
+
*
|
| 130 |
+
* \sa make_cudaPitchedPtr, make_cudaPos
|
| 131 |
+
*/
|
| 132 |
+
static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
|
| 133 |
+
{
|
| 134 |
+
struct cudaExtent e;
|
| 135 |
+
|
| 136 |
+
e.width = w;
|
| 137 |
+
e.height = h;
|
| 138 |
+
e.depth = d;
|
| 139 |
+
|
| 140 |
+
return e;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
/** @} */ /* END CUDART_MEMORY */
|
| 144 |
+
|
| 145 |
+
#endif /* !__DRIVER_FUNCTIONS_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_types.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_config.h
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("host_config.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "host_config.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#include "crt/host_config.h"
|
| 61 |
+
|
| 62 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__)
|
| 63 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 64 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
|
| 65 |
+
#endif
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/library_types.h
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__LIBRARY_TYPES_H__)
|
| 51 |
+
#define __LIBRARY_TYPES_H__
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
typedef enum cudaDataType_t
|
| 56 |
+
{
|
| 57 |
+
CUDA_R_16F = 2, /* real as a half */
|
| 58 |
+
CUDA_C_16F = 6, /* complex as a pair of half numbers */
|
| 59 |
+
CUDA_R_16BF = 14, /* real as a nv_bfloat16 */
|
| 60 |
+
CUDA_C_16BF = 15, /* complex as a pair of nv_bfloat16 numbers */
|
| 61 |
+
CUDA_R_32F = 0, /* real as a float */
|
| 62 |
+
CUDA_C_32F = 4, /* complex as a pair of float numbers */
|
| 63 |
+
CUDA_R_64F = 1, /* real as a double */
|
| 64 |
+
CUDA_C_64F = 5, /* complex as a pair of double numbers */
|
| 65 |
+
CUDA_R_4I = 16, /* real as a signed 4-bit int */
|
| 66 |
+
CUDA_C_4I = 17, /* complex as a pair of signed 4-bit int numbers */
|
| 67 |
+
CUDA_R_4U = 18, /* real as a unsigned 4-bit int */
|
| 68 |
+
CUDA_C_4U = 19, /* complex as a pair of unsigned 4-bit int numbers */
|
| 69 |
+
CUDA_R_8I = 3, /* real as a signed 8-bit int */
|
| 70 |
+
CUDA_C_8I = 7, /* complex as a pair of signed 8-bit int numbers */
|
| 71 |
+
CUDA_R_8U = 8, /* real as a unsigned 8-bit int */
|
| 72 |
+
CUDA_C_8U = 9, /* complex as a pair of unsigned 8-bit int numbers */
|
| 73 |
+
CUDA_R_16I = 20, /* real as a signed 16-bit int */
|
| 74 |
+
CUDA_C_16I = 21, /* complex as a pair of signed 16-bit int numbers */
|
| 75 |
+
CUDA_R_16U = 22, /* real as a unsigned 16-bit int */
|
| 76 |
+
CUDA_C_16U = 23, /* complex as a pair of unsigned 16-bit int numbers */
|
| 77 |
+
CUDA_R_32I = 10, /* real as a signed 32-bit int */
|
| 78 |
+
CUDA_C_32I = 11, /* complex as a pair of signed 32-bit int numbers */
|
| 79 |
+
CUDA_R_32U = 12, /* real as a unsigned 32-bit int */
|
| 80 |
+
CUDA_C_32U = 13, /* complex as a pair of unsigned 32-bit int numbers */
|
| 81 |
+
CUDA_R_64I = 24, /* real as a signed 64-bit int */
|
| 82 |
+
CUDA_C_64I = 25, /* complex as a pair of signed 64-bit int numbers */
|
| 83 |
+
CUDA_R_64U = 26, /* real as a unsigned 64-bit int */
|
| 84 |
+
CUDA_C_64U = 27, /* complex as a pair of unsigned 64-bit int numbers */
|
| 85 |
+
CUDA_R_8F_E4M3 = 28, /* real as a nv_fp8_e4m3 */
|
| 86 |
+
CUDA_R_8F_E5M2 = 29, /* real as a nv_fp8_e5m2 */
|
| 87 |
+
} cudaDataType;
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
typedef enum libraryPropertyType_t
|
| 91 |
+
{
|
| 92 |
+
MAJOR_VERSION,
|
| 93 |
+
MINOR_VERSION,
|
| 94 |
+
PATCH_LEVEL
|
| 95 |
+
} libraryPropertyType;
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
#ifndef __cplusplus
|
| 99 |
+
typedef enum cudaDataType_t cudaDataType_t;
|
| 100 |
+
typedef enum libraryPropertyType_t libraryPropertyType_t;
|
| 101 |
+
#endif
|
| 102 |
+
|
| 103 |
+
#endif /* !__LIBRARY_TYPES_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_functions.h
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("math_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "math_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#include "crt/math_functions.h"
|
| 61 |
+
|
| 62 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__)
|
| 63 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 64 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
|
| 65 |
+
#endif
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_32_atomic_functions.hpp
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.35.235 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SM_32_ATOMIC_FUNCTIONS_HPP__)
|
| 51 |
+
#define __SM_32_ATOMIC_FUNCTIONS_HPP__
|
| 52 |
+
|
| 53 |
+
#if defined(__CUDACC_RTC__)
|
| 54 |
+
#define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__
|
| 55 |
+
#else /* !__CUDACC_RTC__ */
|
| 56 |
+
#define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
|
| 57 |
+
#endif /* __CUDACC_RTC__ */
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 60 |
+
|
| 61 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
#include "cuda_runtime_api.h"
|
| 70 |
+
|
| 71 |
+
/*******************************************************************************
|
| 72 |
+
* *
|
| 73 |
+
* *
|
| 74 |
+
* *
|
| 75 |
+
*******************************************************************************/
|
| 76 |
+
|
| 77 |
+
__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val)
|
| 78 |
+
{
|
| 79 |
+
return __illAtomicMin(address, val);
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val)
|
| 83 |
+
{
|
| 84 |
+
return __illAtomicMax(address, val);
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val)
|
| 88 |
+
{
|
| 89 |
+
return __llAtomicAnd(address, val);
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val)
|
| 93 |
+
{
|
| 94 |
+
return __llAtomicOr(address, val);
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val)
|
| 98 |
+
{
|
| 99 |
+
return __llAtomicXor(address, val);
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val)
|
| 103 |
+
{
|
| 104 |
+
return __ullAtomicMin(address, val);
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val)
|
| 108 |
+
{
|
| 109 |
+
return __ullAtomicMax(address, val);
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val)
|
| 113 |
+
{
|
| 114 |
+
return __ullAtomicAnd(address, val);
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val)
|
| 118 |
+
{
|
| 119 |
+
return __ullAtomicOr(address, val);
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val)
|
| 123 |
+
{
|
| 124 |
+
return __ullAtomicXor(address, val);
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
|
| 128 |
+
|
| 129 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 130 |
+
|
| 131 |
+
#undef __SM_32_ATOMIC_FUNCTIONS_DECL__
|
| 132 |
+
|
| 133 |
+
#endif /* !__SM_32_ATOMIC_FUNCTIONS_HPP__ */
|
| 134 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_60_atomic_functions.hpp
ADDED
|
@@ -0,0 +1,527 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SM_60_ATOMIC_FUNCTIONS_HPP__)
|
| 51 |
+
#define __SM_60_ATOMIC_FUNCTIONS_HPP__
|
| 52 |
+
|
| 53 |
+
#if defined(__CUDACC_RTC__)
|
| 54 |
+
#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
|
| 55 |
+
#else /* __CUDACC_RTC__ */
|
| 56 |
+
#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
|
| 57 |
+
#endif /* __CUDACC_RTC__ */
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 60 |
+
|
| 61 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
#include "cuda_runtime_api.h"
|
| 70 |
+
|
| 71 |
+
/*******************************************************************************
|
| 72 |
+
* *
|
| 73 |
+
* *
|
| 74 |
+
* *
|
| 75 |
+
*******************************************************************************/
|
| 76 |
+
|
| 77 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val)
|
| 78 |
+
{
|
| 79 |
+
return __dAtomicAdd(address, val);
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 83 |
+
int atomicAdd_block(int *address, int val)
|
| 84 |
+
{
|
| 85 |
+
return __iAtomicAdd_block(address, val);
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 89 |
+
int atomicAdd_system(int *address, int val)
|
| 90 |
+
{
|
| 91 |
+
return __iAtomicAdd_system(address, val);
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 95 |
+
unsigned int atomicAdd_block(unsigned int *address, unsigned int val)
|
| 96 |
+
{
|
| 97 |
+
return __uAtomicAdd_block(address, val);
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 101 |
+
unsigned int atomicAdd_system(unsigned int *address, unsigned int val)
|
| 102 |
+
{
|
| 103 |
+
return __uAtomicAdd_system(address, val);
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 107 |
+
unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val)
|
| 108 |
+
{
|
| 109 |
+
return __ullAtomicAdd_block(address, val);
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 113 |
+
unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val)
|
| 114 |
+
{
|
| 115 |
+
return __ullAtomicAdd_system(address, val);
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 119 |
+
float atomicAdd_block(float *address, float val)
|
| 120 |
+
{
|
| 121 |
+
return __fAtomicAdd_block(address, val);
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 125 |
+
float atomicAdd_system(float *address, float val)
|
| 126 |
+
{
|
| 127 |
+
return __fAtomicAdd_system(address, val);
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 131 |
+
double atomicAdd_block(double *address, double val)
|
| 132 |
+
{
|
| 133 |
+
return __dAtomicAdd_block(address, val);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 137 |
+
double atomicAdd_system(double *address, double val)
|
| 138 |
+
{
|
| 139 |
+
return __dAtomicAdd_system(address, val);
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 143 |
+
int atomicSub_block(int *address, int val)
|
| 144 |
+
{
|
| 145 |
+
return __iAtomicAdd_block(address, (unsigned int)-(int)val);
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 149 |
+
int atomicSub_system(int *address, int val)
|
| 150 |
+
{
|
| 151 |
+
return __iAtomicAdd_system(address, (unsigned int)-(int)val);
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 155 |
+
unsigned int atomicSub_block(unsigned int *address, unsigned int val)
|
| 156 |
+
{
|
| 157 |
+
return __uAtomicAdd_block(address, (unsigned int)-(int)val);
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 161 |
+
unsigned int atomicSub_system(unsigned int *address, unsigned int val)
|
| 162 |
+
{
|
| 163 |
+
return __uAtomicAdd_system(address, (unsigned int)-(int)val);
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 167 |
+
int atomicExch_block(int *address, int val)
|
| 168 |
+
{
|
| 169 |
+
return __iAtomicExch_block(address, val);
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 173 |
+
int atomicExch_system(int *address, int val)
|
| 174 |
+
{
|
| 175 |
+
return __iAtomicExch_system(address, val);
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 179 |
+
unsigned int atomicExch_block(unsigned int *address, unsigned int val)
|
| 180 |
+
{
|
| 181 |
+
return __uAtomicExch_block(address, val);
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 185 |
+
unsigned int atomicExch_system(unsigned int *address, unsigned int val)
|
| 186 |
+
{
|
| 187 |
+
return __uAtomicExch_system(address, val);
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 191 |
+
unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val)
|
| 192 |
+
{
|
| 193 |
+
return __ullAtomicExch_block(address, val);
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 197 |
+
unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val)
|
| 198 |
+
{
|
| 199 |
+
return __ullAtomicExch_system(address, val);
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 203 |
+
float atomicExch_block(float *address, float val)
|
| 204 |
+
{
|
| 205 |
+
return __fAtomicExch_block(address, val);
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 209 |
+
float atomicExch_system(float *address, float val)
|
| 210 |
+
{
|
| 211 |
+
return __fAtomicExch_system(address, val);
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 215 |
+
int atomicMin_block(int *address, int val)
|
| 216 |
+
{
|
| 217 |
+
return __iAtomicMin_block(address, val);
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 221 |
+
int atomicMin_system(int *address, int val)
|
| 222 |
+
{
|
| 223 |
+
return __iAtomicMin_system(address, val);
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 227 |
+
long long atomicMin_block(long long *address, long long val)
|
| 228 |
+
{
|
| 229 |
+
return __illAtomicMin_block(address, val);
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 233 |
+
long long atomicMin_system(long long *address, long long val)
|
| 234 |
+
{
|
| 235 |
+
return __illAtomicMin_system(address, val);
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 239 |
+
unsigned int atomicMin_block(unsigned int *address, unsigned int val)
|
| 240 |
+
{
|
| 241 |
+
return __uAtomicMin_block(address, val);
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 245 |
+
unsigned int atomicMin_system(unsigned int *address, unsigned int val)
|
| 246 |
+
{
|
| 247 |
+
return __uAtomicMin_system(address, val);
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 251 |
+
unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val)
|
| 252 |
+
{
|
| 253 |
+
return __ullAtomicMin_block(address, val);
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 257 |
+
unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val)
|
| 258 |
+
{
|
| 259 |
+
return __ullAtomicMin_system(address, val);
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 263 |
+
int atomicMax_block(int *address, int val)
|
| 264 |
+
{
|
| 265 |
+
return __iAtomicMax_block(address, val);
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 269 |
+
int atomicMax_system(int *address, int val)
|
| 270 |
+
{
|
| 271 |
+
return __iAtomicMax_system(address, val);
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 275 |
+
long long atomicMax_block(long long *address, long long val)
|
| 276 |
+
{
|
| 277 |
+
return __illAtomicMax_block(address, val);
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 281 |
+
long long atomicMax_system(long long *address, long long val)
|
| 282 |
+
{
|
| 283 |
+
return __illAtomicMax_system(address, val);
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 287 |
+
unsigned int atomicMax_block(unsigned int *address, unsigned int val)
|
| 288 |
+
{
|
| 289 |
+
return __uAtomicMax_block(address, val);
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 293 |
+
unsigned int atomicMax_system(unsigned int *address, unsigned int val)
|
| 294 |
+
{
|
| 295 |
+
return __uAtomicMax_system(address, val);
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 299 |
+
unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val)
|
| 300 |
+
{
|
| 301 |
+
return __ullAtomicMax_block(address, val);
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 305 |
+
unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val)
|
| 306 |
+
{
|
| 307 |
+
return __ullAtomicMax_system(address, val);
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 311 |
+
unsigned int atomicInc_block(unsigned int *address, unsigned int val)
|
| 312 |
+
{
|
| 313 |
+
return __uAtomicInc_block(address, val);
|
| 314 |
+
}
|
| 315 |
+
|
| 316 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 317 |
+
unsigned int atomicInc_system(unsigned int *address, unsigned int val)
|
| 318 |
+
{
|
| 319 |
+
return __uAtomicInc_system(address, val);
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 323 |
+
unsigned int atomicDec_block(unsigned int *address, unsigned int val)
|
| 324 |
+
{
|
| 325 |
+
return __uAtomicDec_block(address, val);
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 329 |
+
unsigned int atomicDec_system(unsigned int *address, unsigned int val)
|
| 330 |
+
{
|
| 331 |
+
return __uAtomicDec_system(address, val);
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 335 |
+
int atomicCAS_block(int *address, int compare, int val)
|
| 336 |
+
{
|
| 337 |
+
return __iAtomicCAS_block(address, compare, val);
|
| 338 |
+
}
|
| 339 |
+
|
| 340 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 341 |
+
int atomicCAS_system(int *address, int compare, int val)
|
| 342 |
+
{
|
| 343 |
+
return __iAtomicCAS_system(address, compare, val);
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 347 |
+
unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
|
| 348 |
+
unsigned int val)
|
| 349 |
+
{
|
| 350 |
+
return __uAtomicCAS_block(address, compare, val);
|
| 351 |
+
}
|
| 352 |
+
|
| 353 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 354 |
+
unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
|
| 355 |
+
unsigned int val)
|
| 356 |
+
{
|
| 357 |
+
return __uAtomicCAS_system(address, compare, val);
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 361 |
+
unsigned long long int atomicCAS_block(unsigned long long int *address,
|
| 362 |
+
unsigned long long int compare,
|
| 363 |
+
unsigned long long int val)
|
| 364 |
+
{
|
| 365 |
+
return __ullAtomicCAS_block(address, compare, val);
|
| 366 |
+
}
|
| 367 |
+
|
| 368 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 369 |
+
unsigned long long int atomicCAS_system(unsigned long long int *address,
|
| 370 |
+
unsigned long long int compare,
|
| 371 |
+
unsigned long long int val)
|
| 372 |
+
{
|
| 373 |
+
return __ullAtomicCAS_system(address, compare, val);
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 377 |
+
int atomicAnd_block(int *address, int val)
|
| 378 |
+
{
|
| 379 |
+
return __iAtomicAnd_block(address, val);
|
| 380 |
+
}
|
| 381 |
+
|
| 382 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 383 |
+
int atomicAnd_system(int *address, int val)
|
| 384 |
+
{
|
| 385 |
+
return __iAtomicAnd_system(address, val);
|
| 386 |
+
}
|
| 387 |
+
|
| 388 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 389 |
+
long long atomicAnd_block(long long *address, long long val)
|
| 390 |
+
{
|
| 391 |
+
return __llAtomicAnd_block(address, val);
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 395 |
+
long long atomicAnd_system(long long *address, long long val)
|
| 396 |
+
{
|
| 397 |
+
return __llAtomicAnd_system(address, val);
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 401 |
+
unsigned int atomicAnd_block(unsigned int *address, unsigned int val)
|
| 402 |
+
{
|
| 403 |
+
return __uAtomicAnd_block(address, val);
|
| 404 |
+
}
|
| 405 |
+
|
| 406 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 407 |
+
unsigned int atomicAnd_system(unsigned int *address, unsigned int val)
|
| 408 |
+
{
|
| 409 |
+
return __uAtomicAnd_system(address, val);
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 413 |
+
unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val)
|
| 414 |
+
{
|
| 415 |
+
return __ullAtomicAnd_block(address, val);
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 419 |
+
unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val)
|
| 420 |
+
{
|
| 421 |
+
return __ullAtomicAnd_system(address, val);
|
| 422 |
+
}
|
| 423 |
+
|
| 424 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 425 |
+
int atomicOr_block(int *address, int val)
|
| 426 |
+
{
|
| 427 |
+
return __iAtomicOr_block(address, val);
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 431 |
+
int atomicOr_system(int *address, int val)
|
| 432 |
+
{
|
| 433 |
+
return __iAtomicOr_system(address, val);
|
| 434 |
+
}
|
| 435 |
+
|
| 436 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 437 |
+
long long atomicOr_block(long long *address, long long val)
|
| 438 |
+
{
|
| 439 |
+
return __llAtomicOr_block(address, val);
|
| 440 |
+
}
|
| 441 |
+
|
| 442 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 443 |
+
long long atomicOr_system(long long *address, long long val)
|
| 444 |
+
{
|
| 445 |
+
return __llAtomicOr_system(address, val);
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 449 |
+
unsigned int atomicOr_block(unsigned int *address, unsigned int val)
|
| 450 |
+
{
|
| 451 |
+
return __uAtomicOr_block(address, val);
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 455 |
+
unsigned int atomicOr_system(unsigned int *address, unsigned int val)
|
| 456 |
+
{
|
| 457 |
+
return __uAtomicOr_system(address, val);
|
| 458 |
+
}
|
| 459 |
+
|
| 460 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 461 |
+
unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val)
|
| 462 |
+
{
|
| 463 |
+
return __ullAtomicOr_block(address, val);
|
| 464 |
+
}
|
| 465 |
+
|
| 466 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 467 |
+
unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val)
|
| 468 |
+
{
|
| 469 |
+
return __ullAtomicOr_system(address, val);
|
| 470 |
+
}
|
| 471 |
+
|
| 472 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 473 |
+
int atomicXor_block(int *address, int val)
|
| 474 |
+
{
|
| 475 |
+
return __iAtomicXor_block(address, val);
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 479 |
+
int atomicXor_system(int *address, int val)
|
| 480 |
+
{
|
| 481 |
+
return __iAtomicXor_system(address, val);
|
| 482 |
+
}
|
| 483 |
+
|
| 484 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 485 |
+
long long atomicXor_block(long long *address, long long val)
|
| 486 |
+
{
|
| 487 |
+
return __llAtomicXor_block(address, val);
|
| 488 |
+
}
|
| 489 |
+
|
| 490 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 491 |
+
long long atomicXor_system(long long *address, long long val)
|
| 492 |
+
{
|
| 493 |
+
return __llAtomicXor_system(address, val);
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 497 |
+
unsigned int atomicXor_block(unsigned int *address, unsigned int val)
|
| 498 |
+
{
|
| 499 |
+
return __uAtomicXor_block(address, val);
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 503 |
+
unsigned int atomicXor_system(unsigned int *address, unsigned int val)
|
| 504 |
+
{
|
| 505 |
+
return __uAtomicXor_system(address, val);
|
| 506 |
+
}
|
| 507 |
+
|
| 508 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 509 |
+
unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val)
|
| 510 |
+
{
|
| 511 |
+
return __ullAtomicXor_block(address, val);
|
| 512 |
+
}
|
| 513 |
+
|
| 514 |
+
__SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 515 |
+
unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val)
|
| 516 |
+
{
|
| 517 |
+
return __ullAtomicXor_system(address, val);
|
| 518 |
+
}
|
| 519 |
+
|
| 520 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
|
| 521 |
+
|
| 522 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 523 |
+
|
| 524 |
+
#undef __SM_60_ATOMIC_FUNCTIONS_DECL__
|
| 525 |
+
|
| 526 |
+
#endif /* !__SM_60_ATOMIC_FUNCTIONS_HPP__ */
|
| 527 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_functions.h
ADDED
|
@@ -0,0 +1,439 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SURFACE_FUNCTIONS_H__)
|
| 51 |
+
#define __SURFACE_FUNCTIONS_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 54 |
+
|
| 55 |
+
/*******************************************************************************
|
| 56 |
+
* *
|
| 57 |
+
* *
|
| 58 |
+
* *
|
| 59 |
+
*******************************************************************************/
|
| 60 |
+
|
| 61 |
+
#include "cuda_runtime_api.h"
|
| 62 |
+
#include "cuda_surface_types.h"
|
| 63 |
+
|
| 64 |
+
#if defined(_WIN32)
|
| 65 |
+
# define __DEPRECATED__ __declspec(deprecated)
|
| 66 |
+
#else
|
| 67 |
+
# define __DEPRECATED__ __attribute__((deprecated))
|
| 68 |
+
#endif
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
#ifdef __CUDA_ARCH__
|
| 73 |
+
template <typename T> struct __nv_surf_trait { typedef void * cast_type; };
|
| 74 |
+
|
| 75 |
+
template<> struct __nv_surf_trait<char> { typedef char * cast_type; };
|
| 76 |
+
template<> struct __nv_surf_trait<signed char> { typedef signed char * cast_type; };
|
| 77 |
+
template<> struct __nv_surf_trait<unsigned char> { typedef unsigned char * cast_type; };
|
| 78 |
+
template<> struct __nv_surf_trait<char1> { typedef char1 * cast_type; };
|
| 79 |
+
template<> struct __nv_surf_trait<uchar1> { typedef uchar1 * cast_type; };
|
| 80 |
+
template<> struct __nv_surf_trait<char2> { typedef char2 * cast_type; };
|
| 81 |
+
template<> struct __nv_surf_trait<uchar2> { typedef uchar2 * cast_type; };
|
| 82 |
+
template<> struct __nv_surf_trait<char4> { typedef char4 * cast_type; };
|
| 83 |
+
template<> struct __nv_surf_trait<uchar4> { typedef uchar4 * cast_type; };
|
| 84 |
+
template<> struct __nv_surf_trait<short> { typedef short * cast_type; };
|
| 85 |
+
template<> struct __nv_surf_trait<unsigned short> { typedef unsigned short * cast_type; };
|
| 86 |
+
template<> struct __nv_surf_trait<short1> { typedef short1 * cast_type; };
|
| 87 |
+
template<> struct __nv_surf_trait<ushort1> { typedef ushort1 * cast_type; };
|
| 88 |
+
template<> struct __nv_surf_trait<short2> { typedef short2 * cast_type; };
|
| 89 |
+
template<> struct __nv_surf_trait<ushort2> { typedef ushort2 * cast_type; };
|
| 90 |
+
template<> struct __nv_surf_trait<short4> { typedef short4 * cast_type; };
|
| 91 |
+
template<> struct __nv_surf_trait<ushort4> { typedef ushort4 * cast_type; };
|
| 92 |
+
template<> struct __nv_surf_trait<int> { typedef int * cast_type; };
|
| 93 |
+
template<> struct __nv_surf_trait<unsigned int> { typedef unsigned int * cast_type; };
|
| 94 |
+
template<> struct __nv_surf_trait<int1> { typedef int1 * cast_type; };
|
| 95 |
+
template<> struct __nv_surf_trait<uint1> { typedef uint1 * cast_type; };
|
| 96 |
+
template<> struct __nv_surf_trait<int2> { typedef int2 * cast_type; };
|
| 97 |
+
template<> struct __nv_surf_trait<uint2> { typedef uint2 * cast_type; };
|
| 98 |
+
template<> struct __nv_surf_trait<int4> { typedef int4 * cast_type; };
|
| 99 |
+
template<> struct __nv_surf_trait<uint4> { typedef uint4 * cast_type; };
|
| 100 |
+
template<> struct __nv_surf_trait<long long> { typedef long long * cast_type; };
|
| 101 |
+
template<> struct __nv_surf_trait<unsigned long long> { typedef unsigned long long * cast_type; };
|
| 102 |
+
template<> struct __nv_surf_trait<longlong1> { typedef longlong1 * cast_type; };
|
| 103 |
+
template<> struct __nv_surf_trait<ulonglong1> { typedef ulonglong1 * cast_type; };
|
| 104 |
+
template<> struct __nv_surf_trait<longlong2> { typedef longlong2 * cast_type; };
|
| 105 |
+
template<> struct __nv_surf_trait<ulonglong2> { typedef ulonglong2 * cast_type; };
|
| 106 |
+
#if !defined(__LP64__)
|
| 107 |
+
template<> struct __nv_surf_trait<long> { typedef int * cast_type; };
|
| 108 |
+
template<> struct __nv_surf_trait<unsigned long> { typedef unsigned int * cast_type; };
|
| 109 |
+
template<> struct __nv_surf_trait<long1> { typedef int1 * cast_type; };
|
| 110 |
+
template<> struct __nv_surf_trait<ulong1> { typedef uint1 * cast_type; };
|
| 111 |
+
template<> struct __nv_surf_trait<long2> { typedef int2 * cast_type; };
|
| 112 |
+
template<> struct __nv_surf_trait<ulong2> { typedef uint2 * cast_type; };
|
| 113 |
+
template<> struct __nv_surf_trait<long4> { typedef uint4 * cast_type; };
|
| 114 |
+
template<> struct __nv_surf_trait<ulong4> { typedef int4 * cast_type; };
|
| 115 |
+
#endif
|
| 116 |
+
template<> struct __nv_surf_trait<float> { typedef float * cast_type; };
|
| 117 |
+
template<> struct __nv_surf_trait<float1> { typedef float1 * cast_type; };
|
| 118 |
+
template<> struct __nv_surf_trait<float2> { typedef float2 * cast_type; };
|
| 119 |
+
template<> struct __nv_surf_trait<float4> { typedef float4 * cast_type; };
|
| 120 |
+
#endif /* defined(__CUDA_ARCH__) */
|
| 121 |
+
|
| 122 |
+
template <typename T>
|
| 123 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf1Dread(T *res, surface<void, cudaSurfaceType1D> surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 124 |
+
{
|
| 125 |
+
#ifdef __CUDA_ARCH__
|
| 126 |
+
__nv_tex_surf_handler("__surf1Dread_v2", (void *)res, s, surf, x, mode);
|
| 127 |
+
#endif
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
template<class T>
|
| 131 |
+
static __DEPRECATED__ __device__ __forceinline__ T surf1Dread(surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 132 |
+
{
|
| 133 |
+
#ifdef __CUDA_ARCH__
|
| 134 |
+
T temp;
|
| 135 |
+
__nv_tex_surf_handler("__surf1Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, mode);
|
| 136 |
+
return temp;
|
| 137 |
+
#endif
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
template<class T>
|
| 141 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf1Dread(T *res, surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 142 |
+
{
|
| 143 |
+
#ifdef __CUDA_ARCH__
|
| 144 |
+
*res = surf1Dread<T>(surf, x, mode);
|
| 145 |
+
#endif /* __CUDA_ARCH__ */
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
template <typename T>
|
| 150 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf2Dread(T *res, surface<void, cudaSurfaceType2D> surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 151 |
+
{
|
| 152 |
+
#ifdef __CUDA_ARCH__
|
| 153 |
+
__nv_tex_surf_handler("__surf2Dread_v2", (void *)res, s, surf, x, y, mode);
|
| 154 |
+
#endif
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
template<class T>
|
| 158 |
+
static __DEPRECATED__ __device__ __forceinline__ T surf2Dread(surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 159 |
+
{
|
| 160 |
+
#ifdef __CUDA_ARCH__
|
| 161 |
+
T temp;
|
| 162 |
+
__nv_tex_surf_handler("__surf2Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, mode);
|
| 163 |
+
return temp;
|
| 164 |
+
#endif
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
template<class T>
|
| 168 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf2Dread(T *res, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 169 |
+
{
|
| 170 |
+
#ifdef __CUDA_ARCH__
|
| 171 |
+
*res = surf2Dread<T>(surf, x, y, mode);
|
| 172 |
+
#endif /* __CUDA_ARCH__ */
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
template <typename T>
|
| 177 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf3Dread(T *res, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 178 |
+
{
|
| 179 |
+
#ifdef __CUDA_ARCH__
|
| 180 |
+
__nv_tex_surf_handler("__surf3Dread_v2", (void *)res, s, surf, x, y, z, mode);
|
| 181 |
+
#endif
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
template<class T>
|
| 185 |
+
static __DEPRECATED__ __device__ __forceinline__ T surf3Dread(surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 186 |
+
{
|
| 187 |
+
#ifdef __CUDA_ARCH__
|
| 188 |
+
T temp;
|
| 189 |
+
__nv_tex_surf_handler("__surf3Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, z, mode);
|
| 190 |
+
return temp;
|
| 191 |
+
#endif
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
template<class T>
|
| 195 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf3Dread(T *res, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 196 |
+
{
|
| 197 |
+
#ifdef __CUDA_ARCH__
|
| 198 |
+
*res = surf3Dread<T>(surf, x, y, z, mode);
|
| 199 |
+
#endif /* __CUDA_ARCH__ */
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
template <typename T>
|
| 205 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredread(T *res, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 206 |
+
{
|
| 207 |
+
#ifdef __CUDA_ARCH__
|
| 208 |
+
__nv_tex_surf_handler("__surf1DLayeredread_v2", (void *)res, s, surf, x, layer, mode);
|
| 209 |
+
#endif
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
template<class T>
|
| 213 |
+
static __DEPRECATED__ __device__ __forceinline__ T surf1DLayeredread(surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 214 |
+
{
|
| 215 |
+
#ifdef __CUDA_ARCH__
|
| 216 |
+
T temp;
|
| 217 |
+
__nv_tex_surf_handler("__surf1DLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, layer, mode);
|
| 218 |
+
return temp;
|
| 219 |
+
#endif
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
template<class T>
|
| 224 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredread(T *res, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 225 |
+
{
|
| 226 |
+
#ifdef __CUDA_ARCH__
|
| 227 |
+
*res = surf1DLayeredread<T>(surf, x, layer, mode);
|
| 228 |
+
#endif /* __CUDA_ARCH__ */
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
template <typename T>
|
| 233 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredread(T *res, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 234 |
+
{
|
| 235 |
+
#ifdef __CUDA_ARCH__
|
| 236 |
+
__nv_tex_surf_handler("__surf2DLayeredread_v2", (void *)res, s, surf, x, y, layer, mode);
|
| 237 |
+
#endif
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
template<class T>
|
| 241 |
+
static __DEPRECATED__ __device__ __forceinline__ T surf2DLayeredread(surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 242 |
+
{
|
| 243 |
+
#ifdef __CUDA_ARCH__
|
| 244 |
+
T temp;
|
| 245 |
+
__nv_tex_surf_handler("__surf2DLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, layer, mode);
|
| 246 |
+
return temp;
|
| 247 |
+
#endif
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
template<class T>
|
| 252 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredread(T *res, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 253 |
+
{
|
| 254 |
+
#ifdef __CUDA_ARCH__
|
| 255 |
+
*res = surf2DLayeredread<T>(surf, x, y, layer, mode);
|
| 256 |
+
#endif /* __CUDA_ARCH__ */
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
template <typename T>
|
| 261 |
+
static __device__ __forceinline__ void surfCubemapread(T *res, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 262 |
+
{
|
| 263 |
+
#ifdef __CUDA_ARCH__
|
| 264 |
+
__nv_tex_surf_handler("__surfCubemapread_v2", (void *)res, s, surf, x, y, face, mode);
|
| 265 |
+
#endif
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
template<class T>
|
| 269 |
+
static __DEPRECATED__ __device__ __forceinline__ T surfCubemapread(surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 270 |
+
{
|
| 271 |
+
#ifdef __CUDA_ARCH__
|
| 272 |
+
T temp;
|
| 273 |
+
|
| 274 |
+
__nv_tex_surf_handler("__surfCubemapread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, face, mode);
|
| 275 |
+
return temp;
|
| 276 |
+
#endif
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
template<class T>
|
| 280 |
+
static __DEPRECATED__ __device__ __forceinline__ void surfCubemapread(T *res, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 281 |
+
{
|
| 282 |
+
#ifdef __CUDA_ARCH__
|
| 283 |
+
*res = surfCubemapread<T>(surf, x, y, face, mode);
|
| 284 |
+
#endif /* __CUDA_ARCH__ */
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
template <typename T>
|
| 289 |
+
static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredread(T *res, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 290 |
+
{
|
| 291 |
+
#ifdef __CUDA_ARCH__
|
| 292 |
+
__nv_tex_surf_handler("__surfCubemapLayeredread_v2", (void *)res, s, surf, x, y, layerFace, mode);
|
| 293 |
+
#endif
|
| 294 |
+
}
|
| 295 |
+
|
| 296 |
+
template<class T>
|
| 297 |
+
static __DEPRECATED__ __device__ __forceinline__ T surfCubemapLayeredread(surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 298 |
+
{
|
| 299 |
+
#ifdef __CUDA_ARCH__
|
| 300 |
+
T temp;
|
| 301 |
+
__nv_tex_surf_handler("__surfCubemapLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, layerFace, mode);
|
| 302 |
+
return temp;
|
| 303 |
+
#endif
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
template<class T>
|
| 307 |
+
static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredread(T *res, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 308 |
+
{
|
| 309 |
+
#ifdef __CUDA_ARCH__
|
| 310 |
+
*res = surfCubemapLayeredread<T>(surf, x, y, layerFace, mode);
|
| 311 |
+
#endif /* __CUDA_ARCH__ */
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
//surf1Dwrite
|
| 315 |
+
template<class T>
|
| 316 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface<void, cudaSurfaceType1D> surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 317 |
+
{
|
| 318 |
+
#ifdef __CUDA_ARCH__
|
| 319 |
+
__nv_tex_surf_handler("__surf1Dwrite_v2", (void *)&val, s, surf, x, mode);
|
| 320 |
+
#endif
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
template<class T>
|
| 324 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 325 |
+
{
|
| 326 |
+
#ifdef __CUDA_ARCH__
|
| 327 |
+
__nv_tex_surf_handler("__surf1Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, mode);
|
| 328 |
+
#endif /* __CUDA_ARCH__ */
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
//surf2Dwrite
|
| 333 |
+
template<class T>
|
| 334 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 335 |
+
{
|
| 336 |
+
#ifdef __CUDA_ARCH__
|
| 337 |
+
__nv_tex_surf_handler("__surf2Dwrite_v2", (void *)&val, s, surf, x, y, mode);
|
| 338 |
+
#endif
|
| 339 |
+
}
|
| 340 |
+
|
| 341 |
+
template<class T>
|
| 342 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 343 |
+
{
|
| 344 |
+
#ifdef __CUDA_ARCH__
|
| 345 |
+
__nv_tex_surf_handler("__surf2Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, mode);
|
| 346 |
+
#endif /* __CUDA_ARCH__ */
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
//surf3Dwrite
|
| 350 |
+
template<class T>
|
| 351 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 352 |
+
{
|
| 353 |
+
#ifdef __CUDA_ARCH__
|
| 354 |
+
__nv_tex_surf_handler("__surf3Dwrite_v2", (void *)&val, s, surf, x, y, z,mode);
|
| 355 |
+
#endif
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
+
template<class T>
|
| 359 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 360 |
+
{
|
| 361 |
+
#ifdef __CUDA_ARCH__
|
| 362 |
+
__nv_tex_surf_handler("__surf3Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, z, mode);
|
| 363 |
+
#endif /* __CUDA_ARCH__ */
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
//surf1DLayeredwrite
|
| 367 |
+
template<class T>
|
| 368 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 369 |
+
{
|
| 370 |
+
#ifdef __CUDA_ARCH__
|
| 371 |
+
__nv_tex_surf_handler("__surf1DLayeredwrite_v2", (void *)&val, s, surf, x, layer,mode);
|
| 372 |
+
#endif
|
| 373 |
+
}
|
| 374 |
+
|
| 375 |
+
template<class T>
|
| 376 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 377 |
+
{
|
| 378 |
+
#ifdef __CUDA_ARCH__
|
| 379 |
+
__nv_tex_surf_handler("__surf1DLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, layer, mode);
|
| 380 |
+
#endif /* __CUDA_ARCH__ */
|
| 381 |
+
}
|
| 382 |
+
|
| 383 |
+
//surf2DLayeredwrite
|
| 384 |
+
template<class T>
|
| 385 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 386 |
+
{
|
| 387 |
+
#ifdef __CUDA_ARCH__
|
| 388 |
+
__nv_tex_surf_handler("__surf2DLayeredwrite_v2", (void *)&val, s, surf, x, y, layer,mode);
|
| 389 |
+
#endif
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
template<class T>
|
| 393 |
+
static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 394 |
+
{
|
| 395 |
+
#ifdef __CUDA_ARCH__
|
| 396 |
+
__nv_tex_surf_handler("__surf2DLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, layer, mode);
|
| 397 |
+
#endif /* __CUDA_ARCH__ */
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
//surfCubemapwrite
|
| 401 |
+
template<class T>
|
| 402 |
+
static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 403 |
+
{
|
| 404 |
+
#ifdef __CUDA_ARCH__
|
| 405 |
+
__nv_tex_surf_handler("__surfCubemapwrite_v2", (void *)&val, s, surf, x, y, face, mode);
|
| 406 |
+
#endif
|
| 407 |
+
}
|
| 408 |
+
|
| 409 |
+
template<class T>
|
| 410 |
+
static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 411 |
+
{
|
| 412 |
+
#ifdef __CUDA_ARCH__
|
| 413 |
+
__nv_tex_surf_handler("__surfCubemapwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, face, mode);
|
| 414 |
+
#endif /* __CUDA_ARCH__ */
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
//surfCubemapLayeredwrite
|
| 419 |
+
template<class T>
|
| 420 |
+
static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 421 |
+
{
|
| 422 |
+
#ifdef __CUDA_ARCH__
|
| 423 |
+
__nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (void *)&val, s, surf, x, y, layerFace, mode);
|
| 424 |
+
#endif
|
| 425 |
+
}
|
| 426 |
+
|
| 427 |
+
template<class T>
|
| 428 |
+
static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 429 |
+
{
|
| 430 |
+
#ifdef __CUDA_ARCH__
|
| 431 |
+
__nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, layerFace, mode);
|
| 432 |
+
#endif /* __CUDA_ARCH__ */
|
| 433 |
+
}
|
| 434 |
+
|
| 435 |
+
#undef __DEPRECATED__
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 439 |
+
#endif /* !__SURFACE_FUNCTIONS_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_fetch_functions.h
ADDED
|
@@ -0,0 +1,739 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__TEXTURE_FETCH_FUNCTIONS_H__)
|
| 51 |
+
#define __TEXTURE_FETCH_FUNCTIONS_H__
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 55 |
+
|
| 56 |
+
/*******************************************************************************
|
| 57 |
+
* *
|
| 58 |
+
* *
|
| 59 |
+
* *
|
| 60 |
+
*******************************************************************************/
|
| 61 |
+
|
| 62 |
+
#include "cuda_runtime_api.h"
|
| 63 |
+
#include "cuda_texture_types.h"
|
| 64 |
+
|
| 65 |
+
#if defined(_WIN32)
|
| 66 |
+
# define __DEPRECATED__ __declspec(deprecated)
|
| 67 |
+
#else
|
| 68 |
+
# define __DEPRECATED__ __attribute__((deprecated))
|
| 69 |
+
#endif
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
template <typename T>
|
| 73 |
+
struct __nv_tex_rmet_ret { };
|
| 74 |
+
|
| 75 |
+
template<> struct __nv_tex_rmet_ret<char> { typedef char type; };
|
| 76 |
+
template<> struct __nv_tex_rmet_ret<signed char> { typedef signed char type; };
|
| 77 |
+
template<> struct __nv_tex_rmet_ret<unsigned char> { typedef unsigned char type; };
|
| 78 |
+
template<> struct __nv_tex_rmet_ret<char1> { typedef char1 type; };
|
| 79 |
+
template<> struct __nv_tex_rmet_ret<uchar1> { typedef uchar1 type; };
|
| 80 |
+
template<> struct __nv_tex_rmet_ret<char2> { typedef char2 type; };
|
| 81 |
+
template<> struct __nv_tex_rmet_ret<uchar2> { typedef uchar2 type; };
|
| 82 |
+
template<> struct __nv_tex_rmet_ret<char4> { typedef char4 type; };
|
| 83 |
+
template<> struct __nv_tex_rmet_ret<uchar4> { typedef uchar4 type; };
|
| 84 |
+
|
| 85 |
+
template<> struct __nv_tex_rmet_ret<short> { typedef short type; };
|
| 86 |
+
template<> struct __nv_tex_rmet_ret<unsigned short> { typedef unsigned short type; };
|
| 87 |
+
template<> struct __nv_tex_rmet_ret<short1> { typedef short1 type; };
|
| 88 |
+
template<> struct __nv_tex_rmet_ret<ushort1> { typedef ushort1 type; };
|
| 89 |
+
template<> struct __nv_tex_rmet_ret<short2> { typedef short2 type; };
|
| 90 |
+
template<> struct __nv_tex_rmet_ret<ushort2> { typedef ushort2 type; };
|
| 91 |
+
template<> struct __nv_tex_rmet_ret<short4> { typedef short4 type; };
|
| 92 |
+
template<> struct __nv_tex_rmet_ret<ushort4> { typedef ushort4 type; };
|
| 93 |
+
|
| 94 |
+
template<> struct __nv_tex_rmet_ret<int> { typedef int type; };
|
| 95 |
+
template<> struct __nv_tex_rmet_ret<unsigned int> { typedef unsigned int type; };
|
| 96 |
+
template<> struct __nv_tex_rmet_ret<int1> { typedef int1 type; };
|
| 97 |
+
template<> struct __nv_tex_rmet_ret<uint1> { typedef uint1 type; };
|
| 98 |
+
template<> struct __nv_tex_rmet_ret<int2> { typedef int2 type; };
|
| 99 |
+
template<> struct __nv_tex_rmet_ret<uint2> { typedef uint2 type; };
|
| 100 |
+
template<> struct __nv_tex_rmet_ret<int4> { typedef int4 type; };
|
| 101 |
+
template<> struct __nv_tex_rmet_ret<uint4> { typedef uint4 type; };
|
| 102 |
+
|
| 103 |
+
#if !defined(__LP64__)
|
| 104 |
+
template<> struct __nv_tex_rmet_ret<long> { typedef long type; };
|
| 105 |
+
template<> struct __nv_tex_rmet_ret<unsigned long> { typedef unsigned long type; };
|
| 106 |
+
template<> struct __nv_tex_rmet_ret<long1> { typedef long1 type; };
|
| 107 |
+
template<> struct __nv_tex_rmet_ret<ulong1> { typedef ulong1 type; };
|
| 108 |
+
template<> struct __nv_tex_rmet_ret<long2> { typedef long2 type; };
|
| 109 |
+
template<> struct __nv_tex_rmet_ret<ulong2> { typedef ulong2 type; };
|
| 110 |
+
template<> struct __nv_tex_rmet_ret<long4> { typedef long4 type; };
|
| 111 |
+
template<> struct __nv_tex_rmet_ret<ulong4> { typedef ulong4 type; };
|
| 112 |
+
#endif /* !__LP64__ */
|
| 113 |
+
template<> struct __nv_tex_rmet_ret<float> { typedef float type; };
|
| 114 |
+
template<> struct __nv_tex_rmet_ret<float1> { typedef float1 type; };
|
| 115 |
+
template<> struct __nv_tex_rmet_ret<float2> { typedef float2 type; };
|
| 116 |
+
template<> struct __nv_tex_rmet_ret<float4> { typedef float4 type; };
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
template <typename T> struct __nv_tex_rmet_cast { typedef T* type; };
|
| 120 |
+
#if !defined(__LP64__)
|
| 121 |
+
template<> struct __nv_tex_rmet_cast<long> { typedef int *type; };
|
| 122 |
+
template<> struct __nv_tex_rmet_cast<unsigned long> { typedef unsigned int *type; };
|
| 123 |
+
template<> struct __nv_tex_rmet_cast<long1> { typedef int1 *type; };
|
| 124 |
+
template<> struct __nv_tex_rmet_cast<ulong1> { typedef uint1 *type; };
|
| 125 |
+
template<> struct __nv_tex_rmet_cast<long2> { typedef int2 *type; };
|
| 126 |
+
template<> struct __nv_tex_rmet_cast<ulong2> { typedef uint2 *type; };
|
| 127 |
+
template<> struct __nv_tex_rmet_cast<long4> { typedef int4 *type; };
|
| 128 |
+
template<> struct __nv_tex_rmet_cast<ulong4> { typedef uint4 *type; };
|
| 129 |
+
#endif /* !__LP64__ */
|
| 130 |
+
|
| 131 |
+
template <typename T>
|
| 132 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1Dfetch(texture<T, cudaTextureType1D, cudaReadModeElementType> t, int x)
|
| 133 |
+
{
|
| 134 |
+
#ifdef __CUDA_ARCH__
|
| 135 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 136 |
+
__nv_tex_surf_handler("__tex1Dfetch_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x);
|
| 137 |
+
return temp;
|
| 138 |
+
#endif
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
template <typename T>
|
| 142 |
+
struct __nv_tex_rmnf_ret { };
|
| 143 |
+
|
| 144 |
+
template <> struct __nv_tex_rmnf_ret<char> { typedef float type; };
|
| 145 |
+
template <> struct __nv_tex_rmnf_ret<signed char> { typedef float type; };
|
| 146 |
+
template <> struct __nv_tex_rmnf_ret<unsigned char> { typedef float type; };
|
| 147 |
+
template <> struct __nv_tex_rmnf_ret<short> { typedef float type; };
|
| 148 |
+
template <> struct __nv_tex_rmnf_ret<unsigned short> { typedef float type; };
|
| 149 |
+
template <> struct __nv_tex_rmnf_ret<char1> { typedef float1 type; };
|
| 150 |
+
template <> struct __nv_tex_rmnf_ret<uchar1> { typedef float1 type; };
|
| 151 |
+
template <> struct __nv_tex_rmnf_ret<short1> { typedef float1 type; };
|
| 152 |
+
template <> struct __nv_tex_rmnf_ret<ushort1> { typedef float1 type; };
|
| 153 |
+
template <> struct __nv_tex_rmnf_ret<char2> { typedef float2 type; };
|
| 154 |
+
template <> struct __nv_tex_rmnf_ret<uchar2> { typedef float2 type; };
|
| 155 |
+
template <> struct __nv_tex_rmnf_ret<short2> { typedef float2 type; };
|
| 156 |
+
template <> struct __nv_tex_rmnf_ret<ushort2> { typedef float2 type; };
|
| 157 |
+
template <> struct __nv_tex_rmnf_ret<char4> { typedef float4 type; };
|
| 158 |
+
template <> struct __nv_tex_rmnf_ret<uchar4> { typedef float4 type; };
|
| 159 |
+
template <> struct __nv_tex_rmnf_ret<short4> { typedef float4 type; };
|
| 160 |
+
template <> struct __nv_tex_rmnf_ret<ushort4> { typedef float4 type; };
|
| 161 |
+
|
| 162 |
+
template <typename T>
|
| 163 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1Dfetch(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, int x)
|
| 164 |
+
{
|
| 165 |
+
#ifdef __CUDA_ARCH__
|
| 166 |
+
T type_dummy;
|
| 167 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 168 |
+
__nv_tex_surf_handler("__tex1Dfetch_rmnf_v2", &type_dummy, &retval, t, x);
|
| 169 |
+
return retval;
|
| 170 |
+
#endif /* __CUDA_ARCH__ */
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
// tex1D
|
| 174 |
+
template <typename T>
|
| 175 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1D(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x)
|
| 176 |
+
{
|
| 177 |
+
#ifdef __CUDA_ARCH__
|
| 178 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 179 |
+
__nv_tex_surf_handler("__tex1D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x);
|
| 180 |
+
return temp;
|
| 181 |
+
#endif
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
template <typename T>
|
| 185 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1D(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x)
|
| 186 |
+
{
|
| 187 |
+
#ifdef __CUDA_ARCH__
|
| 188 |
+
T type_dummy;
|
| 189 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 190 |
+
__nv_tex_surf_handler("__tex1D_rmnf_v2", &type_dummy, &retval, t, x);
|
| 191 |
+
return retval;
|
| 192 |
+
#endif /* __CUDA_ARCH__ */
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
//tex2D
|
| 197 |
+
template <typename T>
|
| 198 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2D(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y)
|
| 199 |
+
{
|
| 200 |
+
#ifdef __CUDA_ARCH__
|
| 201 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 202 |
+
|
| 203 |
+
__nv_tex_surf_handler("__tex2D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y);
|
| 204 |
+
return temp;
|
| 205 |
+
#endif
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
template <typename T>
|
| 209 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2D(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y)
|
| 210 |
+
{
|
| 211 |
+
#ifdef __CUDA_ARCH__
|
| 212 |
+
T type_dummy;
|
| 213 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 214 |
+
__nv_tex_surf_handler("__tex2D_rmnf_v2", &type_dummy, &retval, t, x, y);
|
| 215 |
+
return retval;
|
| 216 |
+
#endif /* __CUDA_ARCH__ */
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
//tex1DLayered
|
| 221 |
+
template <typename T>
|
| 222 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayered(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer)
|
| 223 |
+
{
|
| 224 |
+
#ifdef __CUDA_ARCH__
|
| 225 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 226 |
+
__nv_tex_surf_handler("__tex1DLayered_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, layer);
|
| 227 |
+
return temp;
|
| 228 |
+
#endif
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
template <typename T>
|
| 232 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayered(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer)
|
| 233 |
+
{
|
| 234 |
+
#ifdef __CUDA_ARCH__
|
| 235 |
+
T type_dummy;
|
| 236 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 237 |
+
__nv_tex_surf_handler("__tex1DLayered_rmnf_v2", &type_dummy, &retval, t, x, layer);
|
| 238 |
+
return retval;
|
| 239 |
+
#endif /* __CUDA_ARCH__ */
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
//tex2DLayered
|
| 244 |
+
template <typename T>
|
| 245 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayered(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer)
|
| 246 |
+
{
|
| 247 |
+
#ifdef __CUDA_ARCH__
|
| 248 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 249 |
+
__nv_tex_surf_handler("__tex2DLayered_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, layer);
|
| 250 |
+
return temp;
|
| 251 |
+
#endif
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
template <typename T>
|
| 255 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayered(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer)
|
| 256 |
+
{
|
| 257 |
+
#ifdef __CUDA_ARCH__
|
| 258 |
+
T type_dummy;
|
| 259 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 260 |
+
__nv_tex_surf_handler("__tex2DLayered_rmnf_v2", &type_dummy, &retval, t, x, y, layer);
|
| 261 |
+
return retval;
|
| 262 |
+
#endif /* __CUDA_ARCH__ */
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
+
// tex3D
|
| 266 |
+
template <typename T>
|
| 267 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3D(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z)
|
| 268 |
+
{
|
| 269 |
+
#ifdef __CUDA_ARCH__
|
| 270 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 271 |
+
__nv_tex_surf_handler("__tex3D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, z);
|
| 272 |
+
return temp;
|
| 273 |
+
#endif
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
template <typename T>
|
| 277 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3D(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z)
|
| 278 |
+
{
|
| 279 |
+
#ifdef __CUDA_ARCH__
|
| 280 |
+
T type_dummy;
|
| 281 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 282 |
+
__nv_tex_surf_handler("__tex3D_rmnf_v2", &type_dummy, &retval, t, x, y, z);
|
| 283 |
+
return retval;
|
| 284 |
+
#endif /* __CUDA_ARCH__ */
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
// texCubemap
|
| 288 |
+
template <typename T>
|
| 289 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemap(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z)
|
| 290 |
+
{
|
| 291 |
+
#ifdef __CUDA_ARCH__
|
| 292 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 293 |
+
__nv_tex_surf_handler("__texCubemap_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, z);
|
| 294 |
+
return temp;
|
| 295 |
+
#endif
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
template <typename T>
|
| 299 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemap(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z)
|
| 300 |
+
{
|
| 301 |
+
#ifdef __CUDA_ARCH__
|
| 302 |
+
T type_dummy;
|
| 303 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 304 |
+
__nv_tex_surf_handler("__texCubemap_rmnf_v2", &type_dummy, &retval, t, x, y, z);
|
| 305 |
+
return retval;
|
| 306 |
+
#endif /* __CUDA_ARCH__ */
|
| 307 |
+
}
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
template <typename T>
|
| 311 |
+
struct __nv_tex2dgather_ret { };
|
| 312 |
+
template <> struct __nv_tex2dgather_ret<char> { typedef char4 type; };
|
| 313 |
+
template <> struct __nv_tex2dgather_ret<signed char> { typedef char4 type; };
|
| 314 |
+
template <> struct __nv_tex2dgather_ret<char1> { typedef char4 type; };
|
| 315 |
+
template <> struct __nv_tex2dgather_ret<char2> { typedef char4 type; };
|
| 316 |
+
template <> struct __nv_tex2dgather_ret<char3> { typedef char4 type; };
|
| 317 |
+
template <> struct __nv_tex2dgather_ret<char4> { typedef char4 type; };
|
| 318 |
+
template <> struct __nv_tex2dgather_ret<unsigned char> { typedef uchar4 type; };
|
| 319 |
+
template <> struct __nv_tex2dgather_ret<uchar1> { typedef uchar4 type; };
|
| 320 |
+
template <> struct __nv_tex2dgather_ret<uchar2> { typedef uchar4 type; };
|
| 321 |
+
template <> struct __nv_tex2dgather_ret<uchar3> { typedef uchar4 type; };
|
| 322 |
+
template <> struct __nv_tex2dgather_ret<uchar4> { typedef uchar4 type; };
|
| 323 |
+
|
| 324 |
+
template <> struct __nv_tex2dgather_ret<short> { typedef short4 type; };
|
| 325 |
+
template <> struct __nv_tex2dgather_ret<short1> { typedef short4 type; };
|
| 326 |
+
template <> struct __nv_tex2dgather_ret<short2> { typedef short4 type; };
|
| 327 |
+
template <> struct __nv_tex2dgather_ret<short3> { typedef short4 type; };
|
| 328 |
+
template <> struct __nv_tex2dgather_ret<short4> { typedef short4 type; };
|
| 329 |
+
template <> struct __nv_tex2dgather_ret<unsigned short> { typedef ushort4 type; };
|
| 330 |
+
template <> struct __nv_tex2dgather_ret<ushort1> { typedef ushort4 type; };
|
| 331 |
+
template <> struct __nv_tex2dgather_ret<ushort2> { typedef ushort4 type; };
|
| 332 |
+
template <> struct __nv_tex2dgather_ret<ushort3> { typedef ushort4 type; };
|
| 333 |
+
template <> struct __nv_tex2dgather_ret<ushort4> { typedef ushort4 type; };
|
| 334 |
+
|
| 335 |
+
template <> struct __nv_tex2dgather_ret<int> { typedef int4 type; };
|
| 336 |
+
template <> struct __nv_tex2dgather_ret<int1> { typedef int4 type; };
|
| 337 |
+
template <> struct __nv_tex2dgather_ret<int2> { typedef int4 type; };
|
| 338 |
+
template <> struct __nv_tex2dgather_ret<int3> { typedef int4 type; };
|
| 339 |
+
template <> struct __nv_tex2dgather_ret<int4> { typedef int4 type; };
|
| 340 |
+
template <> struct __nv_tex2dgather_ret<unsigned int> { typedef uint4 type; };
|
| 341 |
+
template <> struct __nv_tex2dgather_ret<uint1> { typedef uint4 type; };
|
| 342 |
+
template <> struct __nv_tex2dgather_ret<uint2> { typedef uint4 type; };
|
| 343 |
+
template <> struct __nv_tex2dgather_ret<uint3> { typedef uint4 type; };
|
| 344 |
+
template <> struct __nv_tex2dgather_ret<uint4> { typedef uint4 type; };
|
| 345 |
+
|
| 346 |
+
template <> struct __nv_tex2dgather_ret<float> { typedef float4 type; };
|
| 347 |
+
template <> struct __nv_tex2dgather_ret<float1> { typedef float4 type; };
|
| 348 |
+
template <> struct __nv_tex2dgather_ret<float2> { typedef float4 type; };
|
| 349 |
+
template <> struct __nv_tex2dgather_ret<float3> { typedef float4 type; };
|
| 350 |
+
template <> struct __nv_tex2dgather_ret<float4> { typedef float4 type; };
|
| 351 |
+
|
| 352 |
+
template <typename T>
|
| 353 |
+
static __device__ __forceinline__ typename __nv_tex2dgather_ret<T>::type tex2Dgather(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, int comp=0)
|
| 354 |
+
{
|
| 355 |
+
#ifdef __CUDA_ARCH__
|
| 356 |
+
T type_dummy;
|
| 357 |
+
typename __nv_tex2dgather_ret<T>::type retval;
|
| 358 |
+
__nv_tex_surf_handler("__tex2Dgather_v2", &type_dummy, &retval, t, x, y, comp);
|
| 359 |
+
return retval;
|
| 360 |
+
#endif /* __CUDA_ARCH__ */
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
template<typename T> struct __nv_tex2dgather_rmnf_ret { };
|
| 365 |
+
template<> struct __nv_tex2dgather_rmnf_ret<char> { typedef float4 type; };
|
| 366 |
+
template<> struct __nv_tex2dgather_rmnf_ret<signed char> { typedef float4 type; };
|
| 367 |
+
template<> struct __nv_tex2dgather_rmnf_ret<unsigned char> { typedef float4 type; };
|
| 368 |
+
template<> struct __nv_tex2dgather_rmnf_ret<char1> { typedef float4 type; };
|
| 369 |
+
template<> struct __nv_tex2dgather_rmnf_ret<uchar1> { typedef float4 type; };
|
| 370 |
+
template<> struct __nv_tex2dgather_rmnf_ret<char2> { typedef float4 type; };
|
| 371 |
+
template<> struct __nv_tex2dgather_rmnf_ret<uchar2> { typedef float4 type; };
|
| 372 |
+
template<> struct __nv_tex2dgather_rmnf_ret<char3> { typedef float4 type; };
|
| 373 |
+
template<> struct __nv_tex2dgather_rmnf_ret<uchar3> { typedef float4 type; };
|
| 374 |
+
template<> struct __nv_tex2dgather_rmnf_ret<char4> { typedef float4 type; };
|
| 375 |
+
template<> struct __nv_tex2dgather_rmnf_ret<uchar4> { typedef float4 type; };
|
| 376 |
+
template<> struct __nv_tex2dgather_rmnf_ret<signed short> { typedef float4 type; };
|
| 377 |
+
template<> struct __nv_tex2dgather_rmnf_ret<unsigned short> { typedef float4 type; };
|
| 378 |
+
template<> struct __nv_tex2dgather_rmnf_ret<short1> { typedef float4 type; };
|
| 379 |
+
template<> struct __nv_tex2dgather_rmnf_ret<ushort1> { typedef float4 type; };
|
| 380 |
+
template<> struct __nv_tex2dgather_rmnf_ret<short2> { typedef float4 type; };
|
| 381 |
+
template<> struct __nv_tex2dgather_rmnf_ret<ushort2> { typedef float4 type; };
|
| 382 |
+
template<> struct __nv_tex2dgather_rmnf_ret<short3> { typedef float4 type; };
|
| 383 |
+
template<> struct __nv_tex2dgather_rmnf_ret<ushort3> { typedef float4 type; };
|
| 384 |
+
template<> struct __nv_tex2dgather_rmnf_ret<short4> { typedef float4 type; };
|
| 385 |
+
template<> struct __nv_tex2dgather_rmnf_ret<ushort4> { typedef float4 type; };
|
| 386 |
+
|
| 387 |
+
template <typename T>
|
| 388 |
+
static __device__ __forceinline__ typename __nv_tex2dgather_rmnf_ret<T>::type tex2Dgather(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, int comp = 0)
|
| 389 |
+
{
|
| 390 |
+
#ifdef __CUDA_ARCH__
|
| 391 |
+
T type_dummy;
|
| 392 |
+
typename __nv_tex2dgather_rmnf_ret<T>::type retval;
|
| 393 |
+
__nv_tex_surf_handler("__tex2Dgather_rmnf_v2", &type_dummy, &retval, t, x, y, comp);
|
| 394 |
+
return retval;
|
| 395 |
+
#endif /* __CUDA_ARCH__ */
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
// tex1DLod
|
| 400 |
+
template <typename T>
|
| 401 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLod(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x, float level)
|
| 402 |
+
{
|
| 403 |
+
#ifdef __CUDA_ARCH__
|
| 404 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 405 |
+
__nv_tex_surf_handler("__tex1DLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, level);
|
| 406 |
+
return temp;
|
| 407 |
+
#endif
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
template <typename T>
|
| 411 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLod(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x, float level)
|
| 412 |
+
{
|
| 413 |
+
#ifdef __CUDA_ARCH__
|
| 414 |
+
T type_dummy;
|
| 415 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 416 |
+
__nv_tex_surf_handler("__tex1DLod_rmnf_v2", &type_dummy, &retval, t, x, level);
|
| 417 |
+
return retval;
|
| 418 |
+
#endif /* __CUDA_ARCH__ */
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
// tex2DLod
|
| 422 |
+
template <typename T>
|
| 423 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLod(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, float level)
|
| 424 |
+
{
|
| 425 |
+
#ifdef __CUDA_ARCH__
|
| 426 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 427 |
+
__nv_tex_surf_handler("__tex2DLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, level);
|
| 428 |
+
return temp;
|
| 429 |
+
#endif
|
| 430 |
+
}
|
| 431 |
+
|
| 432 |
+
template <typename T>
|
| 433 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLod(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, float level)
|
| 434 |
+
{
|
| 435 |
+
#ifdef __CUDA_ARCH__
|
| 436 |
+
T type_dummy;
|
| 437 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 438 |
+
__nv_tex_surf_handler("__tex2DLod_rmnf_v2", &type_dummy, &retval, t, x, y, level);
|
| 439 |
+
return retval;
|
| 440 |
+
#endif /* __CUDA_ARCH__ */
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
// tex1DLayeredLod
|
| 444 |
+
template <typename T>
|
| 445 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayeredLod(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer, float level)
|
| 446 |
+
{
|
| 447 |
+
#ifdef __CUDA_ARCH__
|
| 448 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 449 |
+
__nv_tex_surf_handler("__tex1DLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, layer, level);
|
| 450 |
+
return temp;
|
| 451 |
+
#endif
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
template <typename T>
|
| 455 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayeredLod(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer, float level)
|
| 456 |
+
{
|
| 457 |
+
#ifdef __CUDA_ARCH__
|
| 458 |
+
T type_dummy;
|
| 459 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 460 |
+
__nv_tex_surf_handler("__tex1DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, layer, level);
|
| 461 |
+
return retval;
|
| 462 |
+
#endif /* __CUDA_ARCH__ */
|
| 463 |
+
}
|
| 464 |
+
|
| 465 |
+
// tex2DLayeredLod
|
| 466 |
+
template <typename T>
|
| 467 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayeredLod(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer, float level)
|
| 468 |
+
{
|
| 469 |
+
#ifdef __CUDA_ARCH__
|
| 470 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 471 |
+
__nv_tex_surf_handler("__tex2DLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, layer, level);
|
| 472 |
+
return temp;
|
| 473 |
+
#endif
|
| 474 |
+
}
|
| 475 |
+
|
| 476 |
+
template <typename T>
|
| 477 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayeredLod(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer, float level)
|
| 478 |
+
{
|
| 479 |
+
#ifdef __CUDA_ARCH__
|
| 480 |
+
T type_dummy;
|
| 481 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 482 |
+
__nv_tex_surf_handler("__tex2DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, layer, level);
|
| 483 |
+
return retval;
|
| 484 |
+
#endif /* __CUDA_ARCH__ */
|
| 485 |
+
}
|
| 486 |
+
|
| 487 |
+
// tex3DLod
|
| 488 |
+
template <typename T>
|
| 489 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3DLod(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z, float level)
|
| 490 |
+
{
|
| 491 |
+
#ifdef __CUDA_ARCH__
|
| 492 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 493 |
+
__nv_tex_surf_handler("__tex3DLod_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, level);
|
| 494 |
+
return temp;
|
| 495 |
+
#endif
|
| 496 |
+
}
|
| 497 |
+
|
| 498 |
+
template <typename T>
|
| 499 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3DLod(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z, float level)
|
| 500 |
+
{
|
| 501 |
+
#ifdef __CUDA_ARCH__
|
| 502 |
+
T type_dummy;
|
| 503 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 504 |
+
__nv_tex_surf_handler("__tex3DLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level);
|
| 505 |
+
return retval;
|
| 506 |
+
#endif /* __CUDA_ARCH__ */
|
| 507 |
+
}
|
| 508 |
+
|
| 509 |
+
// texCubemapLod
|
| 510 |
+
template <typename T>
|
| 511 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLod(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z, float level)
|
| 512 |
+
{
|
| 513 |
+
#ifdef __CUDA_ARCH__
|
| 514 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 515 |
+
__nv_tex_surf_handler("__texCubemapLod_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, level);
|
| 516 |
+
return temp;
|
| 517 |
+
#endif
|
| 518 |
+
}
|
| 519 |
+
|
| 520 |
+
template <typename T>
|
| 521 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLod(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z, float level)
|
| 522 |
+
{
|
| 523 |
+
#ifdef __CUDA_ARCH__
|
| 524 |
+
T type_dummy;
|
| 525 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 526 |
+
__nv_tex_surf_handler("__texCubemapLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level);
|
| 527 |
+
return retval;
|
| 528 |
+
#endif /* __CUDA_ARCH__ */
|
| 529 |
+
}
|
| 530 |
+
|
| 531 |
+
|
| 532 |
+
// texCubemapLayered
|
| 533 |
+
template <typename T>
|
| 534 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayered(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer)
|
| 535 |
+
{
|
| 536 |
+
#ifdef __CUDA_ARCH__
|
| 537 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 538 |
+
__nv_tex_surf_handler("__texCubemapLayered_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer);
|
| 539 |
+
return temp;
|
| 540 |
+
#endif
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
template <typename T>
|
| 544 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayered(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer)
|
| 545 |
+
{
|
| 546 |
+
#ifdef __CUDA_ARCH__
|
| 547 |
+
T type_dummy;
|
| 548 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 549 |
+
__nv_tex_surf_handler("__texCubemapLayered_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer);
|
| 550 |
+
return retval;
|
| 551 |
+
#endif /* __CUDA_ARCH__ */
|
| 552 |
+
}
|
| 553 |
+
|
| 554 |
+
|
| 555 |
+
// texCubemapLayeredLod
|
| 556 |
+
template <typename T>
|
| 557 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayeredLod(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer, float level)
|
| 558 |
+
{
|
| 559 |
+
#ifdef __CUDA_ARCH__
|
| 560 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 561 |
+
__nv_tex_surf_handler("__texCubemapLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer, level);
|
| 562 |
+
return temp;
|
| 563 |
+
#endif
|
| 564 |
+
}
|
| 565 |
+
|
| 566 |
+
template <typename T>
|
| 567 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayeredLod(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer, float level)
|
| 568 |
+
{
|
| 569 |
+
#ifdef __CUDA_ARCH__
|
| 570 |
+
T type_dummy;
|
| 571 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 572 |
+
__nv_tex_surf_handler("__texCubemapLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer, level);
|
| 573 |
+
return retval;
|
| 574 |
+
#endif /* __CUDA_ARCH__ */
|
| 575 |
+
}
|
| 576 |
+
|
| 577 |
+
|
| 578 |
+
// texCubemapGrad
|
| 579 |
+
template <typename T>
|
| 580 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapGrad(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z, float4 dPdx, float4 dPdy)
|
| 581 |
+
{
|
| 582 |
+
#ifdef __CUDA_ARCH__
|
| 583 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 584 |
+
__nv_tex_surf_handler("__texCubemapGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, &dPdx, &dPdy);
|
| 585 |
+
return temp;
|
| 586 |
+
#endif
|
| 587 |
+
}
|
| 588 |
+
|
| 589 |
+
template <typename T>
|
| 590 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapGrad(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z, float4 dPdx, float4 dPdy)
|
| 591 |
+
{
|
| 592 |
+
#ifdef __CUDA_ARCH__
|
| 593 |
+
T type_dummy;
|
| 594 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 595 |
+
__nv_tex_surf_handler("__texCubemapGrad_rmnf_v2", &type_dummy, &retval, t, x, y, z, &dPdx, &dPdy);
|
| 596 |
+
return retval;
|
| 597 |
+
#endif /* __CUDA_ARCH__ */
|
| 598 |
+
}
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
// texCubemapLayeredGrad
|
| 602 |
+
template <typename T>
|
| 603 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayeredGrad(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
|
| 604 |
+
{
|
| 605 |
+
#ifdef __CUDA_ARCH__
|
| 606 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 607 |
+
__nv_tex_surf_handler("__texCubemapLayeredGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer, &dPdx, &dPdy);
|
| 608 |
+
return temp;
|
| 609 |
+
#endif
|
| 610 |
+
}
|
| 611 |
+
|
| 612 |
+
template <typename T>
|
| 613 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayeredGrad(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
|
| 614 |
+
{
|
| 615 |
+
#ifdef __CUDA_ARCH__
|
| 616 |
+
T type_dummy;
|
| 617 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 618 |
+
__nv_tex_surf_handler("__texCubemapLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, layer, &dPdx, &dPdy);
|
| 619 |
+
return retval;
|
| 620 |
+
#endif /* __CUDA_ARCH__ */
|
| 621 |
+
}
|
| 622 |
+
|
| 623 |
+
|
| 624 |
+
// tex1DGrad
|
| 625 |
+
template <typename T>
|
| 626 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DGrad(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x, float dPdx, float dPdy)
|
| 627 |
+
{
|
| 628 |
+
#ifdef __CUDA_ARCH__
|
| 629 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 630 |
+
__nv_tex_surf_handler("__tex1DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, dPdx, dPdy);
|
| 631 |
+
return temp;
|
| 632 |
+
#endif
|
| 633 |
+
}
|
| 634 |
+
|
| 635 |
+
template <typename T>
|
| 636 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DGrad(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x, float dPdx, float dPdy)
|
| 637 |
+
{
|
| 638 |
+
#ifdef __CUDA_ARCH__
|
| 639 |
+
T type_dummy;
|
| 640 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 641 |
+
__nv_tex_surf_handler("__tex1DGrad_rmnf_v2", &type_dummy, &retval,t, x,dPdx, dPdy);
|
| 642 |
+
return retval;
|
| 643 |
+
#endif /* __CUDA_ARCH__ */
|
| 644 |
+
}
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
// tex2DGrad
|
| 648 |
+
template <typename T>
|
| 649 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DGrad(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, float2 dPdx, float2 dPdy)
|
| 650 |
+
{
|
| 651 |
+
#ifdef __CUDA_ARCH__
|
| 652 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 653 |
+
__nv_tex_surf_handler("__tex2DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, &dPdx, &dPdy);
|
| 654 |
+
return temp;
|
| 655 |
+
#endif
|
| 656 |
+
}
|
| 657 |
+
|
| 658 |
+
template <typename T>
|
| 659 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DGrad(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, float2 dPdx, float2 dPdy)
|
| 660 |
+
{
|
| 661 |
+
#ifdef __CUDA_ARCH__
|
| 662 |
+
T type_dummy;
|
| 663 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 664 |
+
__nv_tex_surf_handler("__tex2DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, &dPdx, &dPdy);
|
| 665 |
+
return retval;
|
| 666 |
+
#endif /* __CUDA_ARCH__ */
|
| 667 |
+
}
|
| 668 |
+
|
| 669 |
+
// tex1DLayeredGrad
|
| 670 |
+
template <typename T>
|
| 671 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayeredGrad(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer, float dPdx, float dPdy)
|
| 672 |
+
{
|
| 673 |
+
#ifdef __CUDA_ARCH__
|
| 674 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 675 |
+
__nv_tex_surf_handler("__tex1DLayeredGrad_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, layer, dPdx, dPdy);
|
| 676 |
+
return temp;
|
| 677 |
+
#endif
|
| 678 |
+
}
|
| 679 |
+
|
| 680 |
+
template <typename T>
|
| 681 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayeredGrad(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer, float dPdx, float dPdy)
|
| 682 |
+
{
|
| 683 |
+
#ifdef __CUDA_ARCH__
|
| 684 |
+
T type_dummy;
|
| 685 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 686 |
+
__nv_tex_surf_handler("__tex1DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, layer, dPdx, dPdy);
|
| 687 |
+
return retval;
|
| 688 |
+
#endif /* __CUDA_ARCH__ */
|
| 689 |
+
}
|
| 690 |
+
|
| 691 |
+
// tex2DLayeredGrad
|
| 692 |
+
template <typename T>
|
| 693 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayeredGrad(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
|
| 694 |
+
{
|
| 695 |
+
#ifdef __CUDA_ARCH__
|
| 696 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 697 |
+
__nv_tex_surf_handler("__tex2DLayeredGrad_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, layer, &dPdx, &dPdy);
|
| 698 |
+
return temp;
|
| 699 |
+
#endif
|
| 700 |
+
}
|
| 701 |
+
|
| 702 |
+
template <typename T>
|
| 703 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayeredGrad(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
|
| 704 |
+
{
|
| 705 |
+
#ifdef __CUDA_ARCH__
|
| 706 |
+
T type_dummy;
|
| 707 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 708 |
+
__nv_tex_surf_handler("__tex2DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, layer, &dPdx, &dPdy);
|
| 709 |
+
return retval;
|
| 710 |
+
#endif /* __CUDA_ARCH__ */
|
| 711 |
+
}
|
| 712 |
+
|
| 713 |
+
// tex3DGrad
|
| 714 |
+
template <typename T>
|
| 715 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3DGrad(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z, float4 dPdx, float4 dPdy)
|
| 716 |
+
{
|
| 717 |
+
#ifdef __CUDA_ARCH__
|
| 718 |
+
typename __nv_tex_rmet_ret<T>::type temp;
|
| 719 |
+
__nv_tex_surf_handler("__tex3DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, &dPdx, &dPdy);
|
| 720 |
+
return temp;
|
| 721 |
+
#endif
|
| 722 |
+
}
|
| 723 |
+
|
| 724 |
+
template <typename T>
|
| 725 |
+
static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3DGrad(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z, float4 dPdx, float4 dPdy)
|
| 726 |
+
{
|
| 727 |
+
#ifdef __CUDA_ARCH__
|
| 728 |
+
T type_dummy;
|
| 729 |
+
typename __nv_tex_rmnf_ret<T>::type retval;
|
| 730 |
+
__nv_tex_surf_handler("__tex3DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, &dPdx, &dPdy);
|
| 731 |
+
return retval;
|
| 732 |
+
#endif /* __CUDA_ARCH__ */
|
| 733 |
+
}
|
| 734 |
+
|
| 735 |
+
#undef __DEPRECATED__
|
| 736 |
+
|
| 737 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 738 |
+
|
| 739 |
+
#endif /* !__TEXTURE_FETCH_FUNCTIONS_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (224 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/* cudnn : Neural Networks Library
|
| 51 |
+
|
| 52 |
+
*/
|
| 53 |
+
|
| 54 |
+
#if !defined(CUDNN_H_)
|
| 55 |
+
#define CUDNN_H_
|
| 56 |
+
|
| 57 |
+
#include <cuda_runtime.h>
|
| 58 |
+
#include <stdint.h>
|
| 59 |
+
|
| 60 |
+
#include "cudnn_version.h"
|
| 61 |
+
#include "cudnn_ops_infer.h"
|
| 62 |
+
#include "cudnn_ops_train.h"
|
| 63 |
+
#include "cudnn_adv_infer.h"
|
| 64 |
+
#include "cudnn_adv_train.h"
|
| 65 |
+
#include "cudnn_cnn_infer.h"
|
| 66 |
+
#include "cudnn_cnn_train.h"
|
| 67 |
+
|
| 68 |
+
#include "cudnn_backend.h"
|
| 69 |
+
|
| 70 |
+
#if defined(__cplusplus)
|
| 71 |
+
extern "C" {
|
| 72 |
+
#endif
|
| 73 |
+
|
| 74 |
+
#if defined(__cplusplus)
|
| 75 |
+
}
|
| 76 |
+
#endif
|
| 77 |
+
|
| 78 |
+
#endif /* CUDNN_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer.h
ADDED
|
@@ -0,0 +1,658 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/* cudnn_adv_infer : cuDNN's advanced and experimental features.
|
| 51 |
+
|
| 52 |
+
*/
|
| 53 |
+
|
| 54 |
+
#if !defined(CUDNN_ADV_INFER_H_)
|
| 55 |
+
#define CUDNN_ADV_INFER_H_
|
| 56 |
+
|
| 57 |
+
#include <cuda_runtime.h>
|
| 58 |
+
#include <stdint.h>
|
| 59 |
+
|
| 60 |
+
#include "cudnn_version.h"
|
| 61 |
+
#include "cudnn_ops_infer.h"
|
| 62 |
+
|
| 63 |
+
/* These version numbers are autogenerated, do not edit manually. */
|
| 64 |
+
#define CUDNN_ADV_INFER_MAJOR 8
|
| 65 |
+
#define CUDNN_ADV_INFER_MINOR 7
|
| 66 |
+
#define CUDNN_ADV_INFER_PATCH 0
|
| 67 |
+
|
| 68 |
+
#if (CUDNN_ADV_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_INFER_MINOR != CUDNN_MINOR) || \
|
| 69 |
+
(CUDNN_ADV_INFER_PATCH != CUDNN_PATCHLEVEL)
|
| 70 |
+
#error Version mismatch in cuDNN ADV INFER!!!
|
| 71 |
+
#endif
|
| 72 |
+
|
| 73 |
+
#if defined(__cplusplus)
|
| 74 |
+
extern "C" {
|
| 75 |
+
#endif
|
| 76 |
+
|
| 77 |
+
/* BASIC RNN API */
|
| 78 |
+
|
| 79 |
+
typedef enum {
|
| 80 |
+
CUDNN_FWD_MODE_INFERENCE = 0,
|
| 81 |
+
CUDNN_FWD_MODE_TRAINING = 1,
|
| 82 |
+
} cudnnForwardMode_t;
|
| 83 |
+
|
| 84 |
+
typedef enum {
|
| 85 |
+
CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */
|
| 86 |
+
CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */
|
| 87 |
+
CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */
|
| 88 |
+
CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
|
| 89 |
+
} cudnnRNNMode_t;
|
| 90 |
+
|
| 91 |
+
typedef enum {
|
| 92 |
+
CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */
|
| 93 |
+
CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */
|
| 94 |
+
CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */
|
| 95 |
+
CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */
|
| 96 |
+
} cudnnRNNBiasMode_t;
|
| 97 |
+
|
| 98 |
+
typedef enum {
|
| 99 |
+
CUDNN_UNIDIRECTIONAL = 0, /* single direction network */
|
| 100 |
+
CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */
|
| 101 |
+
} cudnnDirectionMode_t;
|
| 102 |
+
|
| 103 |
+
typedef enum {
|
| 104 |
+
CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */
|
| 105 |
+
CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */
|
| 106 |
+
} cudnnRNNInputMode_t;
|
| 107 |
+
|
| 108 |
+
typedef enum {
|
| 109 |
+
CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */
|
| 110 |
+
CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */
|
| 111 |
+
} cudnnRNNClipMode_t;
|
| 112 |
+
|
| 113 |
+
typedef enum {
|
| 114 |
+
CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */
|
| 115 |
+
CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */
|
| 116 |
+
CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */
|
| 117 |
+
} cudnnRNNDataLayout_t;
|
| 118 |
+
|
| 119 |
+
/* Legacy type for backward compatibility */
|
| 120 |
+
typedef unsigned cudnnRNNPaddingMode_t;
|
| 121 |
+
|
| 122 |
+
/* For auxFlags in cudnnSetRNNDescriptor_v8() and cudnnSetRNNPaddingMode() */
|
| 123 |
+
#define CUDNN_RNN_PADDED_IO_DISABLED 0
|
| 124 |
+
#define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0)
|
| 125 |
+
|
| 126 |
+
struct cudnnRNNStruct;
|
| 127 |
+
typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t;
|
| 128 |
+
|
| 129 |
+
struct cudnnPersistentRNNPlan;
|
| 130 |
+
typedef struct cudnnPersistentRNNPlan *cudnnPersistentRNNPlan_t;
|
| 131 |
+
|
| 132 |
+
struct cudnnRNNDataStruct;
|
| 133 |
+
typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t;
|
| 134 |
+
|
| 135 |
+
cudnnStatus_t CUDNNWINAPI
|
| 136 |
+
cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc);
|
| 137 |
+
|
| 138 |
+
cudnnStatus_t CUDNNWINAPI
|
| 139 |
+
cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc);
|
| 140 |
+
|
| 141 |
+
cudnnStatus_t CUDNNWINAPI
|
| 142 |
+
cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
|
| 143 |
+
cudnnRNNAlgo_t algo,
|
| 144 |
+
cudnnRNNMode_t cellMode,
|
| 145 |
+
cudnnRNNBiasMode_t biasMode,
|
| 146 |
+
cudnnDirectionMode_t dirMode,
|
| 147 |
+
cudnnRNNInputMode_t inputMode,
|
| 148 |
+
cudnnDataType_t dataType,
|
| 149 |
+
cudnnDataType_t mathPrec,
|
| 150 |
+
cudnnMathType_t mathType,
|
| 151 |
+
int32_t inputSize,
|
| 152 |
+
int32_t hiddenSize,
|
| 153 |
+
int32_t projSize,
|
| 154 |
+
int32_t numLayers,
|
| 155 |
+
cudnnDropoutDescriptor_t dropoutDesc,
|
| 156 |
+
uint32_t auxFlags);
|
| 157 |
+
|
| 158 |
+
cudnnStatus_t CUDNNWINAPI
|
| 159 |
+
cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
|
| 160 |
+
cudnnRNNAlgo_t *algo,
|
| 161 |
+
cudnnRNNMode_t *cellMode,
|
| 162 |
+
cudnnRNNBiasMode_t *biasMode,
|
| 163 |
+
cudnnDirectionMode_t *dirMode,
|
| 164 |
+
cudnnRNNInputMode_t *inputMode,
|
| 165 |
+
cudnnDataType_t *dataType,
|
| 166 |
+
cudnnDataType_t *mathPrec,
|
| 167 |
+
cudnnMathType_t *mathType,
|
| 168 |
+
int32_t *inputSize,
|
| 169 |
+
int32_t *hiddenSize,
|
| 170 |
+
int32_t *projSize,
|
| 171 |
+
int32_t *numLayers,
|
| 172 |
+
cudnnDropoutDescriptor_t *dropoutDesc,
|
| 173 |
+
uint32_t *auxFlags);
|
| 174 |
+
|
| 175 |
+
/*
|
| 176 |
+
* mathPrec in cudnnSetRNNDescriptor_v6() specifies compute precision
|
| 177 |
+
* compute precision is further modified by cudnnSetRNNMatrixMathType()
|
| 178 |
+
* dataType in cudnnGetRNNParamsSize() and wDesc specify weight storage
|
| 179 |
+
* dropout is between RNN layers, not between recurrent steps
|
| 180 |
+
*/
|
| 181 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 182 |
+
cudnnSetRNNDescriptor_v6(cudnnHandle_t handle,
|
| 183 |
+
cudnnRNNDescriptor_t rnnDesc,
|
| 184 |
+
const int hiddenSize,
|
| 185 |
+
const int numLayers,
|
| 186 |
+
cudnnDropoutDescriptor_t dropoutDesc,
|
| 187 |
+
cudnnRNNInputMode_t inputMode,
|
| 188 |
+
cudnnDirectionMode_t direction,
|
| 189 |
+
cudnnRNNMode_t cellMode,
|
| 190 |
+
cudnnRNNAlgo_t algo,
|
| 191 |
+
cudnnDataType_t mathPrec);
|
| 192 |
+
|
| 193 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 194 |
+
cudnnGetRNNDescriptor_v6(cudnnHandle_t handle,
|
| 195 |
+
cudnnRNNDescriptor_t rnnDesc,
|
| 196 |
+
int *hiddenSize,
|
| 197 |
+
int *numLayers,
|
| 198 |
+
cudnnDropoutDescriptor_t *dropoutDesc,
|
| 199 |
+
cudnnRNNInputMode_t *inputMode,
|
| 200 |
+
cudnnDirectionMode_t *direction,
|
| 201 |
+
cudnnRNNMode_t *cellMode,
|
| 202 |
+
cudnnRNNAlgo_t *algo,
|
| 203 |
+
cudnnDataType_t *mathPrec);
|
| 204 |
+
|
| 205 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 206 |
+
cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType);
|
| 207 |
+
|
| 208 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 209 |
+
cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType);
|
| 210 |
+
|
| 211 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 212 |
+
cudnnSetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t biasMode);
|
| 213 |
+
|
| 214 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 215 |
+
cudnnGetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t *biasMode);
|
| 216 |
+
|
| 217 |
+
cudnnStatus_t CUDNNWINAPI
|
| 218 |
+
cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc,
|
| 219 |
+
cudnnRNNClipMode_t clipMode,
|
| 220 |
+
cudnnNanPropagation_t clipNanOpt,
|
| 221 |
+
double lclip,
|
| 222 |
+
double rclip);
|
| 223 |
+
|
| 224 |
+
cudnnStatus_t CUDNNWINAPI
|
| 225 |
+
cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc,
|
| 226 |
+
cudnnRNNClipMode_t *clipMode,
|
| 227 |
+
cudnnNanPropagation_t *clipNanOpt,
|
| 228 |
+
double *lclip,
|
| 229 |
+
double *rclip);
|
| 230 |
+
|
| 231 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 232 |
+
cudnnRNNSetClip(cudnnHandle_t handle,
|
| 233 |
+
cudnnRNNDescriptor_t rnnDesc,
|
| 234 |
+
cudnnRNNClipMode_t clipMode,
|
| 235 |
+
cudnnNanPropagation_t clipNanOpt,
|
| 236 |
+
double lclip,
|
| 237 |
+
double rclip);
|
| 238 |
+
|
| 239 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 240 |
+
cudnnRNNGetClip(cudnnHandle_t handle,
|
| 241 |
+
cudnnRNNDescriptor_t rnnDesc,
|
| 242 |
+
cudnnRNNClipMode_t *clipMode,
|
| 243 |
+
cudnnNanPropagation_t *clipNanOpt,
|
| 244 |
+
double *lclip,
|
| 245 |
+
double *rclip);
|
| 246 |
+
|
| 247 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 248 |
+
cudnnSetRNNProjectionLayers(cudnnHandle_t handle,
|
| 249 |
+
cudnnRNNDescriptor_t rnnDesc,
|
| 250 |
+
const int recProjSize,
|
| 251 |
+
const int outProjSize);
|
| 252 |
+
|
| 253 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 254 |
+
cudnnGetRNNProjectionLayers(cudnnHandle_t handle,
|
| 255 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 256 |
+
int *recProjSize,
|
| 257 |
+
int *outProjSize);
|
| 258 |
+
|
| 259 |
+
/* Expensive. Creates the plan for the specific settings. */
|
| 260 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 261 |
+
cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc,
|
| 262 |
+
const int minibatch,
|
| 263 |
+
const cudnnDataType_t dataType,
|
| 264 |
+
cudnnPersistentRNNPlan_t *plan);
|
| 265 |
+
|
| 266 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 267 |
+
cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan);
|
| 268 |
+
|
| 269 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 270 |
+
cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan);
|
| 271 |
+
|
| 272 |
+
cudnnStatus_t CUDNNWINAPI
|
| 273 |
+
cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch);
|
| 274 |
+
|
| 275 |
+
/* dataType in weight descriptors and input descriptors is used to describe storage */
|
| 276 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 277 |
+
cudnnGetRNNWorkspaceSize(cudnnHandle_t handle,
|
| 278 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 279 |
+
const int seqLength,
|
| 280 |
+
const cudnnTensorDescriptor_t *xDesc,
|
| 281 |
+
size_t *sizeInBytes);
|
| 282 |
+
|
| 283 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 284 |
+
cudnnGetRNNTrainingReserveSize(cudnnHandle_t handle,
|
| 285 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 286 |
+
const int seqLength,
|
| 287 |
+
const cudnnTensorDescriptor_t *xDesc,
|
| 288 |
+
size_t *sizeInBytes);
|
| 289 |
+
|
| 290 |
+
cudnnStatus_t CUDNNWINAPI
|
| 291 |
+
cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle,
|
| 292 |
+
cudnnRNNDescriptor_t rnnDesc,
|
| 293 |
+
cudnnForwardMode_t fMode,
|
| 294 |
+
cudnnRNNDataDescriptor_t xDesc,
|
| 295 |
+
size_t *workSpaceSize,
|
| 296 |
+
size_t *reserveSpaceSize);
|
| 297 |
+
|
| 298 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 299 |
+
cudnnGetRNNParamsSize(cudnnHandle_t handle,
|
| 300 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 301 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 302 |
+
size_t *sizeInBytes,
|
| 303 |
+
cudnnDataType_t dataType);
|
| 304 |
+
|
| 305 |
+
cudnnStatus_t CUDNNWINAPI
|
| 306 |
+
cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize);
|
| 307 |
+
|
| 308 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 309 |
+
cudnnGetRNNLinLayerMatrixParams(cudnnHandle_t handle,
|
| 310 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 311 |
+
const int pseudoLayer,
|
| 312 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 313 |
+
const cudnnFilterDescriptor_t wDesc,
|
| 314 |
+
const void *w,
|
| 315 |
+
const int linLayerID,
|
| 316 |
+
cudnnFilterDescriptor_t linLayerMatDesc,
|
| 317 |
+
void **linLayerMat);
|
| 318 |
+
|
| 319 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 320 |
+
cudnnGetRNNLinLayerBiasParams(cudnnHandle_t handle,
|
| 321 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 322 |
+
const int pseudoLayer,
|
| 323 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 324 |
+
const cudnnFilterDescriptor_t wDesc,
|
| 325 |
+
const void *w,
|
| 326 |
+
const int linLayerID,
|
| 327 |
+
cudnnFilterDescriptor_t linLayerBiasDesc,
|
| 328 |
+
void **linLayerBias);
|
| 329 |
+
|
| 330 |
+
cudnnStatus_t CUDNNWINAPI
|
| 331 |
+
cudnnGetRNNWeightParams(cudnnHandle_t handle,
|
| 332 |
+
cudnnRNNDescriptor_t rnnDesc,
|
| 333 |
+
int32_t pseudoLayer,
|
| 334 |
+
size_t weightSpaceSize,
|
| 335 |
+
const void *weightSpace,
|
| 336 |
+
int32_t linLayerID,
|
| 337 |
+
cudnnTensorDescriptor_t mDesc,
|
| 338 |
+
void **mAddr,
|
| 339 |
+
cudnnTensorDescriptor_t bDesc,
|
| 340 |
+
void **bAddr);
|
| 341 |
+
|
| 342 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 343 |
+
cudnnRNNForwardInference(cudnnHandle_t handle,
|
| 344 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 345 |
+
const int seqLength,
|
| 346 |
+
const cudnnTensorDescriptor_t *xDesc,
|
| 347 |
+
const void *x,
|
| 348 |
+
const cudnnTensorDescriptor_t hxDesc,
|
| 349 |
+
const void *hx,
|
| 350 |
+
const cudnnTensorDescriptor_t cxDesc,
|
| 351 |
+
const void *cx,
|
| 352 |
+
const cudnnFilterDescriptor_t wDesc,
|
| 353 |
+
const void *w,
|
| 354 |
+
const cudnnTensorDescriptor_t *yDesc,
|
| 355 |
+
void *y,
|
| 356 |
+
const cudnnTensorDescriptor_t hyDesc,
|
| 357 |
+
void *hy,
|
| 358 |
+
const cudnnTensorDescriptor_t cyDesc,
|
| 359 |
+
void *cy,
|
| 360 |
+
void *workSpace,
|
| 361 |
+
size_t workSpaceSizeInBytes);
|
| 362 |
+
|
| 363 |
+
/* RNN EX API */
|
| 364 |
+
|
| 365 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 366 |
+
cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned paddingMode);
|
| 367 |
+
|
| 368 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 369 |
+
cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned *paddingMode);
|
| 370 |
+
|
| 371 |
+
cudnnStatus_t CUDNNWINAPI
|
| 372 |
+
cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc);
|
| 373 |
+
|
| 374 |
+
cudnnStatus_t CUDNNWINAPI
|
| 375 |
+
cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc);
|
| 376 |
+
|
| 377 |
+
cudnnStatus_t CUDNNWINAPI
|
| 378 |
+
cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
|
| 379 |
+
cudnnDataType_t dataType,
|
| 380 |
+
cudnnRNNDataLayout_t layout,
|
| 381 |
+
int maxSeqLength,
|
| 382 |
+
int batchSize,
|
| 383 |
+
int vectorSize,
|
| 384 |
+
const int seqLengthArray[], /* length of each sequence in the batch */
|
| 385 |
+
void *paddingFill); /* symbol for filling padding position in output */
|
| 386 |
+
|
| 387 |
+
cudnnStatus_t CUDNNWINAPI
|
| 388 |
+
cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
|
| 389 |
+
cudnnDataType_t *dataType,
|
| 390 |
+
cudnnRNNDataLayout_t *layout,
|
| 391 |
+
int *maxSeqLength,
|
| 392 |
+
int *batchSize,
|
| 393 |
+
int *vectorSize,
|
| 394 |
+
int arrayLengthRequested,
|
| 395 |
+
int seqLengthArray[],
|
| 396 |
+
void *paddingFill);
|
| 397 |
+
|
| 398 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 399 |
+
cudnnRNNForwardInferenceEx(cudnnHandle_t handle,
|
| 400 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 401 |
+
const cudnnRNNDataDescriptor_t xDesc,
|
| 402 |
+
const void *x,
|
| 403 |
+
const cudnnTensorDescriptor_t hxDesc,
|
| 404 |
+
const void *hx,
|
| 405 |
+
const cudnnTensorDescriptor_t cxDesc,
|
| 406 |
+
const void *cx,
|
| 407 |
+
const cudnnFilterDescriptor_t wDesc,
|
| 408 |
+
const void *w,
|
| 409 |
+
const cudnnRNNDataDescriptor_t yDesc,
|
| 410 |
+
void *y,
|
| 411 |
+
const cudnnTensorDescriptor_t hyDesc,
|
| 412 |
+
void *hy,
|
| 413 |
+
const cudnnTensorDescriptor_t cyDesc,
|
| 414 |
+
void *cy,
|
| 415 |
+
const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
|
| 416 |
+
const void *keys, /* reserved, should pass NULL */
|
| 417 |
+
const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
|
| 418 |
+
void *cAttn, /* reserved, should pass NULL */
|
| 419 |
+
const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
|
| 420 |
+
void *iAttn, /* reserved, should pass NULL */
|
| 421 |
+
const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
|
| 422 |
+
void *queries, /* reserved, should pass NULL */
|
| 423 |
+
void *workSpace,
|
| 424 |
+
size_t workSpaceSizeInBytes);
|
| 425 |
+
|
| 426 |
+
cudnnStatus_t CUDNNWINAPI
|
| 427 |
+
cudnnRNNForward(cudnnHandle_t handle,
|
| 428 |
+
cudnnRNNDescriptor_t rnnDesc,
|
| 429 |
+
cudnnForwardMode_t fwdMode,
|
| 430 |
+
const int32_t devSeqLengths[],
|
| 431 |
+
cudnnRNNDataDescriptor_t xDesc,
|
| 432 |
+
const void *x,
|
| 433 |
+
cudnnRNNDataDescriptor_t yDesc,
|
| 434 |
+
void *y,
|
| 435 |
+
cudnnTensorDescriptor_t hDesc,
|
| 436 |
+
const void *hx,
|
| 437 |
+
void *hy,
|
| 438 |
+
cudnnTensorDescriptor_t cDesc,
|
| 439 |
+
const void *cx,
|
| 440 |
+
void *cy,
|
| 441 |
+
size_t weightSpaceSize,
|
| 442 |
+
const void *weightSpace,
|
| 443 |
+
size_t workSpaceSize,
|
| 444 |
+
void *workSpace,
|
| 445 |
+
size_t reserveSpaceSize,
|
| 446 |
+
void *reserveSpace);
|
| 447 |
+
|
| 448 |
+
/* RNN FIND API */
|
| 449 |
+
|
| 450 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 451 |
+
cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, cudnnAlgorithmDescriptor_t algoDesc);
|
| 452 |
+
|
| 453 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 454 |
+
cudnnGetRNNForwardInferenceAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
|
| 455 |
+
|
| 456 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 457 |
+
cudnnFindRNNForwardInferenceAlgorithmEx(cudnnHandle_t handle,
|
| 458 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 459 |
+
const int seqLength,
|
| 460 |
+
const cudnnTensorDescriptor_t *xDesc,
|
| 461 |
+
const void *x,
|
| 462 |
+
const cudnnTensorDescriptor_t hxDesc,
|
| 463 |
+
const void *hx,
|
| 464 |
+
const cudnnTensorDescriptor_t cxDesc,
|
| 465 |
+
const void *cx,
|
| 466 |
+
const cudnnFilterDescriptor_t wDesc,
|
| 467 |
+
const void *w,
|
| 468 |
+
const cudnnTensorDescriptor_t *yDesc,
|
| 469 |
+
void *y,
|
| 470 |
+
const cudnnTensorDescriptor_t hyDesc,
|
| 471 |
+
void *hy,
|
| 472 |
+
const cudnnTensorDescriptor_t cyDesc,
|
| 473 |
+
void *cy,
|
| 474 |
+
const float findIntensity,
|
| 475 |
+
const int requestedAlgoCount,
|
| 476 |
+
int *returnedAlgoCount,
|
| 477 |
+
cudnnAlgorithmPerformance_t *perfResults,
|
| 478 |
+
void *workspace,
|
| 479 |
+
size_t workSpaceSizeInBytes);
|
| 480 |
+
|
| 481 |
+
/* Sequence data descriptor */
|
| 482 |
+
|
| 483 |
+
typedef enum {
|
| 484 |
+
CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */
|
| 485 |
+
CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */
|
| 486 |
+
CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */
|
| 487 |
+
CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */
|
| 488 |
+
} cudnnSeqDataAxis_t;
|
| 489 |
+
|
| 490 |
+
struct cudnnSeqDataStruct;
|
| 491 |
+
typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t;
|
| 492 |
+
|
| 493 |
+
#define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */
|
| 494 |
+
|
| 495 |
+
cudnnStatus_t CUDNNWINAPI
|
| 496 |
+
cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc);
|
| 497 |
+
|
| 498 |
+
cudnnStatus_t CUDNNWINAPI
|
| 499 |
+
cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc);
|
| 500 |
+
|
| 501 |
+
cudnnStatus_t CUDNNWINAPI
|
| 502 |
+
cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc,
|
| 503 |
+
cudnnDataType_t dataType,
|
| 504 |
+
int nbDims,
|
| 505 |
+
const int dimA[],
|
| 506 |
+
const cudnnSeqDataAxis_t axes[],
|
| 507 |
+
size_t seqLengthArraySize,
|
| 508 |
+
const int seqLengthArray[],
|
| 509 |
+
void *paddingFill);
|
| 510 |
+
|
| 511 |
+
cudnnStatus_t CUDNNWINAPI
|
| 512 |
+
cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc,
|
| 513 |
+
cudnnDataType_t *dataType,
|
| 514 |
+
int *nbDims,
|
| 515 |
+
int nbDimsRequested,
|
| 516 |
+
int dimA[],
|
| 517 |
+
cudnnSeqDataAxis_t axes[],
|
| 518 |
+
size_t *seqLengthArraySize,
|
| 519 |
+
size_t seqLengthSizeRequested,
|
| 520 |
+
int seqLengthArray[],
|
| 521 |
+
void *paddingFill);
|
| 522 |
+
|
| 523 |
+
/* Multihead Attention */
|
| 524 |
+
|
| 525 |
+
/* Legacy type for backward compatibility */
|
| 526 |
+
typedef unsigned cudnnAttnQueryMap_t;
|
| 527 |
+
|
| 528 |
+
/*
|
| 529 |
+
* Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor().
|
| 530 |
+
* Use the bitwise OR operator to combine several settings listed below. Additional
|
| 531 |
+
* minor options can be added here w/o changing or introducing new API functions.
|
| 532 |
+
*/
|
| 533 |
+
#define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */
|
| 534 |
+
#define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */
|
| 535 |
+
#define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */
|
| 536 |
+
#define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */
|
| 537 |
+
|
| 538 |
+
struct cudnnAttnStruct;
|
| 539 |
+
typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t;
|
| 540 |
+
|
| 541 |
+
cudnnStatus_t CUDNNWINAPI
|
| 542 |
+
cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc);
|
| 543 |
+
|
| 544 |
+
cudnnStatus_t CUDNNWINAPI
|
| 545 |
+
cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc);
|
| 546 |
+
|
| 547 |
+
cudnnStatus_t CUDNNWINAPI
|
| 548 |
+
cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
|
| 549 |
+
unsigned attnMode,
|
| 550 |
+
int nHeads,
|
| 551 |
+
double smScaler,
|
| 552 |
+
cudnnDataType_t dataType,
|
| 553 |
+
cudnnDataType_t computePrec,
|
| 554 |
+
cudnnMathType_t mathType,
|
| 555 |
+
cudnnDropoutDescriptor_t attnDropoutDesc,
|
| 556 |
+
cudnnDropoutDescriptor_t postDropoutDesc,
|
| 557 |
+
int qSize,
|
| 558 |
+
int kSize,
|
| 559 |
+
int vSize,
|
| 560 |
+
int qProjSize,
|
| 561 |
+
int kProjSize,
|
| 562 |
+
int vProjSize,
|
| 563 |
+
int oProjSize,
|
| 564 |
+
int qoMaxSeqLength,
|
| 565 |
+
int kvMaxSeqLength,
|
| 566 |
+
int maxBatchSize,
|
| 567 |
+
int maxBeamSize);
|
| 568 |
+
|
| 569 |
+
cudnnStatus_t CUDNNWINAPI
|
| 570 |
+
cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
|
| 571 |
+
unsigned *attnMode,
|
| 572 |
+
int *nHeads,
|
| 573 |
+
double *smScaler,
|
| 574 |
+
cudnnDataType_t *dataType,
|
| 575 |
+
cudnnDataType_t *computePrec,
|
| 576 |
+
cudnnMathType_t *mathType,
|
| 577 |
+
cudnnDropoutDescriptor_t *attnDropoutDesc,
|
| 578 |
+
cudnnDropoutDescriptor_t *postDropoutDesc,
|
| 579 |
+
int *qSize,
|
| 580 |
+
int *kSize,
|
| 581 |
+
int *vSize,
|
| 582 |
+
int *qProjSize,
|
| 583 |
+
int *kProjSize,
|
| 584 |
+
int *vProjSize,
|
| 585 |
+
int *oProjSize,
|
| 586 |
+
int *qoMaxSeqLength,
|
| 587 |
+
int *kvMaxSeqLength,
|
| 588 |
+
int *maxBatchSize,
|
| 589 |
+
int *maxBeamSize);
|
| 590 |
+
|
| 591 |
+
cudnnStatus_t CUDNNWINAPI
|
| 592 |
+
cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle,
|
| 593 |
+
const cudnnAttnDescriptor_t attnDesc,
|
| 594 |
+
size_t *weightSizeInBytes,
|
| 595 |
+
size_t *workSpaceSizeInBytes,
|
| 596 |
+
size_t *reserveSpaceSizeInBytes);
|
| 597 |
+
|
| 598 |
+
typedef enum {
|
| 599 |
+
CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */
|
| 600 |
+
CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */
|
| 601 |
+
CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */
|
| 602 |
+
CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */
|
| 603 |
+
CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */
|
| 604 |
+
CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */
|
| 605 |
+
CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */
|
| 606 |
+
CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */
|
| 607 |
+
} cudnnMultiHeadAttnWeightKind_t;
|
| 608 |
+
|
| 609 |
+
#define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */
|
| 610 |
+
|
| 611 |
+
cudnnStatus_t CUDNNWINAPI
|
| 612 |
+
cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle,
|
| 613 |
+
const cudnnAttnDescriptor_t attnDesc,
|
| 614 |
+
cudnnMultiHeadAttnWeightKind_t wKind,
|
| 615 |
+
size_t weightSizeInBytes,
|
| 616 |
+
const void *weights,
|
| 617 |
+
cudnnTensorDescriptor_t wDesc,
|
| 618 |
+
void **wAddr);
|
| 619 |
+
|
| 620 |
+
cudnnStatus_t CUDNNWINAPI
|
| 621 |
+
cudnnMultiHeadAttnForward(cudnnHandle_t handle,
|
| 622 |
+
const cudnnAttnDescriptor_t attnDesc,
|
| 623 |
+
int currIdx,
|
| 624 |
+
const int loWinIdx[],
|
| 625 |
+
const int hiWinIdx[],
|
| 626 |
+
const int devSeqLengthsQO[],
|
| 627 |
+
const int devSeqLengthsKV[],
|
| 628 |
+
const cudnnSeqDataDescriptor_t qDesc,
|
| 629 |
+
const void *queries,
|
| 630 |
+
const void *residuals,
|
| 631 |
+
const cudnnSeqDataDescriptor_t kDesc,
|
| 632 |
+
const void *keys,
|
| 633 |
+
const cudnnSeqDataDescriptor_t vDesc,
|
| 634 |
+
const void *values,
|
| 635 |
+
const cudnnSeqDataDescriptor_t oDesc,
|
| 636 |
+
void *out,
|
| 637 |
+
size_t weightSizeInBytes,
|
| 638 |
+
const void *weights,
|
| 639 |
+
size_t workSpaceSizeInBytes,
|
| 640 |
+
void *workSpace,
|
| 641 |
+
size_t reserveSpaceSizeInBytes,
|
| 642 |
+
void *reserveSpace);
|
| 643 |
+
|
| 644 |
+
/*
|
| 645 |
+
* \brief Cross-library version checker.
|
| 646 |
+
* This function is implemented differently in each sub-library. Each sublib
|
| 647 |
+
* checks whether its own version matches that of its dependencies.
|
| 648 |
+
* \returns CUDNN_STATUS_SUCCESS if the version check passes,
|
| 649 |
+
* CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
|
| 650 |
+
*/
|
| 651 |
+
cudnnStatus_t CUDNNWINAPI
|
| 652 |
+
cudnnAdvInferVersionCheck(void);
|
| 653 |
+
|
| 654 |
+
#if defined(__cplusplus)
|
| 655 |
+
}
|
| 656 |
+
#endif
|
| 657 |
+
|
| 658 |
+
#endif /* CUDNN_ADV_INFER_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py
ADDED
|
File without changes
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (220 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (220 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h
ADDED
|
@@ -0,0 +1,448 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*************************************************************************
|
| 2 |
+
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* See LICENSE.txt for license information
|
| 5 |
+
************************************************************************/
|
| 6 |
+
|
| 7 |
+
#ifndef NCCL_H_
|
| 8 |
+
#define NCCL_H_
|
| 9 |
+
|
| 10 |
+
#include <cuda_runtime.h>
|
| 11 |
+
#include <cuda_fp16.h>
|
| 12 |
+
#if CUDART_VERSION >= 11000
|
| 13 |
+
#include <cuda_bf16.h>
|
| 14 |
+
#endif
|
| 15 |
+
|
| 16 |
+
#define NCCL_MAJOR 2
|
| 17 |
+
#define NCCL_MINOR 20
|
| 18 |
+
#define NCCL_PATCH 5
|
| 19 |
+
#define NCCL_SUFFIX ""
|
| 20 |
+
|
| 21 |
+
#define NCCL_VERSION_CODE 22005
|
| 22 |
+
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
|
| 23 |
+
|
| 24 |
+
#ifdef __cplusplus
|
| 25 |
+
extern "C" {
|
| 26 |
+
#endif
|
| 27 |
+
|
| 28 |
+
#include <limits.h>
|
| 29 |
+
/* Opaque handle to communicator */
|
| 30 |
+
typedef struct ncclComm* ncclComm_t;
|
| 31 |
+
#define NCCL_COMM_NULL NULL
|
| 32 |
+
|
| 33 |
+
#define NCCL_UNIQUE_ID_BYTES 128
|
| 34 |
+
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
|
| 35 |
+
|
| 36 |
+
/* Error type */
|
| 37 |
+
typedef enum { ncclSuccess = 0,
|
| 38 |
+
ncclUnhandledCudaError = 1,
|
| 39 |
+
ncclSystemError = 2,
|
| 40 |
+
ncclInternalError = 3,
|
| 41 |
+
ncclInvalidArgument = 4,
|
| 42 |
+
ncclInvalidUsage = 5,
|
| 43 |
+
ncclRemoteError = 6,
|
| 44 |
+
ncclInProgress = 7,
|
| 45 |
+
ncclNumResults = 8 } ncclResult_t;
|
| 46 |
+
|
| 47 |
+
#define NCCL_CONFIG_UNDEF_INT INT_MIN
|
| 48 |
+
#define NCCL_CONFIG_UNDEF_PTR NULL
|
| 49 |
+
#define NCCL_SPLIT_NOCOLOR -1
|
| 50 |
+
|
| 51 |
+
/* Communicator configuration. Users can assign value to attributes to specify the
|
| 52 |
+
* behavior of a communicator. */
|
| 53 |
+
typedef struct ncclConfig_v21700 {
|
| 54 |
+
/* attributes that users should never touch. */
|
| 55 |
+
size_t size;
|
| 56 |
+
unsigned int magic;
|
| 57 |
+
unsigned int version;
|
| 58 |
+
/* attributes that users are able to customize. */
|
| 59 |
+
int blocking;
|
| 60 |
+
int cgaClusterSize;
|
| 61 |
+
int minCTAs;
|
| 62 |
+
int maxCTAs;
|
| 63 |
+
const char *netName;
|
| 64 |
+
int splitShare;
|
| 65 |
+
} ncclConfig_t;
|
| 66 |
+
|
| 67 |
+
/* Config initializer must be assigned to initialize config structure when it is created.
|
| 68 |
+
* Not initialized config will result in NCCL error. */
|
| 69 |
+
#define NCCL_CONFIG_INITIALIZER { \
|
| 70 |
+
sizeof(ncclConfig_t), /* size */ \
|
| 71 |
+
0xcafebeef, /* magic */ \
|
| 72 |
+
NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
|
| 73 |
+
NCCL_CONFIG_UNDEF_INT, /* blocking */ \
|
| 74 |
+
NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
|
| 75 |
+
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
|
| 76 |
+
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
|
| 77 |
+
NCCL_CONFIG_UNDEF_PTR, /* netName */ \
|
| 78 |
+
NCCL_CONFIG_UNDEF_INT /* splitShare */ \
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
/* NCCL malloc and free function for all types of NCCL optimizations
|
| 82 |
+
* (e.g. user buffer registration). The actual allocated size might
|
| 83 |
+
* be larger than requested due to granularity requirement. */
|
| 84 |
+
ncclResult_t ncclMemAlloc(void** ptr, size_t size);
|
| 85 |
+
ncclResult_t pncclMemAlloc(void** ptr, size_t size);
|
| 86 |
+
|
| 87 |
+
ncclResult_t ncclMemFree(void *ptr);
|
| 88 |
+
ncclResult_t pncclMemFree(void *ptr);
|
| 89 |
+
|
| 90 |
+
/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
|
| 91 |
+
* This integer is coded with the MAJOR, MINOR and PATCH level of the
|
| 92 |
+
* NCCL library
|
| 93 |
+
*/
|
| 94 |
+
ncclResult_t ncclGetVersion(int *version);
|
| 95 |
+
ncclResult_t pncclGetVersion(int *version);
|
| 96 |
+
|
| 97 |
+
/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
|
| 98 |
+
* called once and the Id should be distributed to all ranks in the
|
| 99 |
+
* communicator before calling ncclCommInitRank. */
|
| 100 |
+
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
|
| 101 |
+
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
|
| 102 |
+
|
| 103 |
+
/* Create a new communicator (multi thread/process version) with a configuration
|
| 104 |
+
* set by users. */
|
| 105 |
+
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
|
| 106 |
+
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
|
| 107 |
+
|
| 108 |
+
/* Creates a new communicator (multi thread/process version).
|
| 109 |
+
* rank must be between 0 and nranks-1 and unique within a communicator clique.
|
| 110 |
+
* Each rank is associated to a CUDA device, which has to be set before calling
|
| 111 |
+
* ncclCommInitRank.
|
| 112 |
+
* ncclCommInitRank implicitly syncronizes with other ranks, so it must be
|
| 113 |
+
* called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
|
| 114 |
+
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
|
| 115 |
+
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
|
| 116 |
+
|
| 117 |
+
/* Creates a clique of communicators (single process version).
|
| 118 |
+
* This is a convenience function to create a single-process communicator clique.
|
| 119 |
+
* Returns an array of ndev newly initialized communicators in comm.
|
| 120 |
+
* comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
|
| 121 |
+
* If devlist is NULL, the first ndev CUDA devices are used.
|
| 122 |
+
* Order of devlist defines user-order of processors within the communicator. */
|
| 123 |
+
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
| 124 |
+
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
| 125 |
+
|
| 126 |
+
/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
|
| 127 |
+
* and marks communicator state as ncclInProgress. The state will change to ncclSuccess
|
| 128 |
+
* when the communicator is globally quiescent and related resources are freed; then,
|
| 129 |
+
* calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
|
| 130 |
+
* itself) without blocking. */
|
| 131 |
+
ncclResult_t ncclCommFinalize(ncclComm_t comm);
|
| 132 |
+
ncclResult_t pncclCommFinalize(ncclComm_t comm);
|
| 133 |
+
|
| 134 |
+
/* Frees local resources associated with communicator object. */
|
| 135 |
+
ncclResult_t ncclCommDestroy(ncclComm_t comm);
|
| 136 |
+
ncclResult_t pncclCommDestroy(ncclComm_t comm);
|
| 137 |
+
|
| 138 |
+
/* Frees resources associated with communicator object and aborts any operations
|
| 139 |
+
* that might still be running on the device. */
|
| 140 |
+
ncclResult_t ncclCommAbort(ncclComm_t comm);
|
| 141 |
+
ncclResult_t pncclCommAbort(ncclComm_t comm);
|
| 142 |
+
|
| 143 |
+
/* Creates one or more communicators from an existing one.
|
| 144 |
+
* Ranks with the same color will end up in the same communicator.
|
| 145 |
+
* Within the new communicator, key will be used to order ranks.
|
| 146 |
+
* NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
|
| 147 |
+
* and will therefore return a NULL communicator.
|
| 148 |
+
* If config is NULL, the new communicator will inherit the original communicator's
|
| 149 |
+
* configuration*/
|
| 150 |
+
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
|
| 151 |
+
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
|
| 152 |
+
|
| 153 |
+
/* Returns a string for each error code. */
|
| 154 |
+
const char* ncclGetErrorString(ncclResult_t result);
|
| 155 |
+
const char* pncclGetErrorString(ncclResult_t result);
|
| 156 |
+
|
| 157 |
+
/* Returns a human-readable message of the last error that occurred. */
|
| 158 |
+
const char* ncclGetLastError(ncclComm_t comm);
|
| 159 |
+
const char* pncclGetLastError(ncclComm_t comm);
|
| 160 |
+
|
| 161 |
+
/* Checks whether the comm has encountered any asynchronous errors */
|
| 162 |
+
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
| 163 |
+
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
| 164 |
+
|
| 165 |
+
/* Gets the number of ranks in the communicator clique. */
|
| 166 |
+
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
|
| 167 |
+
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
|
| 168 |
+
|
| 169 |
+
/* Returns the cuda device number associated with the communicator. */
|
| 170 |
+
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
|
| 171 |
+
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
|
| 172 |
+
|
| 173 |
+
/* Returns the user-ordered "rank" associated with the communicator. */
|
| 174 |
+
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
|
| 175 |
+
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
/* Register CUDA buffer for zero-copy operation */
|
| 179 |
+
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
| 180 |
+
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
| 181 |
+
|
| 182 |
+
/* Deregister CUDA buffer */
|
| 183 |
+
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
|
| 184 |
+
ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
|
| 185 |
+
|
| 186 |
+
/* Reduction operation selector */
|
| 187 |
+
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
|
| 188 |
+
typedef enum { ncclSum = 0,
|
| 189 |
+
ncclProd = 1,
|
| 190 |
+
ncclMax = 2,
|
| 191 |
+
ncclMin = 3,
|
| 192 |
+
ncclAvg = 4,
|
| 193 |
+
/* ncclNumOps: The number of built-in ncclRedOp_t values. Also
|
| 194 |
+
* serves as the least possible value for dynamic ncclRedOp_t's
|
| 195 |
+
* as constructed by ncclRedOpCreate*** functions. */
|
| 196 |
+
ncclNumOps = 5,
|
| 197 |
+
/* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
|
| 198 |
+
* It is defined to be the largest signed value (since compilers
|
| 199 |
+
* are permitted to use signed enums) that won't grow
|
| 200 |
+
* sizeof(ncclRedOp_t) when compared to previous NCCL versions to
|
| 201 |
+
* maintain ABI compatibility. */
|
| 202 |
+
ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
|
| 203 |
+
} ncclRedOp_t;
|
| 204 |
+
|
| 205 |
+
/* Data types */
|
| 206 |
+
typedef enum { ncclInt8 = 0, ncclChar = 0,
|
| 207 |
+
ncclUint8 = 1,
|
| 208 |
+
ncclInt32 = 2, ncclInt = 2,
|
| 209 |
+
ncclUint32 = 3,
|
| 210 |
+
ncclInt64 = 4,
|
| 211 |
+
ncclUint64 = 5,
|
| 212 |
+
ncclFloat16 = 6, ncclHalf = 6,
|
| 213 |
+
ncclFloat32 = 7, ncclFloat = 7,
|
| 214 |
+
ncclFloat64 = 8, ncclDouble = 8,
|
| 215 |
+
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
| 216 |
+
ncclBfloat16 = 9,
|
| 217 |
+
ncclNumTypes = 10
|
| 218 |
+
#else
|
| 219 |
+
ncclNumTypes = 9
|
| 220 |
+
#endif
|
| 221 |
+
} ncclDataType_t;
|
| 222 |
+
|
| 223 |
+
/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
|
| 224 |
+
typedef enum {
|
| 225 |
+
/* ncclScalarDevice: The scalar is in device-visible memory and will be
|
| 226 |
+
* dereferenced while the collective is running. */
|
| 227 |
+
ncclScalarDevice = 0,
|
| 228 |
+
|
| 229 |
+
/* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
|
| 230 |
+
* dereferenced before the ncclRedOpCreate***() function returns. */
|
| 231 |
+
ncclScalarHostImmediate = 1
|
| 232 |
+
} ncclScalarResidence_t;
|
| 233 |
+
|
| 234 |
+
/*
|
| 235 |
+
* ncclRedOpCreatePreMulSum
|
| 236 |
+
*
|
| 237 |
+
* Creates a new reduction operator which pre-multiplies input values by a given
|
| 238 |
+
* scalar locally before reducing them with peer values via summation. For use
|
| 239 |
+
* only with collectives launched against *comm* and *datatype*. The
|
| 240 |
+
* *residence* argument indicates how/when the memory pointed to by *scalar*
|
| 241 |
+
* will be dereferenced. Upon return, the newly created operator's handle
|
| 242 |
+
* is stored in *op*.
|
| 243 |
+
*/
|
| 244 |
+
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
| 245 |
+
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
| 246 |
+
|
| 247 |
+
/*
|
| 248 |
+
* ncclRedOpDestroy
|
| 249 |
+
*
|
| 250 |
+
* Destroys the reduction operator *op*. The operator must have been created by
|
| 251 |
+
* ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
|
| 252 |
+
* destroyed as soon as the last NCCL function which is given that operator returns.
|
| 253 |
+
*/
|
| 254 |
+
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
| 255 |
+
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
| 256 |
+
|
| 257 |
+
/*
|
| 258 |
+
* Collective communication operations
|
| 259 |
+
*
|
| 260 |
+
* Collective communication operations must be called separately for each
|
| 261 |
+
* communicator in a communicator clique.
|
| 262 |
+
*
|
| 263 |
+
* They return when operations have been enqueued on the CUDA stream.
|
| 264 |
+
*
|
| 265 |
+
* Since they may perform inter-CPU synchronization, each call has to be done
|
| 266 |
+
* from a different thread or process, or need to use Group Semantics (see
|
| 267 |
+
* below).
|
| 268 |
+
*/
|
| 269 |
+
|
| 270 |
+
/*
|
| 271 |
+
* Reduce
|
| 272 |
+
*
|
| 273 |
+
* Reduces data arrays of length count in sendbuff into recvbuff using op
|
| 274 |
+
* operation.
|
| 275 |
+
* recvbuff may be NULL on all calls except for root device.
|
| 276 |
+
* root is the rank (not the CUDA device) where data will reside after the
|
| 277 |
+
* operation is complete.
|
| 278 |
+
*
|
| 279 |
+
* In-place operation will happen if sendbuff == recvbuff.
|
| 280 |
+
*/
|
| 281 |
+
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
| 282 |
+
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
| 283 |
+
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
| 284 |
+
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
| 285 |
+
|
| 286 |
+
/*
|
| 287 |
+
* (deprecated) Broadcast (in-place)
|
| 288 |
+
*
|
| 289 |
+
* Copies count values from root to all other devices.
|
| 290 |
+
* root is the rank (not the CUDA device) where data resides before the
|
| 291 |
+
* operation is started.
|
| 292 |
+
*
|
| 293 |
+
* This operation is implicitely in place.
|
| 294 |
+
*/
|
| 295 |
+
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
| 296 |
+
ncclComm_t comm, cudaStream_t stream);
|
| 297 |
+
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
| 298 |
+
ncclComm_t comm, cudaStream_t stream);
|
| 299 |
+
|
| 300 |
+
/*
|
| 301 |
+
* Broadcast
|
| 302 |
+
*
|
| 303 |
+
* Copies count values from root to all other devices.
|
| 304 |
+
* root is the rank (not the CUDA device) where data resides before the
|
| 305 |
+
* operation is started.
|
| 306 |
+
*
|
| 307 |
+
* In-place operation will happen if sendbuff == recvbuff.
|
| 308 |
+
*/
|
| 309 |
+
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
| 310 |
+
ncclComm_t comm, cudaStream_t stream);
|
| 311 |
+
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
| 312 |
+
ncclComm_t comm, cudaStream_t stream);
|
| 313 |
+
|
| 314 |
+
/*
|
| 315 |
+
* All-Reduce
|
| 316 |
+
*
|
| 317 |
+
* Reduces data arrays of length count in sendbuff using op operation, and
|
| 318 |
+
* leaves identical copies of result on each recvbuff.
|
| 319 |
+
*
|
| 320 |
+
* In-place operation will happen if sendbuff == recvbuff.
|
| 321 |
+
*/
|
| 322 |
+
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
| 323 |
+
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
|
| 324 |
+
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
| 325 |
+
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
|
| 326 |
+
|
| 327 |
+
/*
|
| 328 |
+
* Reduce-Scatter
|
| 329 |
+
*
|
| 330 |
+
* Reduces data in sendbuff using op operation and leaves reduced result
|
| 331 |
+
* scattered over the devices so that recvbuff on rank i will contain the i-th
|
| 332 |
+
* block of the result.
|
| 333 |
+
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
|
| 334 |
+
* should have a size of at least nranks*recvcount elements.
|
| 335 |
+
*
|
| 336 |
+
* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
|
| 337 |
+
*/
|
| 338 |
+
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
|
| 339 |
+
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
| 340 |
+
cudaStream_t stream);
|
| 341 |
+
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
|
| 342 |
+
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
| 343 |
+
cudaStream_t stream);
|
| 344 |
+
|
| 345 |
+
/*
|
| 346 |
+
* All-Gather
|
| 347 |
+
*
|
| 348 |
+
* Each device gathers sendcount values from other GPUs into recvbuff,
|
| 349 |
+
* receiving data from rank i at offset i*sendcount.
|
| 350 |
+
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
|
| 351 |
+
* should have a size of at least nranks*sendcount elements.
|
| 352 |
+
*
|
| 353 |
+
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
|
| 354 |
+
*/
|
| 355 |
+
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
| 356 |
+
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
|
| 357 |
+
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
| 358 |
+
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
|
| 359 |
+
|
| 360 |
+
/*
|
| 361 |
+
* Send
|
| 362 |
+
*
|
| 363 |
+
* Send data from sendbuff to rank peer.
|
| 364 |
+
*
|
| 365 |
+
* Rank peer needs to call ncclRecv with the same datatype and the same count from this
|
| 366 |
+
* rank.
|
| 367 |
+
*
|
| 368 |
+
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
|
| 369 |
+
* need to progress concurrently to complete, they must be fused within a ncclGroupStart/
|
| 370 |
+
* ncclGroupEnd section.
|
| 371 |
+
*/
|
| 372 |
+
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
| 373 |
+
ncclComm_t comm, cudaStream_t stream);
|
| 374 |
+
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
| 375 |
+
ncclComm_t comm, cudaStream_t stream);
|
| 376 |
+
|
| 377 |
+
/*
|
| 378 |
+
* Receive
|
| 379 |
+
*
|
| 380 |
+
* Receive data from rank peer into recvbuff.
|
| 381 |
+
*
|
| 382 |
+
* Rank peer needs to call ncclSend with the same datatype and the same count to this
|
| 383 |
+
* rank.
|
| 384 |
+
*
|
| 385 |
+
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
|
| 386 |
+
* need to progress concurrently to complete, they must be fused within a ncclGroupStart/
|
| 387 |
+
* ncclGroupEnd section.
|
| 388 |
+
*/
|
| 389 |
+
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
| 390 |
+
ncclComm_t comm, cudaStream_t stream);
|
| 391 |
+
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
| 392 |
+
ncclComm_t comm, cudaStream_t stream);
|
| 393 |
+
|
| 394 |
+
/*
|
| 395 |
+
* Group semantics
|
| 396 |
+
*
|
| 397 |
+
* When managing multiple GPUs from a single thread, and since NCCL collective
|
| 398 |
+
* calls may perform inter-CPU synchronization, we need to "group" calls for
|
| 399 |
+
* different ranks/devices into a single call.
|
| 400 |
+
*
|
| 401 |
+
* Grouping NCCL calls as being part of the same collective operation is done
|
| 402 |
+
* using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
|
| 403 |
+
* collective calls until the ncclGroupEnd call, which will wait for all calls
|
| 404 |
+
* to be complete. Note that for collective communication, ncclGroupEnd only
|
| 405 |
+
* guarantees that the operations are enqueued on the streams, not that
|
| 406 |
+
* the operation is effectively done.
|
| 407 |
+
*
|
| 408 |
+
* Both collective communication and ncclCommInitRank can be used in conjunction
|
| 409 |
+
* of ncclGroupStart/ncclGroupEnd, but not together.
|
| 410 |
+
*
|
| 411 |
+
* Group semantics also allow to fuse multiple operations on the same device
|
| 412 |
+
* to improve performance (for aggregated collective calls), or to permit
|
| 413 |
+
* concurrent progress of multiple send/receive operations.
|
| 414 |
+
*/
|
| 415 |
+
|
| 416 |
+
/*
|
| 417 |
+
* Group Start
|
| 418 |
+
*
|
| 419 |
+
* Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
|
| 420 |
+
* a single NCCL operation. Nothing will be started on the CUDA stream until
|
| 421 |
+
* ncclGroupEnd.
|
| 422 |
+
*/
|
| 423 |
+
ncclResult_t ncclGroupStart();
|
| 424 |
+
ncclResult_t pncclGroupStart();
|
| 425 |
+
|
| 426 |
+
/*
|
| 427 |
+
* Group End
|
| 428 |
+
*
|
| 429 |
+
* End a group call. Start a fused NCCL operation consisting of all calls since
|
| 430 |
+
* ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
|
| 431 |
+
* need to be called after ncclGroupEnd.
|
| 432 |
+
*/
|
| 433 |
+
ncclResult_t ncclGroupEnd();
|
| 434 |
+
ncclResult_t pncclGroupEnd();
|
| 435 |
+
|
| 436 |
+
/* Register CUDA buffer for zero-copy operation */
|
| 437 |
+
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
| 438 |
+
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
| 439 |
+
|
| 440 |
+
/* Deregister CUDA buffer */
|
| 441 |
+
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
|
| 442 |
+
ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
|
| 443 |
+
|
| 444 |
+
#ifdef __cplusplus
|
| 445 |
+
} // end extern "C"
|
| 446 |
+
#endif
|
| 447 |
+
|
| 448 |
+
#endif // end include guard
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__init__.py
ADDED
|
File without changes
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/_cmd.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: 2015 Eric Larson
|
| 2 |
+
#
|
| 3 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
from argparse import ArgumentParser
|
| 8 |
+
from typing import TYPE_CHECKING
|
| 9 |
+
|
| 10 |
+
from pip._vendor import requests
|
| 11 |
+
|
| 12 |
+
from pip._vendor.cachecontrol.adapter import CacheControlAdapter
|
| 13 |
+
from pip._vendor.cachecontrol.cache import DictCache
|
| 14 |
+
from pip._vendor.cachecontrol.controller import logger
|
| 15 |
+
|
| 16 |
+
if TYPE_CHECKING:
|
| 17 |
+
from argparse import Namespace
|
| 18 |
+
|
| 19 |
+
from pip._vendor.cachecontrol.controller import CacheController
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def setup_logging() -> None:
|
| 23 |
+
logger.setLevel(logging.DEBUG)
|
| 24 |
+
handler = logging.StreamHandler()
|
| 25 |
+
logger.addHandler(handler)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def get_session() -> requests.Session:
|
| 29 |
+
adapter = CacheControlAdapter(
|
| 30 |
+
DictCache(), cache_etags=True, serializer=None, heuristic=None
|
| 31 |
+
)
|
| 32 |
+
sess = requests.Session()
|
| 33 |
+
sess.mount("http://", adapter)
|
| 34 |
+
sess.mount("https://", adapter)
|
| 35 |
+
|
| 36 |
+
sess.cache_controller = adapter.controller # type: ignore[attr-defined]
|
| 37 |
+
return sess
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def get_args() -> Namespace:
|
| 41 |
+
parser = ArgumentParser()
|
| 42 |
+
parser.add_argument("url", help="The URL to try and cache")
|
| 43 |
+
return parser.parse_args()
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def main() -> None:
|
| 47 |
+
args = get_args()
|
| 48 |
+
sess = get_session()
|
| 49 |
+
|
| 50 |
+
# Make a request to get a response
|
| 51 |
+
resp = sess.get(args.url)
|
| 52 |
+
|
| 53 |
+
# Turn on logging
|
| 54 |
+
setup_logging()
|
| 55 |
+
|
| 56 |
+
# try setting the cache
|
| 57 |
+
cache_controller: CacheController = (
|
| 58 |
+
sess.cache_controller # type: ignore[attr-defined]
|
| 59 |
+
)
|
| 60 |
+
cache_controller.cache_response(resp.request, resp.raw)
|
| 61 |
+
|
| 62 |
+
# Now try to get it
|
| 63 |
+
if cache_controller.cached_request(resp.request):
|
| 64 |
+
print("Cached!")
|
| 65 |
+
else:
|
| 66 |
+
print("Not cached :(")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
if __name__ == "__main__":
|
| 70 |
+
main()
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/adapter.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: 2015 Eric Larson
|
| 2 |
+
#
|
| 3 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import functools
|
| 7 |
+
import types
|
| 8 |
+
import zlib
|
| 9 |
+
from typing import TYPE_CHECKING, Any, Collection, Mapping
|
| 10 |
+
|
| 11 |
+
from pip._vendor.requests.adapters import HTTPAdapter
|
| 12 |
+
|
| 13 |
+
from pip._vendor.cachecontrol.cache import DictCache
|
| 14 |
+
from pip._vendor.cachecontrol.controller import PERMANENT_REDIRECT_STATUSES, CacheController
|
| 15 |
+
from pip._vendor.cachecontrol.filewrapper import CallbackFileWrapper
|
| 16 |
+
|
| 17 |
+
if TYPE_CHECKING:
|
| 18 |
+
from pip._vendor.requests import PreparedRequest, Response
|
| 19 |
+
from pip._vendor.urllib3 import HTTPResponse
|
| 20 |
+
|
| 21 |
+
from pip._vendor.cachecontrol.cache import BaseCache
|
| 22 |
+
from pip._vendor.cachecontrol.heuristics import BaseHeuristic
|
| 23 |
+
from pip._vendor.cachecontrol.serialize import Serializer
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class CacheControlAdapter(HTTPAdapter):
|
| 27 |
+
invalidating_methods = {"PUT", "PATCH", "DELETE"}
|
| 28 |
+
|
| 29 |
+
def __init__(
|
| 30 |
+
self,
|
| 31 |
+
cache: BaseCache | None = None,
|
| 32 |
+
cache_etags: bool = True,
|
| 33 |
+
controller_class: type[CacheController] | None = None,
|
| 34 |
+
serializer: Serializer | None = None,
|
| 35 |
+
heuristic: BaseHeuristic | None = None,
|
| 36 |
+
cacheable_methods: Collection[str] | None = None,
|
| 37 |
+
*args: Any,
|
| 38 |
+
**kw: Any,
|
| 39 |
+
) -> None:
|
| 40 |
+
super().__init__(*args, **kw)
|
| 41 |
+
self.cache = DictCache() if cache is None else cache
|
| 42 |
+
self.heuristic = heuristic
|
| 43 |
+
self.cacheable_methods = cacheable_methods or ("GET",)
|
| 44 |
+
|
| 45 |
+
controller_factory = controller_class or CacheController
|
| 46 |
+
self.controller = controller_factory(
|
| 47 |
+
self.cache, cache_etags=cache_etags, serializer=serializer
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
def send(
|
| 51 |
+
self,
|
| 52 |
+
request: PreparedRequest,
|
| 53 |
+
stream: bool = False,
|
| 54 |
+
timeout: None | float | tuple[float, float] | tuple[float, None] = None,
|
| 55 |
+
verify: bool | str = True,
|
| 56 |
+
cert: (None | bytes | str | tuple[bytes | str, bytes | str]) = None,
|
| 57 |
+
proxies: Mapping[str, str] | None = None,
|
| 58 |
+
cacheable_methods: Collection[str] | None = None,
|
| 59 |
+
) -> Response:
|
| 60 |
+
"""
|
| 61 |
+
Send a request. Use the request information to see if it
|
| 62 |
+
exists in the cache and cache the response if we need to and can.
|
| 63 |
+
"""
|
| 64 |
+
cacheable = cacheable_methods or self.cacheable_methods
|
| 65 |
+
if request.method in cacheable:
|
| 66 |
+
try:
|
| 67 |
+
cached_response = self.controller.cached_request(request)
|
| 68 |
+
except zlib.error:
|
| 69 |
+
cached_response = None
|
| 70 |
+
if cached_response:
|
| 71 |
+
return self.build_response(request, cached_response, from_cache=True)
|
| 72 |
+
|
| 73 |
+
# check for etags and add headers if appropriate
|
| 74 |
+
request.headers.update(self.controller.conditional_headers(request))
|
| 75 |
+
|
| 76 |
+
resp = super().send(request, stream, timeout, verify, cert, proxies)
|
| 77 |
+
|
| 78 |
+
return resp
|
| 79 |
+
|
| 80 |
+
def build_response(
|
| 81 |
+
self,
|
| 82 |
+
request: PreparedRequest,
|
| 83 |
+
response: HTTPResponse,
|
| 84 |
+
from_cache: bool = False,
|
| 85 |
+
cacheable_methods: Collection[str] | None = None,
|
| 86 |
+
) -> Response:
|
| 87 |
+
"""
|
| 88 |
+
Build a response by making a request or using the cache.
|
| 89 |
+
|
| 90 |
+
This will end up calling send and returning a potentially
|
| 91 |
+
cached response
|
| 92 |
+
"""
|
| 93 |
+
cacheable = cacheable_methods or self.cacheable_methods
|
| 94 |
+
if not from_cache and request.method in cacheable:
|
| 95 |
+
# Check for any heuristics that might update headers
|
| 96 |
+
# before trying to cache.
|
| 97 |
+
if self.heuristic:
|
| 98 |
+
response = self.heuristic.apply(response)
|
| 99 |
+
|
| 100 |
+
# apply any expiration heuristics
|
| 101 |
+
if response.status == 304:
|
| 102 |
+
# We must have sent an ETag request. This could mean
|
| 103 |
+
# that we've been expired already or that we simply
|
| 104 |
+
# have an etag. In either case, we want to try and
|
| 105 |
+
# update the cache if that is the case.
|
| 106 |
+
cached_response = self.controller.update_cached_response(
|
| 107 |
+
request, response
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
if cached_response is not response:
|
| 111 |
+
from_cache = True
|
| 112 |
+
|
| 113 |
+
# We are done with the server response, read a
|
| 114 |
+
# possible response body (compliant servers will
|
| 115 |
+
# not return one, but we cannot be 100% sure) and
|
| 116 |
+
# release the connection back to the pool.
|
| 117 |
+
response.read(decode_content=False)
|
| 118 |
+
response.release_conn()
|
| 119 |
+
|
| 120 |
+
response = cached_response
|
| 121 |
+
|
| 122 |
+
# We always cache the 301 responses
|
| 123 |
+
elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
|
| 124 |
+
self.controller.cache_response(request, response)
|
| 125 |
+
else:
|
| 126 |
+
# Wrap the response file with a wrapper that will cache the
|
| 127 |
+
# response when the stream has been consumed.
|
| 128 |
+
response._fp = CallbackFileWrapper( # type: ignore[assignment]
|
| 129 |
+
response._fp, # type: ignore[arg-type]
|
| 130 |
+
functools.partial(
|
| 131 |
+
self.controller.cache_response, request, response
|
| 132 |
+
),
|
| 133 |
+
)
|
| 134 |
+
if response.chunked:
|
| 135 |
+
super_update_chunk_length = response._update_chunk_length
|
| 136 |
+
|
| 137 |
+
def _update_chunk_length(self: HTTPResponse) -> None:
|
| 138 |
+
super_update_chunk_length()
|
| 139 |
+
if self.chunk_left == 0:
|
| 140 |
+
self._fp._close() # type: ignore[union-attr]
|
| 141 |
+
|
| 142 |
+
response._update_chunk_length = types.MethodType( # type: ignore[method-assign]
|
| 143 |
+
_update_chunk_length, response
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
resp: Response = super().build_response(request, response) # type: ignore[no-untyped-call]
|
| 147 |
+
|
| 148 |
+
# See if we should invalidate the cache.
|
| 149 |
+
if request.method in self.invalidating_methods and resp.ok:
|
| 150 |
+
assert request.url is not None
|
| 151 |
+
cache_url = self.controller.cache_url(request.url)
|
| 152 |
+
self.cache.delete(cache_url)
|
| 153 |
+
|
| 154 |
+
# Give the request a from_cache attr to let people use it
|
| 155 |
+
resp.from_cache = from_cache # type: ignore[attr-defined]
|
| 156 |
+
|
| 157 |
+
return resp
|
| 158 |
+
|
| 159 |
+
def close(self) -> None:
|
| 160 |
+
self.cache.close()
|
| 161 |
+
super().close() # type: ignore[no-untyped-call]
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/cache.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: 2015 Eric Larson
|
| 2 |
+
#
|
| 3 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 4 |
+
|
| 5 |
+
"""
|
| 6 |
+
The cache object API for implementing caches. The default is a thread
|
| 7 |
+
safe in-memory dictionary.
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from threading import Lock
|
| 12 |
+
from typing import IO, TYPE_CHECKING, MutableMapping
|
| 13 |
+
|
| 14 |
+
if TYPE_CHECKING:
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class BaseCache:
|
| 19 |
+
def get(self, key: str) -> bytes | None:
|
| 20 |
+
raise NotImplementedError()
|
| 21 |
+
|
| 22 |
+
def set(
|
| 23 |
+
self, key: str, value: bytes, expires: int | datetime | None = None
|
| 24 |
+
) -> None:
|
| 25 |
+
raise NotImplementedError()
|
| 26 |
+
|
| 27 |
+
def delete(self, key: str) -> None:
|
| 28 |
+
raise NotImplementedError()
|
| 29 |
+
|
| 30 |
+
def close(self) -> None:
|
| 31 |
+
pass
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class DictCache(BaseCache):
|
| 35 |
+
def __init__(self, init_dict: MutableMapping[str, bytes] | None = None) -> None:
|
| 36 |
+
self.lock = Lock()
|
| 37 |
+
self.data = init_dict or {}
|
| 38 |
+
|
| 39 |
+
def get(self, key: str) -> bytes | None:
|
| 40 |
+
return self.data.get(key, None)
|
| 41 |
+
|
| 42 |
+
def set(
|
| 43 |
+
self, key: str, value: bytes, expires: int | datetime | None = None
|
| 44 |
+
) -> None:
|
| 45 |
+
with self.lock:
|
| 46 |
+
self.data.update({key: value})
|
| 47 |
+
|
| 48 |
+
def delete(self, key: str) -> None:
|
| 49 |
+
with self.lock:
|
| 50 |
+
if key in self.data:
|
| 51 |
+
self.data.pop(key)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class SeparateBodyBaseCache(BaseCache):
|
| 55 |
+
"""
|
| 56 |
+
In this variant, the body is not stored mixed in with the metadata, but is
|
| 57 |
+
passed in (as a bytes-like object) in a separate call to ``set_body()``.
|
| 58 |
+
|
| 59 |
+
That is, the expected interaction pattern is::
|
| 60 |
+
|
| 61 |
+
cache.set(key, serialized_metadata)
|
| 62 |
+
cache.set_body(key)
|
| 63 |
+
|
| 64 |
+
Similarly, the body should be loaded separately via ``get_body()``.
|
| 65 |
+
"""
|
| 66 |
+
|
| 67 |
+
def set_body(self, key: str, body: bytes) -> None:
|
| 68 |
+
raise NotImplementedError()
|
| 69 |
+
|
| 70 |
+
def get_body(self, key: str) -> IO[bytes] | None:
|
| 71 |
+
"""
|
| 72 |
+
Return the body as file-like object.
|
| 73 |
+
"""
|
| 74 |
+
raise NotImplementedError()
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/controller.py
ADDED
|
@@ -0,0 +1,499 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: 2015 Eric Larson
|
| 2 |
+
#
|
| 3 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 4 |
+
|
| 5 |
+
"""
|
| 6 |
+
The httplib2 algorithms ported for use with requests.
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import calendar
|
| 11 |
+
import logging
|
| 12 |
+
import re
|
| 13 |
+
import time
|
| 14 |
+
from email.utils import parsedate_tz
|
| 15 |
+
from typing import TYPE_CHECKING, Collection, Mapping
|
| 16 |
+
|
| 17 |
+
from pip._vendor.requests.structures import CaseInsensitiveDict
|
| 18 |
+
|
| 19 |
+
from pip._vendor.cachecontrol.cache import DictCache, SeparateBodyBaseCache
|
| 20 |
+
from pip._vendor.cachecontrol.serialize import Serializer
|
| 21 |
+
|
| 22 |
+
if TYPE_CHECKING:
|
| 23 |
+
from typing import Literal
|
| 24 |
+
|
| 25 |
+
from pip._vendor.requests import PreparedRequest
|
| 26 |
+
from pip._vendor.urllib3 import HTTPResponse
|
| 27 |
+
|
| 28 |
+
from pip._vendor.cachecontrol.cache import BaseCache
|
| 29 |
+
|
| 30 |
+
logger = logging.getLogger(__name__)

URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")

PERMANENT_REDIRECT_STATUSES = (301, 308)


def parse_uri(uri: str) -> tuple[str, str, str, str, str]:
    """Split *uri* into its five components per RFC 3986, Appendix B.

        (scheme, authority, path, query, fragment) = parse_uri(uri)

    Components that are absent from the URI come back as ``None``.
    """
    parsed = URI.match(uri)
    # The Appendix B regex matches any string (every part is optional),
    # so a match object is always produced.
    assert parsed is not None
    g = parsed.groups()
    return g[1], g[3], g[4], g[6], g[8]
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class CacheController:
    """An interface to decide whether a request/response should be cached.

    Ports the httplib2 caching algorithms for use with ``requests``-style
    request and response objects.
    """

    def __init__(
        self,
        cache: BaseCache | None = None,
        cache_etags: bool = True,
        serializer: Serializer | None = None,
        status_codes: Collection[int] | None = None,
    ):
        # Fall back to an in-memory dict cache when none is supplied.
        self.cache = DictCache() if cache is None else cache
        self.cache_etags = cache_etags
        self.serializer = serializer or Serializer()
        # Default set of response status codes that may be stored.
        self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308)

    @classmethod
    def _urlnorm(cls, uri: str) -> str:
        """Normalize the URL to create a safe key for the cache"""
        (scheme, authority, path, query, fragment) = parse_uri(uri)
        if not scheme or not authority:
            raise Exception("Only absolute URIs are allowed. uri = %s" % uri)

        # Scheme and host are case-insensitive; normalize to lowercase.
        scheme = scheme.lower()
        authority = authority.lower()

        if not path:
            path = "/"

        # Could do syntax based normalization of the URI before
        # computing the digest. See Section 6.2.2 of Std 66.
        request_uri = query and "?".join([path, query]) or path
        # The fragment is intentionally dropped: it never reaches the server.
        defrag_uri = scheme + "://" + authority + request_uri

        return defrag_uri

    @classmethod
    def cache_url(cls, uri: str) -> str:
        """Return the normalized cache key for *uri*."""
        return cls._urlnorm(uri)

    def parse_cache_control(self, headers: Mapping[str, str]) -> dict[str, int | None]:
        """Parse a Cache-Control header into a directive -> value dict.

        Unknown directives are ignored (with a debug log).  Valueless
        directives map to ``None``; valued directives map to their int
        value when it parses, and are logged and skipped otherwise.
        """
        known_directives = {
            # https://tools.ietf.org/html/rfc7234#section-5.2
            # Maps directive name -> (value type or None, value required?)
            "max-age": (int, True),
            "max-stale": (int, False),
            "min-fresh": (int, True),
            "no-cache": (None, False),
            "no-store": (None, False),
            "no-transform": (None, False),
            "only-if-cached": (None, False),
            "must-revalidate": (None, False),
            "public": (None, False),
            "private": (None, False),
            "proxy-revalidate": (None, False),
            "s-maxage": (int, True),
        }

        # Accept either header capitalization; plain dicts are not
        # case-insensitive, so check both spellings.
        cc_headers = headers.get("cache-control", headers.get("Cache-Control", ""))

        retval: dict[str, int | None] = {}

        for cc_directive in cc_headers.split(","):
            if not cc_directive.strip():
                continue

            parts = cc_directive.split("=", 1)
            directive = parts[0].strip()

            try:
                typ, required = known_directives[directive]
            except KeyError:
                logger.debug("Ignoring unknown cache-control directive: %s", directive)
                continue

            # Valueless (or optionally-valued) directives are recorded as None;
            # a valued directive may then overwrite that entry below.
            if not typ or not required:
                retval[directive] = None
            if typ:
                try:
                    retval[directive] = typ(parts[1].strip())
                except IndexError:
                    # No "=value" part was present.
                    if required:
                        logger.debug(
                            "Missing value for cache-control " "directive: %s",
                            directive,
                        )
                except ValueError:
                    logger.debug(
                        "Invalid value for cache-control directive " "%s, must be %s",
                        directive,
                        typ.__name__,
                    )

        return retval

    def _load_from_cache(self, request: PreparedRequest) -> HTTPResponse | None:
        """
        Load a cached response, or return None if it's not available.
        """
        # We do not support caching of partial content: so if the request contains a
        # Range header then we don't want to load anything from the cache.
        if "Range" in request.headers:
            return None

        cache_url = request.url
        assert cache_url is not None
        cache_data = self.cache.get(cache_url)
        if cache_data is None:
            logger.debug("No cache entry available")
            return None

        # Separate-body caches store metadata and body under different keys.
        if isinstance(self.cache, SeparateBodyBaseCache):
            body_file = self.cache.get_body(cache_url)
        else:
            body_file = None

        result = self.serializer.loads(request, cache_data, body_file)
        if result is None:
            logger.warning("Cache entry deserialization failed, entry ignored")
        return result

    def cached_request(self, request: PreparedRequest) -> HTTPResponse | Literal[False]:
        """
        Return a cached response if it exists in the cache, otherwise
        return False.

        A stale-but-revalidatable entry (has an ETag) is kept in the
        cache but still returns False here; stale entries without an
        ETag are purged.
        """
        assert request.url is not None
        cache_url = self.cache_url(request.url)
        logger.debug('Looking up "%s" in the cache', cache_url)
        cc = self.parse_cache_control(request.headers)

        # Bail out if the request insists on fresh data
        if "no-cache" in cc:
            logger.debug('Request header has "no-cache", cache bypassed')
            return False

        if "max-age" in cc and cc["max-age"] == 0:
            logger.debug('Request header has "max_age" as 0, cache bypassed')
            return False

        # Check whether we can load the response from the cache:
        resp = self._load_from_cache(request)
        if not resp:
            return False

        # If we have a cached permanent redirect, return it immediately. We
        # don't need to test our response for other headers b/c it is
        # intrinsically "cacheable" as it is Permanent.
        #
        # See:
        #   https://tools.ietf.org/html/rfc7231#section-6.4.2
        #
        # Client can try to refresh the value by repeating the request
        # with cache busting headers as usual (ie no-cache).
        if int(resp.status) in PERMANENT_REDIRECT_STATUSES:
            msg = (
                "Returning cached permanent redirect response "
                "(ignoring date and etag information)"
            )
            logger.debug(msg)
            return resp

        headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers)
        if not headers or "date" not in headers:
            if "etag" not in headers:
                # Without date or etag, the cached response can never be used
                # and should be deleted.
                logger.debug("Purging cached response: no date or etag")
                self.cache.delete(cache_url)
            logger.debug("Ignoring cached response: no date")
            return False

        # Compute the entry's current age from its Date header.
        now = time.time()
        time_tuple = parsedate_tz(headers["date"])
        assert time_tuple is not None
        date = calendar.timegm(time_tuple[:6])
        current_age = max(0, now - date)
        logger.debug("Current age based on date: %i", current_age)

        # TODO: There is an assumption that the result will be a
        #       urllib3 response object. This may not be best since we
        #       could probably avoid instantiating or constructing the
        #       response until we know we need it.
        resp_cc = self.parse_cache_control(headers)

        # determine freshness
        freshness_lifetime = 0

        # Check the max-age pragma in the cache control header
        max_age = resp_cc.get("max-age")
        if max_age is not None:
            freshness_lifetime = max_age
            logger.debug("Freshness lifetime from max-age: %i", freshness_lifetime)

        # If there isn't a max-age, check for an expires header
        elif "expires" in headers:
            expires = parsedate_tz(headers["expires"])
            if expires is not None:
                expire_time = calendar.timegm(expires[:6]) - date
                freshness_lifetime = max(0, expire_time)
                logger.debug("Freshness lifetime from expires: %i", freshness_lifetime)

        # Determine if we are setting freshness limit in the
        # request. Note, this overrides what was in the response.
        max_age = cc.get("max-age")
        if max_age is not None:
            freshness_lifetime = max_age
            logger.debug(
                "Freshness lifetime from request max-age: %i", freshness_lifetime
            )

        min_fresh = cc.get("min-fresh")
        if min_fresh is not None:
            # adjust our current age by our min fresh
            current_age += min_fresh
            logger.debug("Adjusted current age from min-fresh: %i", current_age)

        # Return entry if it is fresh enough
        if freshness_lifetime > current_age:
            logger.debug('The response is "fresh", returning cached response')
            logger.debug("%i > %i", freshness_lifetime, current_age)
            return resp

        # we're not fresh. If we don't have an Etag, clear it out
        if "etag" not in headers:
            logger.debug('The cached response is "stale" with no etag, purging')
            self.cache.delete(cache_url)

        # return the original handler
        return False

    def conditional_headers(self, request: PreparedRequest) -> dict[str, str]:
        """Build validation headers (If-None-Match / If-Modified-Since)
        from the cached response for *request*, if any."""
        resp = self._load_from_cache(request)
        new_headers = {}

        if resp:
            headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers)

            if "etag" in headers:
                new_headers["If-None-Match"] = headers["ETag"]

            if "last-modified" in headers:
                new_headers["If-Modified-Since"] = headers["Last-Modified"]

        return new_headers

    def _cache_set(
        self,
        cache_url: str,
        request: PreparedRequest,
        response: HTTPResponse,
        body: bytes | None = None,
        expires_time: int | None = None,
    ) -> None:
        """
        Store the data in the cache.
        """
        if isinstance(self.cache, SeparateBodyBaseCache):
            # We pass in the body separately; just put a placeholder empty
            # string in the metadata.
            self.cache.set(
                cache_url,
                self.serializer.dumps(request, response, b""),
                expires=expires_time,
            )
            # body is None can happen when, for example, we're only updating
            # headers, as is the case in update_cached_response().
            if body is not None:
                self.cache.set_body(cache_url, body)
        else:
            self.cache.set(
                cache_url,
                self.serializer.dumps(request, response, body),
                expires=expires_time,
            )

    def cache_response(
        self,
        request: PreparedRequest,
        response: HTTPResponse,
        body: bytes | None = None,
        status_codes: Collection[int] | None = None,
    ) -> None:
        """
        Algorithm for caching requests.

        This assumes a requests Response object.

        Decides, in order: skip non-cacheable status codes; skip
        truncated bodies; honor no-store; skip "Vary: *"; then store
        based on ETag, permanent-redirect status, max-age, or Expires.
        """
        # From httplib2: Don't cache 206's since we aren't going to
        #                handle byte range requests
        cacheable_status_codes = status_codes or self.cacheable_status_codes
        if response.status not in cacheable_status_codes:
            logger.debug(
                "Status code %s not in %s", response.status, cacheable_status_codes
            )
            return

        response_headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(
            response.headers
        )

        if "date" in response_headers:
            time_tuple = parsedate_tz(response_headers["date"])
            assert time_tuple is not None
            date = calendar.timegm(time_tuple[:6])
        else:
            date = 0

        # If we've been given a body, our response has a Content-Length, that
        # Content-Length is valid then we can check to see if the body we've
        # been given matches the expected size, and if it doesn't we'll just
        # skip trying to cache it.
        if (
            body is not None
            and "content-length" in response_headers
            and response_headers["content-length"].isdigit()
            and int(response_headers["content-length"]) != len(body)
        ):
            return

        cc_req = self.parse_cache_control(request.headers)
        cc = self.parse_cache_control(response_headers)

        assert request.url is not None
        cache_url = self.cache_url(request.url)
        logger.debug('Updating cache with response from "%s"', cache_url)

        # Delete it from the cache if we happen to have it stored there
        no_store = False
        if "no-store" in cc:
            no_store = True
            logger.debug('Response header has "no-store"')
        if "no-store" in cc_req:
            no_store = True
            logger.debug('Request header has "no-store"')
        if no_store and self.cache.get(cache_url):
            logger.debug('Purging existing cache entry to honor "no-store"')
            self.cache.delete(cache_url)
        if no_store:
            return

        # https://tools.ietf.org/html/rfc7234#section-4.1:
        # A Vary header field-value of "*" always fails to match.
        # Storing such a response leads to a deserialization warning
        # during cache lookup and is not allowed to ever be served,
        # so storing it can be avoided.
        if "*" in response_headers.get("vary", ""):
            logger.debug('Response header has "Vary: *"')
            return

        # If we've been given an etag, then keep the response
        if self.cache_etags and "etag" in response_headers:
            expires_time = 0
            if response_headers.get("expires"):
                expires = parsedate_tz(response_headers["expires"])
                if expires is not None:
                    expires_time = calendar.timegm(expires[:6]) - date

            # Keep ETagged entries at least 14 days so they remain
            # available for revalidation.
            expires_time = max(expires_time, 14 * 86400)

            logger.debug(f"etag object cached for {expires_time} seconds")
            logger.debug("Caching due to etag")
            self._cache_set(cache_url, request, response, body, expires_time)

        # Add to the cache any permanent redirects. We do this before looking
        # that the Date headers.
        elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
            logger.debug("Caching permanent redirect")
            # The body is irrelevant for a redirect; store an empty one.
            self._cache_set(cache_url, request, response, b"")

        # Add to the cache if the response headers demand it. If there
        # is no date header then we can't do anything about expiring
        # the cache.
        elif "date" in response_headers:
            time_tuple = parsedate_tz(response_headers["date"])
            assert time_tuple is not None
            date = calendar.timegm(time_tuple[:6])
            # cache when there is a max-age > 0
            max_age = cc.get("max-age")
            if max_age is not None and max_age > 0:
                logger.debug("Caching b/c date exists and max-age > 0")
                expires_time = max_age
                self._cache_set(
                    cache_url,
                    request,
                    response,
                    body,
                    expires_time,
                )

            # If the request can expire, it means we should cache it
            # in the meantime.
            elif "expires" in response_headers:
                if response_headers["expires"]:
                    expires = parsedate_tz(response_headers["expires"])
                    if expires is not None:
                        expires_time = calendar.timegm(expires[:6]) - date
                    else:
                        expires_time = None

                    logger.debug(
                        "Caching b/c of expires header. expires in {} seconds".format(
                            expires_time
                        )
                    )
                    self._cache_set(
                        cache_url,
                        request,
                        response,
                        body,
                        expires_time,
                    )

    def update_cached_response(
        self, request: PreparedRequest, response: HTTPResponse
    ) -> HTTPResponse:
        """On a 304 we will get a new set of headers that we want to
        update our cached value with, assuming we have one.

        This should only ever be called when we've sent an ETag and
        gotten a 304 as the response.
        """
        assert request.url is not None
        cache_url = self.cache_url(request.url)
        cached_response = self._load_from_cache(request)

        if not cached_response:
            # we didn't have a cached response
            return response

        # Lets update our headers with the headers from the new request:
        # http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
        #
        # The server isn't supposed to send headers that would make
        # the cached body invalid. But... just in case, we'll be sure
        # to strip out ones we know that might be problmatic due to
        # typical assumptions.
        excluded_headers = ["content-length"]

        cached_response.headers.update(
            {
                k: v
                for k, v in response.headers.items()
                if k.lower() not in excluded_headers
            }
        )

        # we want a 200 b/c we have content via the cache
        cached_response.status = 200

        # update our cache
        self._cache_set(cache_url, request, cached_response)

        return cached_response
|