Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal/__init__.pyi +12 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/annotated_doc-0.0.4.dist-info/licenses/LICENSE +21 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__pycache__/__init__.cpython-312.pyc +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__pycache__/__main__.cpython-312.pyc +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__pycache__/core.cpython-312.pyc +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/__pycache__/__init__.cpython-312.pyc +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/__init__.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/__pycache__/__init__.cpython-312.pyc +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/__init__.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-312.pyc +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublas.h +891 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublasLt.h +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublasXt.h +693 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublas_api.h +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublas_v2.h +478 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/nvblas.h +824 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/lib/__init__.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-312.pyc +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_cupti/__init__.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/__init__.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-312.pyc +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/include/__init__.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-312.pyc +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h +1141 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/lib/__init__.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-312.pyc +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/__init__.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-312.pyc +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/__init__.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-312.pyc +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/builtin_types.h +64 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h +597 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/common_functions.h +65 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h +1743 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/async.h +452 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h +95 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_scan.h +174 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/driver_abi.h +99 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h +212 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h +693 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h +345 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/invoke.h +189 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/memory.h +136 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h +160 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h +424 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h +320 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/sync.h +281 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/memcpy_async.h +62 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/reduce.h +63 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/scan.h +63 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal/__init__.pyi
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Generic, TypeVar
|
| 2 |
+
|
| 3 |
+
from frozenlist import FrozenList
|
| 4 |
+
|
| 5 |
+
__all__ = ("Signal",)
|
| 6 |
+
|
| 7 |
+
_T = TypeVar("_T")
|
| 8 |
+
|
| 9 |
+
class Signal(FrozenList[_T], Generic[_T]):
|
| 10 |
+
def __init__(self, owner: Any) -> None: ...
|
| 11 |
+
def __repr__(self) -> str: ...
|
| 12 |
+
async def send(self, *args: Any, **kwargs: Any) -> None: ...
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/annotated_doc-0.0.4.dist-info/licenses/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The MIT License (MIT)
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Sebastián Ramírez
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in
|
| 13 |
+
all copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
| 21 |
+
THE SOFTWARE.
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (340 Bytes). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__pycache__/__main__.cpython-312.pyc
ADDED
|
Binary file (655 Bytes). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__pycache__/core.cpython-312.pyc
ADDED
|
Binary file (3.23 kB). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (212 Bytes). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/__init__.py
ADDED
|
File without changes
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (219 Bytes). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/__init__.py
ADDED
|
File without changes
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (227 Bytes). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublas.h
ADDED
|
@@ -0,0 +1,891 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/*
|
| 51 |
+
* This is the public header file for the CUBLAS library, defining the API
|
| 52 |
+
*
|
| 53 |
+
* CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
|
| 54 |
+
* on top of the CUDA runtime.
|
| 55 |
+
*/
|
| 56 |
+
|
| 57 |
+
#if !defined(CUBLAS_H_)
|
| 58 |
+
#define CUBLAS_H_
|
| 59 |
+
|
| 60 |
+
#if defined(CUBLAS_V2_H_)
|
| 61 |
+
#error "It is an error to include both cublas.h and cublas_v2.h"
|
| 62 |
+
#endif
|
| 63 |
+
|
| 64 |
+
#include <cuda_runtime.h>
|
| 65 |
+
|
| 66 |
+
#ifndef CUBLASWINAPI
|
| 67 |
+
#ifdef _WIN32
|
| 68 |
+
#define CUBLASWINAPI __stdcall
|
| 69 |
+
#else
|
| 70 |
+
#define CUBLASWINAPI
|
| 71 |
+
#endif
|
| 72 |
+
#endif
|
| 73 |
+
|
| 74 |
+
#undef CUBLASAPI
|
| 75 |
+
#ifdef __CUDACC__
|
| 76 |
+
#define CUBLASAPI __host__
|
| 77 |
+
#else
|
| 78 |
+
#define CUBLASAPI
|
| 79 |
+
#endif
|
| 80 |
+
|
| 81 |
+
#include "cublas_api.h"
|
| 82 |
+
|
| 83 |
+
#if defined(__cplusplus)
|
| 84 |
+
extern "C" {
|
| 85 |
+
#endif
|
| 86 |
+
|
| 87 |
+
/* CUBLAS data types */
|
| 88 |
+
#define cublasStatus cublasStatus_t
|
| 89 |
+
|
| 90 |
+
cublasStatus CUBLASWINAPI cublasInit(void);
|
| 91 |
+
cublasStatus CUBLASWINAPI cublasShutdown(void);
|
| 92 |
+
cublasStatus CUBLASWINAPI cublasGetError(void);
|
| 93 |
+
|
| 94 |
+
cublasStatus CUBLASWINAPI cublasGetVersion(int* version);
|
| 95 |
+
cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void** devicePtr);
|
| 96 |
+
|
| 97 |
+
cublasStatus CUBLASWINAPI cublasFree(void* devicePtr);
|
| 98 |
+
|
| 99 |
+
cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream);
|
| 100 |
+
|
| 101 |
+
/* ---------------- CUBLAS BLAS1 functions ---------------- */
|
| 102 |
+
/* NRM2 */
|
| 103 |
+
float CUBLASWINAPI cublasSnrm2(int n, const float* x, int incx);
|
| 104 |
+
double CUBLASWINAPI cublasDnrm2(int n, const double* x, int incx);
|
| 105 |
+
float CUBLASWINAPI cublasScnrm2(int n, const cuComplex* x, int incx);
|
| 106 |
+
double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex* x, int incx);
|
| 107 |
+
/*------------------------------------------------------------------------*/
|
| 108 |
+
/* DOT */
|
| 109 |
+
float CUBLASWINAPI cublasSdot(int n, const float* x, int incx, const float* y, int incy);
|
| 110 |
+
double CUBLASWINAPI cublasDdot(int n, const double* x, int incx, const double* y, int incy);
|
| 111 |
+
cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
|
| 112 |
+
cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
|
| 113 |
+
cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
|
| 114 |
+
cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
|
| 115 |
+
/*------------------------------------------------------------------------*/
|
| 116 |
+
/* SCAL */
|
| 117 |
+
void CUBLASWINAPI cublasSscal(int n, float alpha, float* x, int incx);
|
| 118 |
+
void CUBLASWINAPI cublasDscal(int n, double alpha, double* x, int incx);
|
| 119 |
+
void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex* x, int incx);
|
| 120 |
+
void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex* x, int incx);
|
| 121 |
+
|
| 122 |
+
void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex* x, int incx);
|
| 123 |
+
void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex* x, int incx);
|
| 124 |
+
/*------------------------------------------------------------------------*/
|
| 125 |
+
/* AXPY */
|
| 126 |
+
void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float* x, int incx, float* y, int incy);
|
| 127 |
+
void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double* x, int incx, double* y, int incy);
|
| 128 |
+
void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex* x, int incx, cuComplex* y, int incy);
|
| 129 |
+
void CUBLASWINAPI
|
| 130 |
+
cublasZaxpy(int n, cuDoubleComplex alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
|
| 131 |
+
/*------------------------------------------------------------------------*/
|
| 132 |
+
/* COPY */
|
| 133 |
+
void CUBLASWINAPI cublasScopy(int n, const float* x, int incx, float* y, int incy);
|
| 134 |
+
void CUBLASWINAPI cublasDcopy(int n, const double* x, int incx, double* y, int incy);
|
| 135 |
+
void CUBLASWINAPI cublasCcopy(int n, const cuComplex* x, int incx, cuComplex* y, int incy);
|
| 136 |
+
void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
|
| 137 |
+
/*------------------------------------------------------------------------*/
|
| 138 |
+
/* SWAP */
|
| 139 |
+
void CUBLASWINAPI cublasSswap(int n, float* x, int incx, float* y, int incy);
|
| 140 |
+
void CUBLASWINAPI cublasDswap(int n, double* x, int incx, double* y, int incy);
|
| 141 |
+
void CUBLASWINAPI cublasCswap(int n, cuComplex* x, int incx, cuComplex* y, int incy);
|
| 142 |
+
void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
|
| 143 |
+
/*------------------------------------------------------------------------*/
|
| 144 |
+
/* AMAX */
|
| 145 |
+
int CUBLASWINAPI cublasIsamax(int n, const float* x, int incx);
|
| 146 |
+
int CUBLASWINAPI cublasIdamax(int n, const double* x, int incx);
|
| 147 |
+
int CUBLASWINAPI cublasIcamax(int n, const cuComplex* x, int incx);
|
| 148 |
+
int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex* x, int incx);
|
| 149 |
+
/*------------------------------------------------------------------------*/
|
| 150 |
+
/* AMIN */
|
| 151 |
+
int CUBLASWINAPI cublasIsamin(int n, const float* x, int incx);
|
| 152 |
+
int CUBLASWINAPI cublasIdamin(int n, const double* x, int incx);
|
| 153 |
+
|
| 154 |
+
int CUBLASWINAPI cublasIcamin(int n, const cuComplex* x, int incx);
|
| 155 |
+
int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex* x, int incx);
|
| 156 |
+
/*------------------------------------------------------------------------*/
|
| 157 |
+
/* ASUM */
|
| 158 |
+
float CUBLASWINAPI cublasSasum(int n, const float* x, int incx);
|
| 159 |
+
double CUBLASWINAPI cublasDasum(int n, const double* x, int incx);
|
| 160 |
+
float CUBLASWINAPI cublasScasum(int n, const cuComplex* x, int incx);
|
| 161 |
+
double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex* x, int incx);
|
| 162 |
+
/*------------------------------------------------------------------------*/
|
| 163 |
+
/* ROT */
|
| 164 |
+
void CUBLASWINAPI cublasSrot(int n, float* x, int incx, float* y, int incy, float sc, float ss);
|
| 165 |
+
void CUBLASWINAPI cublasDrot(int n, double* x, int incx, double* y, int incy, double sc, double ss);
|
| 166 |
+
void CUBLASWINAPI cublasCrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, cuComplex s);
|
| 167 |
+
void CUBLASWINAPI
|
| 168 |
+
cublasZrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double sc, cuDoubleComplex cs);
|
| 169 |
+
void CUBLASWINAPI cublasCsrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, float s);
|
| 170 |
+
void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double c, double s);
|
| 171 |
+
/*------------------------------------------------------------------------*/
|
| 172 |
+
/* ROTG */
|
| 173 |
+
void CUBLASWINAPI cublasSrotg(float* sa, float* sb, float* sc, float* ss);
|
| 174 |
+
void CUBLASWINAPI cublasDrotg(double* sa, double* sb, double* sc, double* ss);
|
| 175 |
+
void CUBLASWINAPI cublasCrotg(cuComplex* ca, cuComplex cb, float* sc, cuComplex* cs);
|
| 176 |
+
void CUBLASWINAPI cublasZrotg(cuDoubleComplex* ca, cuDoubleComplex cb, double* sc, cuDoubleComplex* cs);
|
| 177 |
+
/*------------------------------------------------------------------------*/
|
| 178 |
+
/* ROTM */
|
| 179 |
+
void CUBLASWINAPI cublasSrotm(int n, float* x, int incx, float* y, int incy, const float* sparam);
|
| 180 |
+
void CUBLASWINAPI cublasDrotm(int n, double* x, int incx, double* y, int incy, const double* sparam);
|
| 181 |
+
/*------------------------------------------------------------------------*/
|
| 182 |
+
/* ROTMG */
|
| 183 |
+
void CUBLASWINAPI cublasSrotmg(float* sd1, float* sd2, float* sx1, const float* sy1, float* sparam);
|
| 184 |
+
void CUBLASWINAPI cublasDrotmg(double* sd1, double* sd2, double* sx1, const double* sy1, double* sparam);
|
| 185 |
+
|
| 186 |
+
/* --------------- CUBLAS BLAS2 functions ---------------- */
|
| 187 |
+
/* GEMV */
|
| 188 |
+
void CUBLASWINAPI cublasSgemv(char trans,
|
| 189 |
+
int m,
|
| 190 |
+
int n,
|
| 191 |
+
float alpha,
|
| 192 |
+
const float* A,
|
| 193 |
+
int lda,
|
| 194 |
+
const float* x,
|
| 195 |
+
int incx,
|
| 196 |
+
float beta,
|
| 197 |
+
float* y,
|
| 198 |
+
int incy);
|
| 199 |
+
void CUBLASWINAPI cublasDgemv(char trans,
|
| 200 |
+
int m,
|
| 201 |
+
int n,
|
| 202 |
+
double alpha,
|
| 203 |
+
const double* A,
|
| 204 |
+
int lda,
|
| 205 |
+
const double* x,
|
| 206 |
+
int incx,
|
| 207 |
+
double beta,
|
| 208 |
+
double* y,
|
| 209 |
+
int incy);
|
| 210 |
+
void CUBLASWINAPI cublasCgemv(char trans,
|
| 211 |
+
int m,
|
| 212 |
+
int n,
|
| 213 |
+
cuComplex alpha,
|
| 214 |
+
const cuComplex* A,
|
| 215 |
+
int lda,
|
| 216 |
+
const cuComplex* x,
|
| 217 |
+
int incx,
|
| 218 |
+
cuComplex beta,
|
| 219 |
+
cuComplex* y,
|
| 220 |
+
int incy);
|
| 221 |
+
void CUBLASWINAPI cublasZgemv(char trans,
|
| 222 |
+
int m,
|
| 223 |
+
int n,
|
| 224 |
+
cuDoubleComplex alpha,
|
| 225 |
+
const cuDoubleComplex* A,
|
| 226 |
+
int lda,
|
| 227 |
+
const cuDoubleComplex* x,
|
| 228 |
+
int incx,
|
| 229 |
+
cuDoubleComplex beta,
|
| 230 |
+
cuDoubleComplex* y,
|
| 231 |
+
int incy);
|
| 232 |
+
/*------------------------------------------------------------------------*/
|
| 233 |
+
/* GBMV */
|
| 234 |
+
void CUBLASWINAPI cublasSgbmv(char trans,
|
| 235 |
+
int m,
|
| 236 |
+
int n,
|
| 237 |
+
int kl,
|
| 238 |
+
int ku,
|
| 239 |
+
float alpha,
|
| 240 |
+
const float* A,
|
| 241 |
+
int lda,
|
| 242 |
+
const float* x,
|
| 243 |
+
int incx,
|
| 244 |
+
float beta,
|
| 245 |
+
float* y,
|
| 246 |
+
int incy);
|
| 247 |
+
void CUBLASWINAPI cublasDgbmv(char trans,
|
| 248 |
+
int m,
|
| 249 |
+
int n,
|
| 250 |
+
int kl,
|
| 251 |
+
int ku,
|
| 252 |
+
double alpha,
|
| 253 |
+
const double* A,
|
| 254 |
+
int lda,
|
| 255 |
+
const double* x,
|
| 256 |
+
int incx,
|
| 257 |
+
double beta,
|
| 258 |
+
double* y,
|
| 259 |
+
int incy);
|
| 260 |
+
void CUBLASWINAPI cublasCgbmv(char trans,
|
| 261 |
+
int m,
|
| 262 |
+
int n,
|
| 263 |
+
int kl,
|
| 264 |
+
int ku,
|
| 265 |
+
cuComplex alpha,
|
| 266 |
+
const cuComplex* A,
|
| 267 |
+
int lda,
|
| 268 |
+
const cuComplex* x,
|
| 269 |
+
int incx,
|
| 270 |
+
cuComplex beta,
|
| 271 |
+
cuComplex* y,
|
| 272 |
+
int incy);
|
| 273 |
+
void CUBLASWINAPI cublasZgbmv(char trans,
|
| 274 |
+
int m,
|
| 275 |
+
int n,
|
| 276 |
+
int kl,
|
| 277 |
+
int ku,
|
| 278 |
+
cuDoubleComplex alpha,
|
| 279 |
+
const cuDoubleComplex* A,
|
| 280 |
+
int lda,
|
| 281 |
+
const cuDoubleComplex* x,
|
| 282 |
+
int incx,
|
| 283 |
+
cuDoubleComplex beta,
|
| 284 |
+
cuDoubleComplex* y,
|
| 285 |
+
int incy);
|
| 286 |
+
/*------------------------------------------------------------------------*/
|
| 287 |
+
/* TRMV */
|
| 288 |
+
void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
|
| 289 |
+
void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
|
| 290 |
+
void CUBLASWINAPI
|
| 291 |
+
cublasCtrmv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
|
| 292 |
+
void CUBLASWINAPI
|
| 293 |
+
cublasZtrmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
|
| 294 |
+
/*------------------------------------------------------------------------*/
|
| 295 |
+
/* TBMV */
|
| 296 |
+
void CUBLASWINAPI
|
| 297 |
+
cublasStbmv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
|
| 298 |
+
void CUBLASWINAPI
|
| 299 |
+
cublasDtbmv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
|
| 300 |
+
void CUBLASWINAPI
|
| 301 |
+
cublasCtbmv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
|
| 302 |
+
void CUBLASWINAPI cublasZtbmv(
|
| 303 |
+
char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
|
| 304 |
+
/*------------------------------------------------------------------------*/
|
| 305 |
+
/* TPMV */
|
| 306 |
+
void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
|
| 307 |
+
|
| 308 |
+
void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
|
| 309 |
+
|
| 310 |
+
void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
|
| 311 |
+
|
| 312 |
+
void CUBLASWINAPI
|
| 313 |
+
cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
|
| 314 |
+
/*------------------------------------------------------------------------*/
|
| 315 |
+
/* TRSV */
|
| 316 |
+
void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
|
| 317 |
+
|
| 318 |
+
void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
|
| 319 |
+
|
| 320 |
+
void CUBLASWINAPI
|
| 321 |
+
cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
|
| 322 |
+
|
| 323 |
+
void CUBLASWINAPI
|
| 324 |
+
cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
|
| 325 |
+
/*------------------------------------------------------------------------*/
|
| 326 |
+
/* TPSV */
|
| 327 |
+
void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
|
| 328 |
+
|
| 329 |
+
void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
|
| 330 |
+
|
| 331 |
+
void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
|
| 332 |
+
|
| 333 |
+
void CUBLASWINAPI
|
| 334 |
+
cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
|
| 335 |
+
/*------------------------------------------------------------------------*/
|
| 336 |
+
/* TBSV */
|
| 337 |
+
void CUBLASWINAPI
|
| 338 |
+
cublasStbsv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
|
| 339 |
+
|
| 340 |
+
void CUBLASWINAPI
|
| 341 |
+
cublasDtbsv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
|
| 342 |
+
void CUBLASWINAPI
|
| 343 |
+
cublasCtbsv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
|
| 344 |
+
|
| 345 |
+
void CUBLASWINAPI cublasZtbsv(
|
| 346 |
+
char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
|
| 347 |
+
/*------------------------------------------------------------------------*/
|
| 348 |
+
/* SYMV/HEMV */
|
| 349 |
+
void CUBLASWINAPI cublasSsymv(
|
| 350 |
+
char uplo, int n, float alpha, const float* A, int lda, const float* x, int incx, float beta, float* y, int incy);
|
| 351 |
+
void CUBLASWINAPI cublasDsymv(char uplo,
|
| 352 |
+
int n,
|
| 353 |
+
double alpha,
|
| 354 |
+
const double* A,
|
| 355 |
+
int lda,
|
| 356 |
+
const double* x,
|
| 357 |
+
int incx,
|
| 358 |
+
double beta,
|
| 359 |
+
double* y,
|
| 360 |
+
int incy);
|
| 361 |
+
void CUBLASWINAPI cublasChemv(char uplo,
|
| 362 |
+
int n,
|
| 363 |
+
cuComplex alpha,
|
| 364 |
+
const cuComplex* A,
|
| 365 |
+
int lda,
|
| 366 |
+
const cuComplex* x,
|
| 367 |
+
int incx,
|
| 368 |
+
cuComplex beta,
|
| 369 |
+
cuComplex* y,
|
| 370 |
+
int incy);
|
| 371 |
+
void CUBLASWINAPI cublasZhemv(char uplo,
|
| 372 |
+
int n,
|
| 373 |
+
cuDoubleComplex alpha,
|
| 374 |
+
const cuDoubleComplex* A,
|
| 375 |
+
int lda,
|
| 376 |
+
const cuDoubleComplex* x,
|
| 377 |
+
int incx,
|
| 378 |
+
cuDoubleComplex beta,
|
| 379 |
+
cuDoubleComplex* y,
|
| 380 |
+
int incy);
|
| 381 |
+
/*------------------------------------------------------------------------*/
|
| 382 |
+
/* SBMV/HBMV */
|
| 383 |
+
void CUBLASWINAPI cublasSsbmv(char uplo,
|
| 384 |
+
int n,
|
| 385 |
+
int k,
|
| 386 |
+
float alpha,
|
| 387 |
+
const float* A,
|
| 388 |
+
int lda,
|
| 389 |
+
const float* x,
|
| 390 |
+
int incx,
|
| 391 |
+
float beta,
|
| 392 |
+
float* y,
|
| 393 |
+
int incy);
|
| 394 |
+
void CUBLASWINAPI cublasDsbmv(char uplo,
|
| 395 |
+
int n,
|
| 396 |
+
int k,
|
| 397 |
+
double alpha,
|
| 398 |
+
const double* A,
|
| 399 |
+
int lda,
|
| 400 |
+
const double* x,
|
| 401 |
+
int incx,
|
| 402 |
+
double beta,
|
| 403 |
+
double* y,
|
| 404 |
+
int incy);
|
| 405 |
+
void CUBLASWINAPI cublasChbmv(char uplo,
|
| 406 |
+
int n,
|
| 407 |
+
int k,
|
| 408 |
+
cuComplex alpha,
|
| 409 |
+
const cuComplex* A,
|
| 410 |
+
int lda,
|
| 411 |
+
const cuComplex* x,
|
| 412 |
+
int incx,
|
| 413 |
+
cuComplex beta,
|
| 414 |
+
cuComplex* y,
|
| 415 |
+
int incy);
|
| 416 |
+
void CUBLASWINAPI cublasZhbmv(char uplo,
|
| 417 |
+
int n,
|
| 418 |
+
int k,
|
| 419 |
+
cuDoubleComplex alpha,
|
| 420 |
+
const cuDoubleComplex* A,
|
| 421 |
+
int lda,
|
| 422 |
+
const cuDoubleComplex* x,
|
| 423 |
+
int incx,
|
| 424 |
+
cuDoubleComplex beta,
|
| 425 |
+
cuDoubleComplex* y,
|
| 426 |
+
int incy);
|
| 427 |
+
/*------------------------------------------------------------------------*/
|
| 428 |
+
/* SPMV/HPMV */
|
| 429 |
+
void CUBLASWINAPI
|
| 430 |
+
cublasSspmv(char uplo, int n, float alpha, const float* AP, const float* x, int incx, float beta, float* y, int incy);
|
| 431 |
+
void CUBLASWINAPI cublasDspmv(
|
| 432 |
+
char uplo, int n, double alpha, const double* AP, const double* x, int incx, double beta, double* y, int incy);
|
| 433 |
+
void CUBLASWINAPI cublasChpmv(char uplo,
|
| 434 |
+
int n,
|
| 435 |
+
cuComplex alpha,
|
| 436 |
+
const cuComplex* AP,
|
| 437 |
+
const cuComplex* x,
|
| 438 |
+
int incx,
|
| 439 |
+
cuComplex beta,
|
| 440 |
+
cuComplex* y,
|
| 441 |
+
int incy);
|
| 442 |
+
void CUBLASWINAPI cublasZhpmv(char uplo,
|
| 443 |
+
int n,
|
| 444 |
+
cuDoubleComplex alpha,
|
| 445 |
+
const cuDoubleComplex* AP,
|
| 446 |
+
const cuDoubleComplex* x,
|
| 447 |
+
int incx,
|
| 448 |
+
cuDoubleComplex beta,
|
| 449 |
+
cuDoubleComplex* y,
|
| 450 |
+
int incy);
|
| 451 |
+
|
| 452 |
+
/*------------------------------------------------------------------------*/
|
| 453 |
+
/* GER */
|
| 454 |
+
void CUBLASWINAPI
|
| 455 |
+
cublasSger(int m, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
|
| 456 |
+
void CUBLASWINAPI
|
| 457 |
+
cublasDger(int m, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
|
| 458 |
+
|
| 459 |
+
void CUBLASWINAPI cublasCgeru(
|
| 460 |
+
int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
|
| 461 |
+
void CUBLASWINAPI cublasCgerc(
|
| 462 |
+
int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
|
| 463 |
+
void CUBLASWINAPI cublasZgeru(int m,
|
| 464 |
+
int n,
|
| 465 |
+
cuDoubleComplex alpha,
|
| 466 |
+
const cuDoubleComplex* x,
|
| 467 |
+
int incx,
|
| 468 |
+
const cuDoubleComplex* y,
|
| 469 |
+
int incy,
|
| 470 |
+
cuDoubleComplex* A,
|
| 471 |
+
int lda);
|
| 472 |
+
void CUBLASWINAPI cublasZgerc(int m,
|
| 473 |
+
int n,
|
| 474 |
+
cuDoubleComplex alpha,
|
| 475 |
+
const cuDoubleComplex* x,
|
| 476 |
+
int incx,
|
| 477 |
+
const cuDoubleComplex* y,
|
| 478 |
+
int incy,
|
| 479 |
+
cuDoubleComplex* A,
|
| 480 |
+
int lda);
|
| 481 |
+
/*------------------------------------------------------------------------*/
|
| 482 |
+
/* SYR/HER */
|
| 483 |
+
void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float* x, int incx, float* A, int lda);
|
| 484 |
+
void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double* x, int incx, double* A, int lda);
|
| 485 |
+
|
| 486 |
+
void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* A, int lda);
|
| 487 |
+
void CUBLASWINAPI
|
| 488 |
+
cublasZher(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda);
|
| 489 |
+
|
| 490 |
+
/*------------------------------------------------------------------------*/
|
| 491 |
+
/* SPR/HPR */
|
| 492 |
+
void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float* x, int incx, float* AP);
|
| 493 |
+
void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double* x, int incx, double* AP);
|
| 494 |
+
void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* AP);
|
| 495 |
+
void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP);
|
| 496 |
+
/*------------------------------------------------------------------------*/
|
| 497 |
+
/* SYR2/HER2 */
|
| 498 |
+
void CUBLASWINAPI
|
| 499 |
+
cublasSsyr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
|
| 500 |
+
void CUBLASWINAPI
|
| 501 |
+
cublasDsyr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
|
| 502 |
+
void CUBLASWINAPI cublasCher2(char uplo,
|
| 503 |
+
int n,
|
| 504 |
+
cuComplex alpha,
|
| 505 |
+
const cuComplex* x,
|
| 506 |
+
int incx,
|
| 507 |
+
const cuComplex* y,
|
| 508 |
+
int incy,
|
| 509 |
+
cuComplex* A,
|
| 510 |
+
int lda);
|
| 511 |
+
void CUBLASWINAPI cublasZher2(char uplo,
|
| 512 |
+
int n,
|
| 513 |
+
cuDoubleComplex alpha,
|
| 514 |
+
const cuDoubleComplex* x,
|
| 515 |
+
int incx,
|
| 516 |
+
const cuDoubleComplex* y,
|
| 517 |
+
int incy,
|
| 518 |
+
cuDoubleComplex* A,
|
| 519 |
+
int lda);
|
| 520 |
+
|
| 521 |
+
/*------------------------------------------------------------------------*/
|
| 522 |
+
/* SPR2/HPR2 */
|
| 523 |
+
void CUBLASWINAPI
|
| 524 |
+
cublasSspr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* AP);
|
| 525 |
+
void CUBLASWINAPI
|
| 526 |
+
cublasDspr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* AP);
|
| 527 |
+
void CUBLASWINAPI cublasChpr2(
|
| 528 |
+
char uplo, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP);
|
| 529 |
+
void CUBLASWINAPI cublasZhpr2(char uplo,
|
| 530 |
+
int n,
|
| 531 |
+
cuDoubleComplex alpha,
|
| 532 |
+
const cuDoubleComplex* x,
|
| 533 |
+
int incx,
|
| 534 |
+
const cuDoubleComplex* y,
|
| 535 |
+
int incy,
|
| 536 |
+
cuDoubleComplex* AP);
|
| 537 |
+
/* ------------------------BLAS3 Functions ------------------------------- */
|
| 538 |
+
/* GEMM */
|
| 539 |
+
void CUBLASWINAPI cublasSgemm(char transa,
|
| 540 |
+
char transb,
|
| 541 |
+
int m,
|
| 542 |
+
int n,
|
| 543 |
+
int k,
|
| 544 |
+
float alpha,
|
| 545 |
+
const float* A,
|
| 546 |
+
int lda,
|
| 547 |
+
const float* B,
|
| 548 |
+
int ldb,
|
| 549 |
+
float beta,
|
| 550 |
+
float* C,
|
| 551 |
+
int ldc);
|
| 552 |
+
void CUBLASWINAPI cublasDgemm(char transa,
|
| 553 |
+
char transb,
|
| 554 |
+
int m,
|
| 555 |
+
int n,
|
| 556 |
+
int k,
|
| 557 |
+
double alpha,
|
| 558 |
+
const double* A,
|
| 559 |
+
int lda,
|
| 560 |
+
const double* B,
|
| 561 |
+
int ldb,
|
| 562 |
+
double beta,
|
| 563 |
+
double* C,
|
| 564 |
+
int ldc);
|
| 565 |
+
void CUBLASWINAPI cublasCgemm(char transa,
|
| 566 |
+
char transb,
|
| 567 |
+
int m,
|
| 568 |
+
int n,
|
| 569 |
+
int k,
|
| 570 |
+
cuComplex alpha,
|
| 571 |
+
const cuComplex* A,
|
| 572 |
+
int lda,
|
| 573 |
+
const cuComplex* B,
|
| 574 |
+
int ldb,
|
| 575 |
+
cuComplex beta,
|
| 576 |
+
cuComplex* C,
|
| 577 |
+
int ldc);
|
| 578 |
+
void CUBLASWINAPI cublasZgemm(char transa,
|
| 579 |
+
char transb,
|
| 580 |
+
int m,
|
| 581 |
+
int n,
|
| 582 |
+
int k,
|
| 583 |
+
cuDoubleComplex alpha,
|
| 584 |
+
const cuDoubleComplex* A,
|
| 585 |
+
int lda,
|
| 586 |
+
const cuDoubleComplex* B,
|
| 587 |
+
int ldb,
|
| 588 |
+
cuDoubleComplex beta,
|
| 589 |
+
cuDoubleComplex* C,
|
| 590 |
+
int ldc);
|
| 591 |
+
/* -------------------------------------------------------*/
|
| 592 |
+
/* SYRK */
|
| 593 |
+
void CUBLASWINAPI
|
| 594 |
+
cublasSsyrk(char uplo, char trans, int n, int k, float alpha, const float* A, int lda, float beta, float* C, int ldc);
|
| 595 |
+
void CUBLASWINAPI cublasDsyrk(
|
| 596 |
+
char uplo, char trans, int n, int k, double alpha, const double* A, int lda, double beta, double* C, int ldc);
|
| 597 |
+
|
| 598 |
+
void CUBLASWINAPI cublasCsyrk(char uplo,
|
| 599 |
+
char trans,
|
| 600 |
+
int n,
|
| 601 |
+
int k,
|
| 602 |
+
cuComplex alpha,
|
| 603 |
+
const cuComplex* A,
|
| 604 |
+
int lda,
|
| 605 |
+
cuComplex beta,
|
| 606 |
+
cuComplex* C,
|
| 607 |
+
int ldc);
|
| 608 |
+
void CUBLASWINAPI cublasZsyrk(char uplo,
|
| 609 |
+
char trans,
|
| 610 |
+
int n,
|
| 611 |
+
int k,
|
| 612 |
+
cuDoubleComplex alpha,
|
| 613 |
+
const cuDoubleComplex* A,
|
| 614 |
+
int lda,
|
| 615 |
+
cuDoubleComplex beta,
|
| 616 |
+
cuDoubleComplex* C,
|
| 617 |
+
int ldc);
|
| 618 |
+
/* ------------------------------------------------------- */
|
| 619 |
+
/* HERK */
|
| 620 |
+
void CUBLASWINAPI cublasCherk(
|
| 621 |
+
char uplo, char trans, int n, int k, float alpha, const cuComplex* A, int lda, float beta, cuComplex* C, int ldc);
|
| 622 |
+
void CUBLASWINAPI cublasZherk(char uplo,
|
| 623 |
+
char trans,
|
| 624 |
+
int n,
|
| 625 |
+
int k,
|
| 626 |
+
double alpha,
|
| 627 |
+
const cuDoubleComplex* A,
|
| 628 |
+
int lda,
|
| 629 |
+
double beta,
|
| 630 |
+
cuDoubleComplex* C,
|
| 631 |
+
int ldc);
|
| 632 |
+
/* ------------------------------------------------------- */
|
| 633 |
+
/* SYR2K */
|
| 634 |
+
void CUBLASWINAPI cublasSsyr2k(char uplo,
|
| 635 |
+
char trans,
|
| 636 |
+
int n,
|
| 637 |
+
int k,
|
| 638 |
+
float alpha,
|
| 639 |
+
const float* A,
|
| 640 |
+
int lda,
|
| 641 |
+
const float* B,
|
| 642 |
+
int ldb,
|
| 643 |
+
float beta,
|
| 644 |
+
float* C,
|
| 645 |
+
int ldc);
|
| 646 |
+
|
| 647 |
+
void CUBLASWINAPI cublasDsyr2k(char uplo,
|
| 648 |
+
char trans,
|
| 649 |
+
int n,
|
| 650 |
+
int k,
|
| 651 |
+
double alpha,
|
| 652 |
+
const double* A,
|
| 653 |
+
int lda,
|
| 654 |
+
const double* B,
|
| 655 |
+
int ldb,
|
| 656 |
+
double beta,
|
| 657 |
+
double* C,
|
| 658 |
+
int ldc);
|
| 659 |
+
void CUBLASWINAPI cublasCsyr2k(char uplo,
|
| 660 |
+
char trans,
|
| 661 |
+
int n,
|
| 662 |
+
int k,
|
| 663 |
+
cuComplex alpha,
|
| 664 |
+
const cuComplex* A,
|
| 665 |
+
int lda,
|
| 666 |
+
const cuComplex* B,
|
| 667 |
+
int ldb,
|
| 668 |
+
cuComplex beta,
|
| 669 |
+
cuComplex* C,
|
| 670 |
+
int ldc);
|
| 671 |
+
|
| 672 |
+
void CUBLASWINAPI cublasZsyr2k(char uplo,
|
| 673 |
+
char trans,
|
| 674 |
+
int n,
|
| 675 |
+
int k,
|
| 676 |
+
cuDoubleComplex alpha,
|
| 677 |
+
const cuDoubleComplex* A,
|
| 678 |
+
int lda,
|
| 679 |
+
const cuDoubleComplex* B,
|
| 680 |
+
int ldb,
|
| 681 |
+
cuDoubleComplex beta,
|
| 682 |
+
cuDoubleComplex* C,
|
| 683 |
+
int ldc);
|
| 684 |
+
/* ------------------------------------------------------- */
|
| 685 |
+
/* HER2K */
|
| 686 |
+
void CUBLASWINAPI cublasCher2k(char uplo,
|
| 687 |
+
char trans,
|
| 688 |
+
int n,
|
| 689 |
+
int k,
|
| 690 |
+
cuComplex alpha,
|
| 691 |
+
const cuComplex* A,
|
| 692 |
+
int lda,
|
| 693 |
+
const cuComplex* B,
|
| 694 |
+
int ldb,
|
| 695 |
+
float beta,
|
| 696 |
+
cuComplex* C,
|
| 697 |
+
int ldc);
|
| 698 |
+
|
| 699 |
+
void CUBLASWINAPI cublasZher2k(char uplo,
|
| 700 |
+
char trans,
|
| 701 |
+
int n,
|
| 702 |
+
int k,
|
| 703 |
+
cuDoubleComplex alpha,
|
| 704 |
+
const cuDoubleComplex* A,
|
| 705 |
+
int lda,
|
| 706 |
+
const cuDoubleComplex* B,
|
| 707 |
+
int ldb,
|
| 708 |
+
double beta,
|
| 709 |
+
cuDoubleComplex* C,
|
| 710 |
+
int ldc);
|
| 711 |
+
|
| 712 |
+
/*------------------------------------------------------------------------*/
|
| 713 |
+
/* SYMM*/
|
| 714 |
+
void CUBLASWINAPI cublasSsymm(char side,
|
| 715 |
+
char uplo,
|
| 716 |
+
int m,
|
| 717 |
+
int n,
|
| 718 |
+
float alpha,
|
| 719 |
+
const float* A,
|
| 720 |
+
int lda,
|
| 721 |
+
const float* B,
|
| 722 |
+
int ldb,
|
| 723 |
+
float beta,
|
| 724 |
+
float* C,
|
| 725 |
+
int ldc);
|
| 726 |
+
void CUBLASWINAPI cublasDsymm(char side,
|
| 727 |
+
char uplo,
|
| 728 |
+
int m,
|
| 729 |
+
int n,
|
| 730 |
+
double alpha,
|
| 731 |
+
const double* A,
|
| 732 |
+
int lda,
|
| 733 |
+
const double* B,
|
| 734 |
+
int ldb,
|
| 735 |
+
double beta,
|
| 736 |
+
double* C,
|
| 737 |
+
int ldc);
|
| 738 |
+
|
| 739 |
+
void CUBLASWINAPI cublasCsymm(char side,
|
| 740 |
+
char uplo,
|
| 741 |
+
int m,
|
| 742 |
+
int n,
|
| 743 |
+
cuComplex alpha,
|
| 744 |
+
const cuComplex* A,
|
| 745 |
+
int lda,
|
| 746 |
+
const cuComplex* B,
|
| 747 |
+
int ldb,
|
| 748 |
+
cuComplex beta,
|
| 749 |
+
cuComplex* C,
|
| 750 |
+
int ldc);
|
| 751 |
+
|
| 752 |
+
void CUBLASWINAPI cublasZsymm(char side,
|
| 753 |
+
char uplo,
|
| 754 |
+
int m,
|
| 755 |
+
int n,
|
| 756 |
+
cuDoubleComplex alpha,
|
| 757 |
+
const cuDoubleComplex* A,
|
| 758 |
+
int lda,
|
| 759 |
+
const cuDoubleComplex* B,
|
| 760 |
+
int ldb,
|
| 761 |
+
cuDoubleComplex beta,
|
| 762 |
+
cuDoubleComplex* C,
|
| 763 |
+
int ldc);
|
| 764 |
+
/*------------------------------------------------------------------------*/
|
| 765 |
+
/* HEMM*/
|
| 766 |
+
void CUBLASWINAPI cublasChemm(char side,
|
| 767 |
+
char uplo,
|
| 768 |
+
int m,
|
| 769 |
+
int n,
|
| 770 |
+
cuComplex alpha,
|
| 771 |
+
const cuComplex* A,
|
| 772 |
+
int lda,
|
| 773 |
+
const cuComplex* B,
|
| 774 |
+
int ldb,
|
| 775 |
+
cuComplex beta,
|
| 776 |
+
cuComplex* C,
|
| 777 |
+
int ldc);
|
| 778 |
+
void CUBLASWINAPI cublasZhemm(char side,
|
| 779 |
+
char uplo,
|
| 780 |
+
int m,
|
| 781 |
+
int n,
|
| 782 |
+
cuDoubleComplex alpha,
|
| 783 |
+
const cuDoubleComplex* A,
|
| 784 |
+
int lda,
|
| 785 |
+
const cuDoubleComplex* B,
|
| 786 |
+
int ldb,
|
| 787 |
+
cuDoubleComplex beta,
|
| 788 |
+
cuDoubleComplex* C,
|
| 789 |
+
int ldc);
|
| 790 |
+
|
| 791 |
+
/*------------------------------------------------------------------------*/
|
| 792 |
+
/* TRSM*/
|
| 793 |
+
void CUBLASWINAPI cublasStrsm(char side,
|
| 794 |
+
char uplo,
|
| 795 |
+
char transa,
|
| 796 |
+
char diag,
|
| 797 |
+
int m,
|
| 798 |
+
int n,
|
| 799 |
+
float alpha,
|
| 800 |
+
const float* A,
|
| 801 |
+
int lda,
|
| 802 |
+
float* B,
|
| 803 |
+
int ldb);
|
| 804 |
+
|
| 805 |
+
void CUBLASWINAPI cublasDtrsm(char side,
|
| 806 |
+
char uplo,
|
| 807 |
+
char transa,
|
| 808 |
+
char diag,
|
| 809 |
+
int m,
|
| 810 |
+
int n,
|
| 811 |
+
double alpha,
|
| 812 |
+
const double* A,
|
| 813 |
+
int lda,
|
| 814 |
+
double* B,
|
| 815 |
+
int ldb);
|
| 816 |
+
|
| 817 |
+
void CUBLASWINAPI cublasCtrsm(char side,
|
| 818 |
+
char uplo,
|
| 819 |
+
char transa,
|
| 820 |
+
char diag,
|
| 821 |
+
int m,
|
| 822 |
+
int n,
|
| 823 |
+
cuComplex alpha,
|
| 824 |
+
const cuComplex* A,
|
| 825 |
+
int lda,
|
| 826 |
+
cuComplex* B,
|
| 827 |
+
int ldb);
|
| 828 |
+
|
| 829 |
+
void CUBLASWINAPI cublasZtrsm(char side,
|
| 830 |
+
char uplo,
|
| 831 |
+
char transa,
|
| 832 |
+
char diag,
|
| 833 |
+
int m,
|
| 834 |
+
int n,
|
| 835 |
+
cuDoubleComplex alpha,
|
| 836 |
+
const cuDoubleComplex* A,
|
| 837 |
+
int lda,
|
| 838 |
+
cuDoubleComplex* B,
|
| 839 |
+
int ldb);
|
| 840 |
+
/*------------------------------------------------------------------------*/
|
| 841 |
+
/* TRMM*/
|
| 842 |
+
void CUBLASWINAPI cublasStrmm(char side,
|
| 843 |
+
char uplo,
|
| 844 |
+
char transa,
|
| 845 |
+
char diag,
|
| 846 |
+
int m,
|
| 847 |
+
int n,
|
| 848 |
+
float alpha,
|
| 849 |
+
const float* A,
|
| 850 |
+
int lda,
|
| 851 |
+
float* B,
|
| 852 |
+
int ldb);
|
| 853 |
+
void CUBLASWINAPI cublasDtrmm(char side,
|
| 854 |
+
char uplo,
|
| 855 |
+
char transa,
|
| 856 |
+
char diag,
|
| 857 |
+
int m,
|
| 858 |
+
int n,
|
| 859 |
+
double alpha,
|
| 860 |
+
const double* A,
|
| 861 |
+
int lda,
|
| 862 |
+
double* B,
|
| 863 |
+
int ldb);
|
| 864 |
+
void CUBLASWINAPI cublasCtrmm(char side,
|
| 865 |
+
char uplo,
|
| 866 |
+
char transa,
|
| 867 |
+
char diag,
|
| 868 |
+
int m,
|
| 869 |
+
int n,
|
| 870 |
+
cuComplex alpha,
|
| 871 |
+
const cuComplex* A,
|
| 872 |
+
int lda,
|
| 873 |
+
cuComplex* B,
|
| 874 |
+
int ldb);
|
| 875 |
+
void CUBLASWINAPI cublasZtrmm(char side,
|
| 876 |
+
char uplo,
|
| 877 |
+
char transa,
|
| 878 |
+
char diag,
|
| 879 |
+
int m,
|
| 880 |
+
int n,
|
| 881 |
+
cuDoubleComplex alpha,
|
| 882 |
+
const cuDoubleComplex* A,
|
| 883 |
+
int lda,
|
| 884 |
+
cuDoubleComplex* B,
|
| 885 |
+
int ldb);
|
| 886 |
+
|
| 887 |
+
#if defined(__cplusplus)
|
| 888 |
+
}
|
| 889 |
+
#endif /* __cplusplus */
|
| 890 |
+
|
| 891 |
+
#endif /* !defined(CUBLAS_H_) */
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublasLt.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublasXt.h
ADDED
|
@@ -0,0 +1,693 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/* cublasXt : Host API, Out of Core and Multi-GPU BLAS Library
|
| 51 |
+
|
| 52 |
+
*/
|
| 53 |
+
|
| 54 |
+
#if !defined(CUBLAS_XT_H_)
|
| 55 |
+
#define CUBLAS_XT_H_
|
| 56 |
+
|
| 57 |
+
#include "driver_types.h"
|
| 58 |
+
#include "cuComplex.h" /* import complex data type */
|
| 59 |
+
|
| 60 |
+
#include "cublas_v2.h"
|
| 61 |
+
|
| 62 |
+
#if defined(__cplusplus)
|
| 63 |
+
extern "C" {
|
| 64 |
+
#endif /* __cplusplus */
|
| 65 |
+
|
| 66 |
+
struct cublasXtContext;
|
| 67 |
+
typedef struct cublasXtContext* cublasXtHandle_t;
|
| 68 |
+
|
| 69 |
+
cublasStatus_t CUBLASWINAPI cublasXtCreate(cublasXtHandle_t* handle);
|
| 70 |
+
cublasStatus_t CUBLASWINAPI cublasXtDestroy(cublasXtHandle_t handle);
|
| 71 |
+
cublasStatus_t CUBLASWINAPI cublasXtGetNumBoards(int nbDevices, int deviceId[], int* nbBoards);
|
| 72 |
+
cublasStatus_t CUBLASWINAPI cublasXtMaxBoards(int* nbGpuBoards);
|
| 73 |
+
/* This routine selects the GPUs that the user wants to use for CUBLAS-XT */
|
| 74 |
+
cublasStatus_t CUBLASWINAPI cublasXtDeviceSelect(cublasXtHandle_t handle, int nbDevices, int deviceId[]);
|
| 75 |
+
|
| 76 |
+
/* This routine allows changing the dimension of the tiles (blockDim x blockDim) */
|
| 77 |
+
cublasStatus_t CUBLASWINAPI cublasXtSetBlockDim(cublasXtHandle_t handle, int blockDim);
|
| 78 |
+
cublasStatus_t CUBLASWINAPI cublasXtGetBlockDim(cublasXtHandle_t handle, int* blockDim);
|
| 79 |
+
|
| 80 |
+
typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t;
|
| 81 |
+
/* This routine allows CUBLAS-XT to pin the host memory if it finds that some of the matrices passed
|
| 82 |
+
are not pinned : Pinning/Unpinning the Host memory is still a costly operation
|
| 83 |
+
It is better if the user controls the memory on their own (by pinning/unpinning only when necessary)
|
| 84 |
+
*/
|
| 85 |
+
cublasStatus_t CUBLASWINAPI cublasXtGetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t* mode);
|
| 86 |
+
cublasStatus_t CUBLASWINAPI cublasXtSetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t mode);
|
| 87 |
+
|
| 88 |
+
/* This routine is used to provide CPU BLAS routines, used for sizes that are too small or for hybrid computation */
|
| 89 |
+
typedef enum {
|
| 90 |
+
CUBLASXT_FLOAT = 0,
|
| 91 |
+
CUBLASXT_DOUBLE = 1,
|
| 92 |
+
CUBLASXT_COMPLEX = 2,
|
| 93 |
+
CUBLASXT_DOUBLECOMPLEX = 3,
|
| 94 |
+
} cublasXtOpType_t;
|
| 95 |
+
|
| 96 |
+
typedef enum {
|
| 97 |
+
CUBLASXT_GEMM = 0,
|
| 98 |
+
CUBLASXT_SYRK = 1,
|
| 99 |
+
CUBLASXT_HERK = 2,
|
| 100 |
+
CUBLASXT_SYMM = 3,
|
| 101 |
+
CUBLASXT_HEMM = 4,
|
| 102 |
+
CUBLASXT_TRSM = 5,
|
| 103 |
+
CUBLASXT_SYR2K = 6,
|
| 104 |
+
CUBLASXT_HER2K = 7,
|
| 105 |
+
|
| 106 |
+
CUBLASXT_SPMM = 8,
|
| 107 |
+
CUBLASXT_SYRKX = 9,
|
| 108 |
+
CUBLASXT_HERKX = 10,
|
| 109 |
+
CUBLASXT_TRMM = 11,
|
| 110 |
+
CUBLASXT_ROUTINE_MAX = 12,
|
| 111 |
+
} cublasXtBlasOp_t;
|
| 112 |
+
|
| 113 |
+
/* Currently only 32-bit integer BLAS routines are supported */
|
| 114 |
+
cublasStatus_t CUBLASWINAPI cublasXtSetCpuRoutine(cublasXtHandle_t handle,
|
| 115 |
+
cublasXtBlasOp_t blasOp,
|
| 116 |
+
cublasXtOpType_t type,
|
| 117 |
+
void* blasFunctor);
|
| 118 |
+
|
| 119 |
+
/* Specifies the percentage of work that should be done by the CPU; the default is 0 (no work) */
|
| 120 |
+
cublasStatus_t CUBLASWINAPI cublasXtSetCpuRatio(cublasXtHandle_t handle,
|
| 121 |
+
cublasXtBlasOp_t blasOp,
|
| 122 |
+
cublasXtOpType_t type,
|
| 123 |
+
float ratio);
|
| 124 |
+
|
| 125 |
+
/* GEMM */
|
| 126 |
+
cublasStatus_t CUBLASWINAPI cublasXtSgemm(cublasXtHandle_t handle,
|
| 127 |
+
cublasOperation_t transa,
|
| 128 |
+
cublasOperation_t transb,
|
| 129 |
+
size_t m,
|
| 130 |
+
size_t n,
|
| 131 |
+
size_t k,
|
| 132 |
+
const float* alpha,
|
| 133 |
+
const float* A,
|
| 134 |
+
size_t lda,
|
| 135 |
+
const float* B,
|
| 136 |
+
size_t ldb,
|
| 137 |
+
const float* beta,
|
| 138 |
+
float* C,
|
| 139 |
+
size_t ldc);
|
| 140 |
+
|
| 141 |
+
cublasStatus_t CUBLASWINAPI cublasXtDgemm(cublasXtHandle_t handle,
|
| 142 |
+
cublasOperation_t transa,
|
| 143 |
+
cublasOperation_t transb,
|
| 144 |
+
size_t m,
|
| 145 |
+
size_t n,
|
| 146 |
+
size_t k,
|
| 147 |
+
const double* alpha,
|
| 148 |
+
const double* A,
|
| 149 |
+
size_t lda,
|
| 150 |
+
const double* B,
|
| 151 |
+
size_t ldb,
|
| 152 |
+
const double* beta,
|
| 153 |
+
double* C,
|
| 154 |
+
size_t ldc);
|
| 155 |
+
|
| 156 |
+
cublasStatus_t CUBLASWINAPI cublasXtCgemm(cublasXtHandle_t handle,
|
| 157 |
+
cublasOperation_t transa,
|
| 158 |
+
cublasOperation_t transb,
|
| 159 |
+
size_t m,
|
| 160 |
+
size_t n,
|
| 161 |
+
size_t k,
|
| 162 |
+
const cuComplex* alpha,
|
| 163 |
+
const cuComplex* A,
|
| 164 |
+
size_t lda,
|
| 165 |
+
const cuComplex* B,
|
| 166 |
+
size_t ldb,
|
| 167 |
+
const cuComplex* beta,
|
| 168 |
+
cuComplex* C,
|
| 169 |
+
size_t ldc);
|
| 170 |
+
|
| 171 |
+
cublasStatus_t CUBLASWINAPI cublasXtZgemm(cublasXtHandle_t handle,
|
| 172 |
+
cublasOperation_t transa,
|
| 173 |
+
cublasOperation_t transb,
|
| 174 |
+
size_t m,
|
| 175 |
+
size_t n,
|
| 176 |
+
size_t k,
|
| 177 |
+
const cuDoubleComplex* alpha,
|
| 178 |
+
const cuDoubleComplex* A,
|
| 179 |
+
size_t lda,
|
| 180 |
+
const cuDoubleComplex* B,
|
| 181 |
+
size_t ldb,
|
| 182 |
+
const cuDoubleComplex* beta,
|
| 183 |
+
cuDoubleComplex* C,
|
| 184 |
+
size_t ldc);
|
| 185 |
+
/* ------------------------------------------------------- */
|
| 186 |
+
/* SYRK */
|
| 187 |
+
cublasStatus_t CUBLASWINAPI cublasXtSsyrk(cublasXtHandle_t handle,
|
| 188 |
+
cublasFillMode_t uplo,
|
| 189 |
+
cublasOperation_t trans,
|
| 190 |
+
size_t n,
|
| 191 |
+
size_t k,
|
| 192 |
+
const float* alpha,
|
| 193 |
+
const float* A,
|
| 194 |
+
size_t lda,
|
| 195 |
+
const float* beta,
|
| 196 |
+
float* C,
|
| 197 |
+
size_t ldc);
|
| 198 |
+
|
| 199 |
+
cublasStatus_t CUBLASWINAPI cublasXtDsyrk(cublasXtHandle_t handle,
|
| 200 |
+
cublasFillMode_t uplo,
|
| 201 |
+
cublasOperation_t trans,
|
| 202 |
+
size_t n,
|
| 203 |
+
size_t k,
|
| 204 |
+
const double* alpha,
|
| 205 |
+
const double* A,
|
| 206 |
+
size_t lda,
|
| 207 |
+
const double* beta,
|
| 208 |
+
double* C,
|
| 209 |
+
size_t ldc);
|
| 210 |
+
|
| 211 |
+
cublasStatus_t CUBLASWINAPI cublasXtCsyrk(cublasXtHandle_t handle,
|
| 212 |
+
cublasFillMode_t uplo,
|
| 213 |
+
cublasOperation_t trans,
|
| 214 |
+
size_t n,
|
| 215 |
+
size_t k,
|
| 216 |
+
const cuComplex* alpha,
|
| 217 |
+
const cuComplex* A,
|
| 218 |
+
size_t lda,
|
| 219 |
+
const cuComplex* beta,
|
| 220 |
+
cuComplex* C,
|
| 221 |
+
size_t ldc);
|
| 222 |
+
|
| 223 |
+
cublasStatus_t CUBLASWINAPI cublasXtZsyrk(cublasXtHandle_t handle,
|
| 224 |
+
cublasFillMode_t uplo,
|
| 225 |
+
cublasOperation_t trans,
|
| 226 |
+
size_t n,
|
| 227 |
+
size_t k,
|
| 228 |
+
const cuDoubleComplex* alpha,
|
| 229 |
+
const cuDoubleComplex* A,
|
| 230 |
+
size_t lda,
|
| 231 |
+
const cuDoubleComplex* beta,
|
| 232 |
+
cuDoubleComplex* C,
|
| 233 |
+
size_t ldc);
|
| 234 |
+
/* -------------------------------------------------------------------- */
|
| 235 |
+
/* HERK */
|
| 236 |
+
cublasStatus_t CUBLASWINAPI cublasXtCherk(cublasXtHandle_t handle,
|
| 237 |
+
cublasFillMode_t uplo,
|
| 238 |
+
cublasOperation_t trans,
|
| 239 |
+
size_t n,
|
| 240 |
+
size_t k,
|
| 241 |
+
const float* alpha,
|
| 242 |
+
const cuComplex* A,
|
| 243 |
+
size_t lda,
|
| 244 |
+
const float* beta,
|
| 245 |
+
cuComplex* C,
|
| 246 |
+
size_t ldc);
|
| 247 |
+
|
| 248 |
+
cublasStatus_t CUBLASWINAPI cublasXtZherk(cublasXtHandle_t handle,
|
| 249 |
+
cublasFillMode_t uplo,
|
| 250 |
+
cublasOperation_t trans,
|
| 251 |
+
size_t n,
|
| 252 |
+
size_t k,
|
| 253 |
+
const double* alpha,
|
| 254 |
+
const cuDoubleComplex* A,
|
| 255 |
+
size_t lda,
|
| 256 |
+
const double* beta,
|
| 257 |
+
cuDoubleComplex* C,
|
| 258 |
+
size_t ldc);
|
| 259 |
+
/* -------------------------------------------------------------------- */
|
| 260 |
+
/* SYR2K */
|
| 261 |
+
cublasStatus_t CUBLASWINAPI cublasXtSsyr2k(cublasXtHandle_t handle,
|
| 262 |
+
cublasFillMode_t uplo,
|
| 263 |
+
cublasOperation_t trans,
|
| 264 |
+
size_t n,
|
| 265 |
+
size_t k,
|
| 266 |
+
const float* alpha,
|
| 267 |
+
const float* A,
|
| 268 |
+
size_t lda,
|
| 269 |
+
const float* B,
|
| 270 |
+
size_t ldb,
|
| 271 |
+
const float* beta,
|
| 272 |
+
float* C,
|
| 273 |
+
size_t ldc);
|
| 274 |
+
|
| 275 |
+
cublasStatus_t CUBLASWINAPI cublasXtDsyr2k(cublasXtHandle_t handle,
|
| 276 |
+
cublasFillMode_t uplo,
|
| 277 |
+
cublasOperation_t trans,
|
| 278 |
+
size_t n,
|
| 279 |
+
size_t k,
|
| 280 |
+
const double* alpha,
|
| 281 |
+
const double* A,
|
| 282 |
+
size_t lda,
|
| 283 |
+
const double* B,
|
| 284 |
+
size_t ldb,
|
| 285 |
+
const double* beta,
|
| 286 |
+
double* C,
|
| 287 |
+
size_t ldc);
|
| 288 |
+
|
| 289 |
+
cublasStatus_t CUBLASWINAPI cublasXtCsyr2k(cublasXtHandle_t handle,
|
| 290 |
+
cublasFillMode_t uplo,
|
| 291 |
+
cublasOperation_t trans,
|
| 292 |
+
size_t n,
|
| 293 |
+
size_t k,
|
| 294 |
+
const cuComplex* alpha,
|
| 295 |
+
const cuComplex* A,
|
| 296 |
+
size_t lda,
|
| 297 |
+
const cuComplex* B,
|
| 298 |
+
size_t ldb,
|
| 299 |
+
const cuComplex* beta,
|
| 300 |
+
cuComplex* C,
|
| 301 |
+
size_t ldc);
|
| 302 |
+
|
| 303 |
+
cublasStatus_t CUBLASWINAPI cublasXtZsyr2k(cublasXtHandle_t handle,
|
| 304 |
+
cublasFillMode_t uplo,
|
| 305 |
+
cublasOperation_t trans,
|
| 306 |
+
size_t n,
|
| 307 |
+
size_t k,
|
| 308 |
+
const cuDoubleComplex* alpha,
|
| 309 |
+
const cuDoubleComplex* A,
|
| 310 |
+
size_t lda,
|
| 311 |
+
const cuDoubleComplex* B,
|
| 312 |
+
size_t ldb,
|
| 313 |
+
const cuDoubleComplex* beta,
|
| 314 |
+
cuDoubleComplex* C,
|
| 315 |
+
size_t ldc);
|
| 316 |
+
/* -------------------------------------------------------------------- */
|
| 317 |
+
/* HERKX : variant extension of HERK */
|
| 318 |
+
cublasStatus_t CUBLASWINAPI cublasXtCherkx(cublasXtHandle_t handle,
|
| 319 |
+
cublasFillMode_t uplo,
|
| 320 |
+
cublasOperation_t trans,
|
| 321 |
+
size_t n,
|
| 322 |
+
size_t k,
|
| 323 |
+
const cuComplex* alpha,
|
| 324 |
+
const cuComplex* A,
|
| 325 |
+
size_t lda,
|
| 326 |
+
const cuComplex* B,
|
| 327 |
+
size_t ldb,
|
| 328 |
+
const float* beta,
|
| 329 |
+
cuComplex* C,
|
| 330 |
+
size_t ldc);
|
| 331 |
+
|
| 332 |
+
cublasStatus_t CUBLASWINAPI cublasXtZherkx(cublasXtHandle_t handle,
|
| 333 |
+
cublasFillMode_t uplo,
|
| 334 |
+
cublasOperation_t trans,
|
| 335 |
+
size_t n,
|
| 336 |
+
size_t k,
|
| 337 |
+
const cuDoubleComplex* alpha,
|
| 338 |
+
const cuDoubleComplex* A,
|
| 339 |
+
size_t lda,
|
| 340 |
+
const cuDoubleComplex* B,
|
| 341 |
+
size_t ldb,
|
| 342 |
+
const double* beta,
|
| 343 |
+
cuDoubleComplex* C,
|
| 344 |
+
size_t ldc);
|
| 345 |
+
|
| 346 |
+
/* -------------------------------------------------------------------- */
|
| 347 |
+
/* TRSM */
|
| 348 |
+
cublasStatus_t CUBLASWINAPI cublasXtStrsm(cublasXtHandle_t handle,
|
| 349 |
+
cublasSideMode_t side,
|
| 350 |
+
cublasFillMode_t uplo,
|
| 351 |
+
cublasOperation_t trans,
|
| 352 |
+
cublasDiagType_t diag,
|
| 353 |
+
size_t m,
|
| 354 |
+
size_t n,
|
| 355 |
+
const float* alpha,
|
| 356 |
+
const float* A,
|
| 357 |
+
size_t lda,
|
| 358 |
+
float* B,
|
| 359 |
+
size_t ldb);
|
| 360 |
+
|
| 361 |
+
cublasStatus_t CUBLASWINAPI cublasXtDtrsm(cublasXtHandle_t handle,
|
| 362 |
+
cublasSideMode_t side,
|
| 363 |
+
cublasFillMode_t uplo,
|
| 364 |
+
cublasOperation_t trans,
|
| 365 |
+
cublasDiagType_t diag,
|
| 366 |
+
size_t m,
|
| 367 |
+
size_t n,
|
| 368 |
+
const double* alpha,
|
| 369 |
+
const double* A,
|
| 370 |
+
size_t lda,
|
| 371 |
+
double* B,
|
| 372 |
+
size_t ldb);
|
| 373 |
+
|
| 374 |
+
cublasStatus_t CUBLASWINAPI cublasXtCtrsm(cublasXtHandle_t handle,
|
| 375 |
+
cublasSideMode_t side,
|
| 376 |
+
cublasFillMode_t uplo,
|
| 377 |
+
cublasOperation_t trans,
|
| 378 |
+
cublasDiagType_t diag,
|
| 379 |
+
size_t m,
|
| 380 |
+
size_t n,
|
| 381 |
+
const cuComplex* alpha,
|
| 382 |
+
const cuComplex* A,
|
| 383 |
+
size_t lda,
|
| 384 |
+
cuComplex* B,
|
| 385 |
+
size_t ldb);
|
| 386 |
+
|
| 387 |
+
cublasStatus_t CUBLASWINAPI cublasXtZtrsm(cublasXtHandle_t handle,
|
| 388 |
+
cublasSideMode_t side,
|
| 389 |
+
cublasFillMode_t uplo,
|
| 390 |
+
cublasOperation_t trans,
|
| 391 |
+
cublasDiagType_t diag,
|
| 392 |
+
size_t m,
|
| 393 |
+
size_t n,
|
| 394 |
+
const cuDoubleComplex* alpha,
|
| 395 |
+
const cuDoubleComplex* A,
|
| 396 |
+
size_t lda,
|
| 397 |
+
cuDoubleComplex* B,
|
| 398 |
+
size_t ldb);
|
| 399 |
+
/* -------------------------------------------------------------------- */
|
| 400 |
+
/* SYMM : Symmetric Matrix Multiply */
|
| 401 |
+
cublasStatus_t CUBLASWINAPI cublasXtSsymm(cublasXtHandle_t handle,
|
| 402 |
+
cublasSideMode_t side,
|
| 403 |
+
cublasFillMode_t uplo,
|
| 404 |
+
size_t m,
|
| 405 |
+
size_t n,
|
| 406 |
+
const float* alpha,
|
| 407 |
+
const float* A,
|
| 408 |
+
size_t lda,
|
| 409 |
+
const float* B,
|
| 410 |
+
size_t ldb,
|
| 411 |
+
const float* beta,
|
| 412 |
+
float* C,
|
| 413 |
+
size_t ldc);
|
| 414 |
+
|
| 415 |
+
cublasStatus_t CUBLASWINAPI cublasXtDsymm(cublasXtHandle_t handle,
|
| 416 |
+
cublasSideMode_t side,
|
| 417 |
+
cublasFillMode_t uplo,
|
| 418 |
+
size_t m,
|
| 419 |
+
size_t n,
|
| 420 |
+
const double* alpha,
|
| 421 |
+
const double* A,
|
| 422 |
+
size_t lda,
|
| 423 |
+
const double* B,
|
| 424 |
+
size_t ldb,
|
| 425 |
+
const double* beta,
|
| 426 |
+
double* C,
|
| 427 |
+
size_t ldc);
|
| 428 |
+
|
| 429 |
+
cublasStatus_t CUBLASWINAPI cublasXtCsymm(cublasXtHandle_t handle,
|
| 430 |
+
cublasSideMode_t side,
|
| 431 |
+
cublasFillMode_t uplo,
|
| 432 |
+
size_t m,
|
| 433 |
+
size_t n,
|
| 434 |
+
const cuComplex* alpha,
|
| 435 |
+
const cuComplex* A,
|
| 436 |
+
size_t lda,
|
| 437 |
+
const cuComplex* B,
|
| 438 |
+
size_t ldb,
|
| 439 |
+
const cuComplex* beta,
|
| 440 |
+
cuComplex* C,
|
| 441 |
+
size_t ldc);
|
| 442 |
+
|
| 443 |
+
cublasStatus_t CUBLASWINAPI cublasXtZsymm(cublasXtHandle_t handle,
|
| 444 |
+
cublasSideMode_t side,
|
| 445 |
+
cublasFillMode_t uplo,
|
| 446 |
+
size_t m,
|
| 447 |
+
size_t n,
|
| 448 |
+
const cuDoubleComplex* alpha,
|
| 449 |
+
const cuDoubleComplex* A,
|
| 450 |
+
size_t lda,
|
| 451 |
+
const cuDoubleComplex* B,
|
| 452 |
+
size_t ldb,
|
| 453 |
+
const cuDoubleComplex* beta,
|
| 454 |
+
cuDoubleComplex* C,
|
| 455 |
+
size_t ldc);
|
| 456 |
+
/* -------------------------------------------------------------------- */
|
| 457 |
+
/* HEMM : Hermitian Matrix Multiply */
|
| 458 |
+
cublasStatus_t CUBLASWINAPI cublasXtChemm(cublasXtHandle_t handle,
|
| 459 |
+
cublasSideMode_t side,
|
| 460 |
+
cublasFillMode_t uplo,
|
| 461 |
+
size_t m,
|
| 462 |
+
size_t n,
|
| 463 |
+
const cuComplex* alpha,
|
| 464 |
+
const cuComplex* A,
|
| 465 |
+
size_t lda,
|
| 466 |
+
const cuComplex* B,
|
| 467 |
+
size_t ldb,
|
| 468 |
+
const cuComplex* beta,
|
| 469 |
+
cuComplex* C,
|
| 470 |
+
size_t ldc);
|
| 471 |
+
|
| 472 |
+
cublasStatus_t CUBLASWINAPI cublasXtZhemm(cublasXtHandle_t handle,
|
| 473 |
+
cublasSideMode_t side,
|
| 474 |
+
cublasFillMode_t uplo,
|
| 475 |
+
size_t m,
|
| 476 |
+
size_t n,
|
| 477 |
+
const cuDoubleComplex* alpha,
|
| 478 |
+
const cuDoubleComplex* A,
|
| 479 |
+
size_t lda,
|
| 480 |
+
const cuDoubleComplex* B,
|
| 481 |
+
size_t ldb,
|
| 482 |
+
const cuDoubleComplex* beta,
|
| 483 |
+
cuDoubleComplex* C,
|
| 484 |
+
size_t ldc);
|
| 485 |
+
|
| 486 |
+
/* -------------------------------------------------------------------- */
|
| 487 |
+
/* SYRKX : variant extension of SYRK */
|
| 488 |
+
cublasStatus_t CUBLASWINAPI cublasXtSsyrkx(cublasXtHandle_t handle,
|
| 489 |
+
cublasFillMode_t uplo,
|
| 490 |
+
cublasOperation_t trans,
|
| 491 |
+
size_t n,
|
| 492 |
+
size_t k,
|
| 493 |
+
const float* alpha,
|
| 494 |
+
const float* A,
|
| 495 |
+
size_t lda,
|
| 496 |
+
const float* B,
|
| 497 |
+
size_t ldb,
|
| 498 |
+
const float* beta,
|
| 499 |
+
float* C,
|
| 500 |
+
size_t ldc);
|
| 501 |
+
|
| 502 |
+
cublasStatus_t CUBLASWINAPI cublasXtDsyrkx(cublasXtHandle_t handle,
|
| 503 |
+
cublasFillMode_t uplo,
|
| 504 |
+
cublasOperation_t trans,
|
| 505 |
+
size_t n,
|
| 506 |
+
size_t k,
|
| 507 |
+
const double* alpha,
|
| 508 |
+
const double* A,
|
| 509 |
+
size_t lda,
|
| 510 |
+
const double* B,
|
| 511 |
+
size_t ldb,
|
| 512 |
+
const double* beta,
|
| 513 |
+
double* C,
|
| 514 |
+
size_t ldc);
|
| 515 |
+
|
| 516 |
+
cublasStatus_t CUBLASWINAPI cublasXtCsyrkx(cublasXtHandle_t handle,
|
| 517 |
+
cublasFillMode_t uplo,
|
| 518 |
+
cublasOperation_t trans,
|
| 519 |
+
size_t n,
|
| 520 |
+
size_t k,
|
| 521 |
+
const cuComplex* alpha,
|
| 522 |
+
const cuComplex* A,
|
| 523 |
+
size_t lda,
|
| 524 |
+
const cuComplex* B,
|
| 525 |
+
size_t ldb,
|
| 526 |
+
const cuComplex* beta,
|
| 527 |
+
cuComplex* C,
|
| 528 |
+
size_t ldc);
|
| 529 |
+
|
| 530 |
+
cublasStatus_t CUBLASWINAPI cublasXtZsyrkx(cublasXtHandle_t handle,
|
| 531 |
+
cublasFillMode_t uplo,
|
| 532 |
+
cublasOperation_t trans,
|
| 533 |
+
size_t n,
|
| 534 |
+
size_t k,
|
| 535 |
+
const cuDoubleComplex* alpha,
|
| 536 |
+
const cuDoubleComplex* A,
|
| 537 |
+
size_t lda,
|
| 538 |
+
const cuDoubleComplex* B,
|
| 539 |
+
size_t ldb,
|
| 540 |
+
const cuDoubleComplex* beta,
|
| 541 |
+
cuDoubleComplex* C,
|
| 542 |
+
size_t ldc);
|
| 543 |
+
/* -------------------------------------------------------------------- */
|
| 544 |
+
/* HER2K : Hermitian rank-2k update */
|
| 545 |
+
cublasStatus_t CUBLASWINAPI cublasXtCher2k(cublasXtHandle_t handle,
|
| 546 |
+
cublasFillMode_t uplo,
|
| 547 |
+
cublasOperation_t trans,
|
| 548 |
+
size_t n,
|
| 549 |
+
size_t k,
|
| 550 |
+
const cuComplex* alpha,
|
| 551 |
+
const cuComplex* A,
|
| 552 |
+
size_t lda,
|
| 553 |
+
const cuComplex* B,
|
| 554 |
+
size_t ldb,
|
| 555 |
+
const float* beta,
|
| 556 |
+
cuComplex* C,
|
| 557 |
+
size_t ldc);
|
| 558 |
+
|
| 559 |
+
cublasStatus_t CUBLASWINAPI cublasXtZher2k(cublasXtHandle_t handle,
|
| 560 |
+
cublasFillMode_t uplo,
|
| 561 |
+
cublasOperation_t trans,
|
| 562 |
+
size_t n,
|
| 563 |
+
size_t k,
|
| 564 |
+
const cuDoubleComplex* alpha,
|
| 565 |
+
const cuDoubleComplex* A,
|
| 566 |
+
size_t lda,
|
| 567 |
+
const cuDoubleComplex* B,
|
| 568 |
+
size_t ldb,
|
| 569 |
+
const double* beta,
|
| 570 |
+
cuDoubleComplex* C,
|
| 571 |
+
size_t ldc);
|
| 572 |
+
|
| 573 |
+
/* -------------------------------------------------------------------- */
|
| 574 |
+
/* SPMM : Symmetric Packed Matrix Multiply */
|
| 575 |
+
cublasStatus_t CUBLASWINAPI cublasXtSspmm(cublasXtHandle_t handle,
|
| 576 |
+
cublasSideMode_t side,
|
| 577 |
+
cublasFillMode_t uplo,
|
| 578 |
+
size_t m,
|
| 579 |
+
size_t n,
|
| 580 |
+
const float* alpha,
|
| 581 |
+
const float* AP,
|
| 582 |
+
const float* B,
|
| 583 |
+
size_t ldb,
|
| 584 |
+
const float* beta,
|
| 585 |
+
float* C,
|
| 586 |
+
size_t ldc);
|
| 587 |
+
|
| 588 |
+
cublasStatus_t CUBLASWINAPI cublasXtDspmm(cublasXtHandle_t handle,
|
| 589 |
+
cublasSideMode_t side,
|
| 590 |
+
cublasFillMode_t uplo,
|
| 591 |
+
size_t m,
|
| 592 |
+
size_t n,
|
| 593 |
+
const double* alpha,
|
| 594 |
+
const double* AP,
|
| 595 |
+
const double* B,
|
| 596 |
+
size_t ldb,
|
| 597 |
+
const double* beta,
|
| 598 |
+
double* C,
|
| 599 |
+
size_t ldc);
|
| 600 |
+
|
| 601 |
+
cublasStatus_t CUBLASWINAPI cublasXtCspmm(cublasXtHandle_t handle,
|
| 602 |
+
cublasSideMode_t side,
|
| 603 |
+
cublasFillMode_t uplo,
|
| 604 |
+
size_t m,
|
| 605 |
+
size_t n,
|
| 606 |
+
const cuComplex* alpha,
|
| 607 |
+
const cuComplex* AP,
|
| 608 |
+
const cuComplex* B,
|
| 609 |
+
size_t ldb,
|
| 610 |
+
const cuComplex* beta,
|
| 611 |
+
cuComplex* C,
|
| 612 |
+
size_t ldc);
|
| 613 |
+
|
| 614 |
+
cublasStatus_t CUBLASWINAPI cublasXtZspmm(cublasXtHandle_t handle,
|
| 615 |
+
cublasSideMode_t side,
|
| 616 |
+
cublasFillMode_t uplo,
|
| 617 |
+
size_t m,
|
| 618 |
+
size_t n,
|
| 619 |
+
const cuDoubleComplex* alpha,
|
| 620 |
+
const cuDoubleComplex* AP,
|
| 621 |
+
const cuDoubleComplex* B,
|
| 622 |
+
size_t ldb,
|
| 623 |
+
const cuDoubleComplex* beta,
|
| 624 |
+
cuDoubleComplex* C,
|
| 625 |
+
size_t ldc);
|
| 626 |
+
|
| 627 |
+
/* -------------------------------------------------------------------- */
|
| 628 |
+
/* TRMM */
|
| 629 |
+
cublasStatus_t CUBLASWINAPI cublasXtStrmm(cublasXtHandle_t handle,
|
| 630 |
+
cublasSideMode_t side,
|
| 631 |
+
cublasFillMode_t uplo,
|
| 632 |
+
cublasOperation_t trans,
|
| 633 |
+
cublasDiagType_t diag,
|
| 634 |
+
size_t m,
|
| 635 |
+
size_t n,
|
| 636 |
+
const float* alpha,
|
| 637 |
+
const float* A,
|
| 638 |
+
size_t lda,
|
| 639 |
+
const float* B,
|
| 640 |
+
size_t ldb,
|
| 641 |
+
float* C,
|
| 642 |
+
size_t ldc);
|
| 643 |
+
|
| 644 |
+
cublasStatus_t CUBLASWINAPI cublasXtDtrmm(cublasXtHandle_t handle,
|
| 645 |
+
cublasSideMode_t side,
|
| 646 |
+
cublasFillMode_t uplo,
|
| 647 |
+
cublasOperation_t trans,
|
| 648 |
+
cublasDiagType_t diag,
|
| 649 |
+
size_t m,
|
| 650 |
+
size_t n,
|
| 651 |
+
const double* alpha,
|
| 652 |
+
const double* A,
|
| 653 |
+
size_t lda,
|
| 654 |
+
const double* B,
|
| 655 |
+
size_t ldb,
|
| 656 |
+
double* C,
|
| 657 |
+
size_t ldc);
|
| 658 |
+
|
| 659 |
+
cublasStatus_t CUBLASWINAPI cublasXtCtrmm(cublasXtHandle_t handle,
|
| 660 |
+
cublasSideMode_t side,
|
| 661 |
+
cublasFillMode_t uplo,
|
| 662 |
+
cublasOperation_t trans,
|
| 663 |
+
cublasDiagType_t diag,
|
| 664 |
+
size_t m,
|
| 665 |
+
size_t n,
|
| 666 |
+
const cuComplex* alpha,
|
| 667 |
+
const cuComplex* A,
|
| 668 |
+
size_t lda,
|
| 669 |
+
const cuComplex* B,
|
| 670 |
+
size_t ldb,
|
| 671 |
+
cuComplex* C,
|
| 672 |
+
size_t ldc);
|
| 673 |
+
|
| 674 |
+
cublasStatus_t CUBLASWINAPI cublasXtZtrmm(cublasXtHandle_t handle,
|
| 675 |
+
cublasSideMode_t side,
|
| 676 |
+
cublasFillMode_t uplo,
|
| 677 |
+
cublasOperation_t trans,
|
| 678 |
+
cublasDiagType_t diag,
|
| 679 |
+
size_t m,
|
| 680 |
+
size_t n,
|
| 681 |
+
const cuDoubleComplex* alpha,
|
| 682 |
+
const cuDoubleComplex* A,
|
| 683 |
+
size_t lda,
|
| 684 |
+
const cuDoubleComplex* B,
|
| 685 |
+
size_t ldb,
|
| 686 |
+
cuDoubleComplex* C,
|
| 687 |
+
size_t ldc);
|
| 688 |
+
|
| 689 |
+
#if defined(__cplusplus)
|
| 690 |
+
}
|
| 691 |
+
#endif /* __cplusplus */
|
| 692 |
+
|
| 693 |
+
#endif /* !defined(CUBLAS_XT_H_) */
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublas_api.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublas_v2.h
ADDED
|
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/*
|
| 51 |
+
* This is the public header file for the new CUBLAS library API; it maps the generic
|
| 52 |
+
* cuBLAS function names to the actual _v2 implementations.
|
| 53 |
+
*/
|
| 54 |
+
|
| 55 |
+
#if !defined(CUBLAS_V2_H_)
|
| 56 |
+
#define CUBLAS_V2_H_
|
| 57 |
+
|
| 58 |
+
#if defined(CUBLAS_H_)
|
| 59 |
+
#error "It is an error to include both cublas.h and cublas_v2.h"
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
#undef CUBLASAPI
|
| 63 |
+
#ifdef __CUDACC__
|
| 64 |
+
#define CUBLASAPI __host__ __device__
|
| 65 |
+
#else
|
| 66 |
+
#define CUBLASAPI
|
| 67 |
+
#endif
|
| 68 |
+
|
| 69 |
+
#include "cublas_api.h"
|
| 70 |
+
|
| 71 |
+
#define cublasCreate cublasCreate_v2
|
| 72 |
+
#define cublasDestroy cublasDestroy_v2
|
| 73 |
+
#define cublasGetVersion cublasGetVersion_v2
|
| 74 |
+
#define cublasSetWorkspace cublasSetWorkspace_v2
|
| 75 |
+
#define cublasSetStream cublasSetStream_v2
|
| 76 |
+
#define cublasGetStream cublasGetStream_v2
|
| 77 |
+
#define cublasGetPointerMode cublasGetPointerMode_v2
|
| 78 |
+
#define cublasSetPointerMode cublasSetPointerMode_v2
|
| 79 |
+
|
| 80 |
+
/* 32-bit integer */
|
| 81 |
+
|
| 82 |
+
/* Blas1 Routines */
|
| 83 |
+
|
| 84 |
+
#define cublasSnrm2 cublasSnrm2_v2
|
| 85 |
+
#define cublasDnrm2 cublasDnrm2_v2
|
| 86 |
+
#define cublasScnrm2 cublasScnrm2_v2
|
| 87 |
+
#define cublasDznrm2 cublasDznrm2_v2
|
| 88 |
+
|
| 89 |
+
#define cublasSdot cublasSdot_v2
|
| 90 |
+
#define cublasDdot cublasDdot_v2
|
| 91 |
+
#define cublasCdotu cublasCdotu_v2
|
| 92 |
+
#define cublasCdotc cublasCdotc_v2
|
| 93 |
+
#define cublasZdotu cublasZdotu_v2
|
| 94 |
+
#define cublasZdotc cublasZdotc_v2
|
| 95 |
+
|
| 96 |
+
#define cublasSscal cublasSscal_v2
|
| 97 |
+
#define cublasDscal cublasDscal_v2
|
| 98 |
+
#define cublasCscal cublasCscal_v2
|
| 99 |
+
#define cublasCsscal cublasCsscal_v2
|
| 100 |
+
#define cublasZscal cublasZscal_v2
|
| 101 |
+
#define cublasZdscal cublasZdscal_v2
|
| 102 |
+
|
| 103 |
+
#define cublasSaxpy cublasSaxpy_v2
|
| 104 |
+
#define cublasDaxpy cublasDaxpy_v2
|
| 105 |
+
#define cublasCaxpy cublasCaxpy_v2
|
| 106 |
+
#define cublasZaxpy cublasZaxpy_v2
|
| 107 |
+
|
| 108 |
+
#define cublasScopy cublasScopy_v2
|
| 109 |
+
#define cublasDcopy cublasDcopy_v2
|
| 110 |
+
#define cublasCcopy cublasCcopy_v2
|
| 111 |
+
#define cublasZcopy cublasZcopy_v2
|
| 112 |
+
|
| 113 |
+
#define cublasSswap cublasSswap_v2
|
| 114 |
+
#define cublasDswap cublasDswap_v2
|
| 115 |
+
#define cublasCswap cublasCswap_v2
|
| 116 |
+
#define cublasZswap cublasZswap_v2
|
| 117 |
+
|
| 118 |
+
#define cublasIsamax cublasIsamax_v2
|
| 119 |
+
#define cublasIdamax cublasIdamax_v2
|
| 120 |
+
#define cublasIcamax cublasIcamax_v2
|
| 121 |
+
#define cublasIzamax cublasIzamax_v2
|
| 122 |
+
|
| 123 |
+
#define cublasIsamin cublasIsamin_v2
|
| 124 |
+
#define cublasIdamin cublasIdamin_v2
|
| 125 |
+
#define cublasIcamin cublasIcamin_v2
|
| 126 |
+
#define cublasIzamin cublasIzamin_v2
|
| 127 |
+
|
| 128 |
+
#define cublasSasum cublasSasum_v2
|
| 129 |
+
#define cublasDasum cublasDasum_v2
|
| 130 |
+
#define cublasScasum cublasScasum_v2
|
| 131 |
+
#define cublasDzasum cublasDzasum_v2
|
| 132 |
+
|
| 133 |
+
#define cublasSrot cublasSrot_v2
|
| 134 |
+
#define cublasDrot cublasDrot_v2
|
| 135 |
+
#define cublasCrot cublasCrot_v2
|
| 136 |
+
#define cublasCsrot cublasCsrot_v2
|
| 137 |
+
#define cublasZrot cublasZrot_v2
|
| 138 |
+
#define cublasZdrot cublasZdrot_v2
|
| 139 |
+
|
| 140 |
+
#define cublasSrotg cublasSrotg_v2
|
| 141 |
+
#define cublasDrotg cublasDrotg_v2
|
| 142 |
+
#define cublasCrotg cublasCrotg_v2
|
| 143 |
+
#define cublasZrotg cublasZrotg_v2
|
| 144 |
+
|
| 145 |
+
#define cublasSrotm cublasSrotm_v2
|
| 146 |
+
#define cublasDrotm cublasDrotm_v2
|
| 147 |
+
|
| 148 |
+
#define cublasSrotmg cublasSrotmg_v2
|
| 149 |
+
#define cublasDrotmg cublasDrotmg_v2
|
| 150 |
+
|
| 151 |
+
/* Blas2 Routines */
|
| 152 |
+
|
| 153 |
+
#define cublasSgemv cublasSgemv_v2
|
| 154 |
+
#define cublasDgemv cublasDgemv_v2
|
| 155 |
+
#define cublasCgemv cublasCgemv_v2
|
| 156 |
+
#define cublasZgemv cublasZgemv_v2
|
| 157 |
+
|
| 158 |
+
#define cublasSgbmv cublasSgbmv_v2
|
| 159 |
+
#define cublasDgbmv cublasDgbmv_v2
|
| 160 |
+
#define cublasCgbmv cublasCgbmv_v2
|
| 161 |
+
#define cublasZgbmv cublasZgbmv_v2
|
| 162 |
+
|
| 163 |
+
#define cublasStrmv cublasStrmv_v2
|
| 164 |
+
#define cublasDtrmv cublasDtrmv_v2
|
| 165 |
+
#define cublasCtrmv cublasCtrmv_v2
|
| 166 |
+
#define cublasZtrmv cublasZtrmv_v2
|
| 167 |
+
|
| 168 |
+
#define cublasStbmv cublasStbmv_v2
|
| 169 |
+
#define cublasDtbmv cublasDtbmv_v2
|
| 170 |
+
#define cublasCtbmv cublasCtbmv_v2
|
| 171 |
+
#define cublasZtbmv cublasZtbmv_v2
|
| 172 |
+
|
| 173 |
+
#define cublasStpmv cublasStpmv_v2
|
| 174 |
+
#define cublasDtpmv cublasDtpmv_v2
|
| 175 |
+
#define cublasCtpmv cublasCtpmv_v2
|
| 176 |
+
#define cublasZtpmv cublasZtpmv_v2
|
| 177 |
+
|
| 178 |
+
#define cublasStrsv cublasStrsv_v2
|
| 179 |
+
#define cublasDtrsv cublasDtrsv_v2
|
| 180 |
+
#define cublasCtrsv cublasCtrsv_v2
|
| 181 |
+
#define cublasZtrsv cublasZtrsv_v2
|
| 182 |
+
|
| 183 |
+
#define cublasStpsv cublasStpsv_v2
|
| 184 |
+
#define cublasDtpsv cublasDtpsv_v2
|
| 185 |
+
#define cublasCtpsv cublasCtpsv_v2
|
| 186 |
+
#define cublasZtpsv cublasZtpsv_v2
|
| 187 |
+
|
| 188 |
+
#define cublasStbsv cublasStbsv_v2
|
| 189 |
+
#define cublasDtbsv cublasDtbsv_v2
|
| 190 |
+
#define cublasCtbsv cublasCtbsv_v2
|
| 191 |
+
#define cublasZtbsv cublasZtbsv_v2
|
| 192 |
+
|
| 193 |
+
#define cublasSsymv cublasSsymv_v2
|
| 194 |
+
#define cublasDsymv cublasDsymv_v2
|
| 195 |
+
#define cublasCsymv cublasCsymv_v2
|
| 196 |
+
#define cublasZsymv cublasZsymv_v2
|
| 197 |
+
#define cublasChemv cublasChemv_v2
|
| 198 |
+
#define cublasZhemv cublasZhemv_v2
|
| 199 |
+
|
| 200 |
+
#define cublasSsbmv cublasSsbmv_v2
|
| 201 |
+
#define cublasDsbmv cublasDsbmv_v2
|
| 202 |
+
#define cublasChbmv cublasChbmv_v2
|
| 203 |
+
#define cublasZhbmv cublasZhbmv_v2
|
| 204 |
+
|
| 205 |
+
#define cublasSspmv cublasSspmv_v2
|
| 206 |
+
#define cublasDspmv cublasDspmv_v2
|
| 207 |
+
#define cublasChpmv cublasChpmv_v2
|
| 208 |
+
#define cublasZhpmv cublasZhpmv_v2
|
| 209 |
+
|
| 210 |
+
#define cublasSger cublasSger_v2
|
| 211 |
+
#define cublasDger cublasDger_v2
|
| 212 |
+
#define cublasCgeru cublasCgeru_v2
|
| 213 |
+
#define cublasCgerc cublasCgerc_v2
|
| 214 |
+
#define cublasZgeru cublasZgeru_v2
|
| 215 |
+
#define cublasZgerc cublasZgerc_v2
|
| 216 |
+
|
| 217 |
+
#define cublasSsyr cublasSsyr_v2
|
| 218 |
+
#define cublasDsyr cublasDsyr_v2
|
| 219 |
+
#define cublasCsyr cublasCsyr_v2
|
| 220 |
+
#define cublasZsyr cublasZsyr_v2
|
| 221 |
+
#define cublasCher cublasCher_v2
|
| 222 |
+
#define cublasZher cublasZher_v2
|
| 223 |
+
|
| 224 |
+
#define cublasSspr cublasSspr_v2
|
| 225 |
+
#define cublasDspr cublasDspr_v2
|
| 226 |
+
#define cublasChpr cublasChpr_v2
|
| 227 |
+
#define cublasZhpr cublasZhpr_v2
|
| 228 |
+
|
| 229 |
+
#define cublasSsyr2 cublasSsyr2_v2
|
| 230 |
+
#define cublasDsyr2 cublasDsyr2_v2
|
| 231 |
+
#define cublasCsyr2 cublasCsyr2_v2
|
| 232 |
+
#define cublasZsyr2 cublasZsyr2_v2
|
| 233 |
+
#define cublasCher2 cublasCher2_v2
|
| 234 |
+
#define cublasZher2 cublasZher2_v2
|
| 235 |
+
|
| 236 |
+
#define cublasSspr2 cublasSspr2_v2
|
| 237 |
+
#define cublasDspr2 cublasDspr2_v2
|
| 238 |
+
#define cublasChpr2 cublasChpr2_v2
|
| 239 |
+
#define cublasZhpr2 cublasZhpr2_v2
|
| 240 |
+
|
| 241 |
+
/* Blas3 Routines */
|
| 242 |
+
|
| 243 |
+
#define cublasSgemm cublasSgemm_v2
|
| 244 |
+
#define cublasDgemm cublasDgemm_v2
|
| 245 |
+
#define cublasCgemm cublasCgemm_v2
|
| 246 |
+
#define cublasZgemm cublasZgemm_v2
|
| 247 |
+
|
| 248 |
+
#define cublasSsyrk cublasSsyrk_v2
|
| 249 |
+
#define cublasDsyrk cublasDsyrk_v2
|
| 250 |
+
#define cublasCsyrk cublasCsyrk_v2
|
| 251 |
+
#define cublasZsyrk cublasZsyrk_v2
|
| 252 |
+
#define cublasCherk cublasCherk_v2
|
| 253 |
+
#define cublasZherk cublasZherk_v2
|
| 254 |
+
|
| 255 |
+
#define cublasSsyr2k cublasSsyr2k_v2
|
| 256 |
+
#define cublasDsyr2k cublasDsyr2k_v2
|
| 257 |
+
#define cublasCsyr2k cublasCsyr2k_v2
|
| 258 |
+
#define cublasZsyr2k cublasZsyr2k_v2
|
| 259 |
+
#define cublasCher2k cublasCher2k_v2
|
| 260 |
+
#define cublasZher2k cublasZher2k_v2
|
| 261 |
+
|
| 262 |
+
#define cublasSsymm cublasSsymm_v2
|
| 263 |
+
#define cublasDsymm cublasDsymm_v2
|
| 264 |
+
#define cublasCsymm cublasCsymm_v2
|
| 265 |
+
#define cublasZsymm cublasZsymm_v2
|
| 266 |
+
#define cublasChemm cublasChemm_v2
|
| 267 |
+
#define cublasZhemm cublasZhemm_v2
|
| 268 |
+
|
| 269 |
+
#define cublasStrsm cublasStrsm_v2
|
| 270 |
+
#define cublasDtrsm cublasDtrsm_v2
|
| 271 |
+
#define cublasCtrsm cublasCtrsm_v2
|
| 272 |
+
#define cublasZtrsm cublasZtrsm_v2
|
| 273 |
+
|
| 274 |
+
#define cublasStrmm cublasStrmm_v2
|
| 275 |
+
#define cublasDtrmm cublasDtrmm_v2
|
| 276 |
+
#define cublasCtrmm cublasCtrmm_v2
|
| 277 |
+
#define cublasZtrmm cublasZtrmm_v2
|
| 278 |
+
|
| 279 |
+
/* 64-bit integer */
|
| 280 |
+
|
| 281 |
+
/* Blas1 Routines */
|
| 282 |
+
|
| 283 |
+
#define cublasSnrm2_64 cublasSnrm2_v2_64
|
| 284 |
+
#define cublasDnrm2_64 cublasDnrm2_v2_64
|
| 285 |
+
#define cublasScnrm2_64 cublasScnrm2_v2_64
|
| 286 |
+
#define cublasDznrm2_64 cublasDznrm2_v2_64
|
| 287 |
+
|
| 288 |
+
#define cublasSdot_64 cublasSdot_v2_64
|
| 289 |
+
#define cublasDdot_64 cublasDdot_v2_64
|
| 290 |
+
#define cublasCdotu_64 cublasCdotu_v2_64
|
| 291 |
+
#define cublasCdotc_64 cublasCdotc_v2_64
|
| 292 |
+
#define cublasZdotu_64 cublasZdotu_v2_64
|
| 293 |
+
#define cublasZdotc_64 cublasZdotc_v2_64
|
| 294 |
+
|
| 295 |
+
#define cublasSscal_64 cublasSscal_v2_64
|
| 296 |
+
#define cublasDscal_64 cublasDscal_v2_64
|
| 297 |
+
#define cublasCscal_64 cublasCscal_v2_64
|
| 298 |
+
#define cublasCsscal_64 cublasCsscal_v2_64
|
| 299 |
+
#define cublasZscal_64 cublasZscal_v2_64
|
| 300 |
+
#define cublasZdscal_64 cublasZdscal_v2_64
|
| 301 |
+
|
| 302 |
+
#define cublasSaxpy_64 cublasSaxpy_v2_64
|
| 303 |
+
#define cublasDaxpy_64 cublasDaxpy_v2_64
|
| 304 |
+
#define cublasCaxpy_64 cublasCaxpy_v2_64
|
| 305 |
+
#define cublasZaxpy_64 cublasZaxpy_v2_64
|
| 306 |
+
|
| 307 |
+
#define cublasScopy_64 cublasScopy_v2_64
|
| 308 |
+
#define cublasDcopy_64 cublasDcopy_v2_64
|
| 309 |
+
#define cublasCcopy_64 cublasCcopy_v2_64
|
| 310 |
+
#define cublasZcopy_64 cublasZcopy_v2_64
|
| 311 |
+
|
| 312 |
+
#define cublasSswap_64 cublasSswap_v2_64
|
| 313 |
+
#define cublasDswap_64 cublasDswap_v2_64
|
| 314 |
+
#define cublasCswap_64 cublasCswap_v2_64
|
| 315 |
+
#define cublasZswap_64 cublasZswap_v2_64
|
| 316 |
+
|
| 317 |
+
#define cublasIsamax_64 cublasIsamax_v2_64
|
| 318 |
+
#define cublasIdamax_64 cublasIdamax_v2_64
|
| 319 |
+
#define cublasIcamax_64 cublasIcamax_v2_64
|
| 320 |
+
#define cublasIzamax_64 cublasIzamax_v2_64
|
| 321 |
+
|
| 322 |
+
#define cublasIsamin_64 cublasIsamin_v2_64
|
| 323 |
+
#define cublasIdamin_64 cublasIdamin_v2_64
|
| 324 |
+
#define cublasIcamin_64 cublasIcamin_v2_64
|
| 325 |
+
#define cublasIzamin_64 cublasIzamin_v2_64
|
| 326 |
+
|
| 327 |
+
#define cublasSasum_64 cublasSasum_v2_64
|
| 328 |
+
#define cublasDasum_64 cublasDasum_v2_64
|
| 329 |
+
#define cublasScasum_64 cublasScasum_v2_64
|
| 330 |
+
#define cublasDzasum_64 cublasDzasum_v2_64
|
| 331 |
+
|
| 332 |
+
#define cublasSrot_64 cublasSrot_v2_64
|
| 333 |
+
#define cublasDrot_64 cublasDrot_v2_64
|
| 334 |
+
#define cublasCrot_64 cublasCrot_v2_64
|
| 335 |
+
#define cublasCsrot_64 cublasCsrot_v2_64
|
| 336 |
+
#define cublasZrot_64 cublasZrot_v2_64
|
| 337 |
+
#define cublasZdrot_64 cublasZdrot_v2_64
|
| 338 |
+
|
| 339 |
+
#define cublasSrotg_64 cublasSrotg_v2_64
|
| 340 |
+
#define cublasDrotg_64 cublasDrotg_v2_64
|
| 341 |
+
#define cublasCrotg_64 cublasCrotg_v2_64
|
| 342 |
+
#define cublasZrotg_64 cublasZrotg_v2_64
|
| 343 |
+
|
| 344 |
+
#define cublasSrotm_64 cublasSrotm_v2_64
|
| 345 |
+
#define cublasDrotm_64 cublasDrotm_v2_64
|
| 346 |
+
|
| 347 |
+
#define cublasSrotmg_64 cublasSrotmg_v2_64
|
| 348 |
+
#define cublasDrotmg_64 cublasDrotmg_v2_64
|
| 349 |
+
|
| 350 |
+
/* Blas2 Routines */
|
| 351 |
+
|
| 352 |
+
#define cublasSgemv_64 cublasSgemv_v2_64
|
| 353 |
+
#define cublasDgemv_64 cublasDgemv_v2_64
|
| 354 |
+
#define cublasCgemv_64 cublasCgemv_v2_64
|
| 355 |
+
#define cublasZgemv_64 cublasZgemv_v2_64
|
| 356 |
+
|
| 357 |
+
#define cublasSgbmv_64 cublasSgbmv_v2_64
|
| 358 |
+
#define cublasDgbmv_64 cublasDgbmv_v2_64
|
| 359 |
+
#define cublasCgbmv_64 cublasCgbmv_v2_64
|
| 360 |
+
#define cublasZgbmv_64 cublasZgbmv_v2_64
|
| 361 |
+
|
| 362 |
+
#define cublasStrmv_64 cublasStrmv_v2_64
|
| 363 |
+
#define cublasDtrmv_64 cublasDtrmv_v2_64
|
| 364 |
+
#define cublasCtrmv_64 cublasCtrmv_v2_64
|
| 365 |
+
#define cublasZtrmv_64 cublasZtrmv_v2_64
|
| 366 |
+
|
| 367 |
+
#define cublasStbmv_64 cublasStbmv_v2_64
|
| 368 |
+
#define cublasDtbmv_64 cublasDtbmv_v2_64
|
| 369 |
+
#define cublasCtbmv_64 cublasCtbmv_v2_64
|
| 370 |
+
#define cublasZtbmv_64 cublasZtbmv_v2_64
|
| 371 |
+
|
| 372 |
+
#define cublasStpmv_64 cublasStpmv_v2_64
|
| 373 |
+
#define cublasDtpmv_64 cublasDtpmv_v2_64
|
| 374 |
+
#define cublasCtpmv_64 cublasCtpmv_v2_64
|
| 375 |
+
#define cublasZtpmv_64 cublasZtpmv_v2_64
|
| 376 |
+
|
| 377 |
+
#define cublasStrsv_64 cublasStrsv_v2_64
|
| 378 |
+
#define cublasDtrsv_64 cublasDtrsv_v2_64
|
| 379 |
+
#define cublasCtrsv_64 cublasCtrsv_v2_64
|
| 380 |
+
#define cublasZtrsv_64 cublasZtrsv_v2_64
|
| 381 |
+
|
| 382 |
+
#define cublasStpsv_64 cublasStpsv_v2_64
|
| 383 |
+
#define cublasDtpsv_64 cublasDtpsv_v2_64
|
| 384 |
+
#define cublasCtpsv_64 cublasCtpsv_v2_64
|
| 385 |
+
#define cublasZtpsv_64 cublasZtpsv_v2_64
|
| 386 |
+
|
| 387 |
+
#define cublasStbsv_64 cublasStbsv_v2_64
|
| 388 |
+
#define cublasDtbsv_64 cublasDtbsv_v2_64
|
| 389 |
+
#define cublasCtbsv_64 cublasCtbsv_v2_64
|
| 390 |
+
#define cublasZtbsv_64 cublasZtbsv_v2_64
|
| 391 |
+
|
| 392 |
+
#define cublasSsymv_64 cublasSsymv_v2_64
|
| 393 |
+
#define cublasDsymv_64 cublasDsymv_v2_64
|
| 394 |
+
#define cublasCsymv_64 cublasCsymv_v2_64
|
| 395 |
+
#define cublasZsymv_64 cublasZsymv_v2_64
|
| 396 |
+
#define cublasChemv_64 cublasChemv_v2_64
|
| 397 |
+
#define cublasZhemv_64 cublasZhemv_v2_64
|
| 398 |
+
|
| 399 |
+
#define cublasSsbmv_64 cublasSsbmv_v2_64
|
| 400 |
+
#define cublasDsbmv_64 cublasDsbmv_v2_64
|
| 401 |
+
#define cublasChbmv_64 cublasChbmv_v2_64
|
| 402 |
+
#define cublasZhbmv_64 cublasZhbmv_v2_64
|
| 403 |
+
|
| 404 |
+
#define cublasSspmv_64 cublasSspmv_v2_64
|
| 405 |
+
#define cublasDspmv_64 cublasDspmv_v2_64
|
| 406 |
+
#define cublasChpmv_64 cublasChpmv_v2_64
|
| 407 |
+
#define cublasZhpmv_64 cublasZhpmv_v2_64
|
| 408 |
+
|
| 409 |
+
#define cublasSger_64 cublasSger_v2_64
|
| 410 |
+
#define cublasDger_64 cublasDger_v2_64
|
| 411 |
+
#define cublasCgeru_64 cublasCgeru_v2_64
|
| 412 |
+
#define cublasCgerc_64 cublasCgerc_v2_64
|
| 413 |
+
#define cublasZgeru_64 cublasZgeru_v2_64
|
| 414 |
+
#define cublasZgerc_64 cublasZgerc_v2_64
|
| 415 |
+
|
| 416 |
+
#define cublasSsyr_64 cublasSsyr_v2_64
|
| 417 |
+
#define cublasDsyr_64 cublasDsyr_v2_64
|
| 418 |
+
#define cublasCsyr_64 cublasCsyr_v2_64
|
| 419 |
+
#define cublasZsyr_64 cublasZsyr_v2_64
|
| 420 |
+
#define cublasCher_64 cublasCher_v2_64
|
| 421 |
+
#define cublasZher_64 cublasZher_v2_64
|
| 422 |
+
|
| 423 |
+
#define cublasSspr_64 cublasSspr_v2_64
|
| 424 |
+
#define cublasDspr_64 cublasDspr_v2_64
|
| 425 |
+
#define cublasChpr_64 cublasChpr_v2_64
|
| 426 |
+
#define cublasZhpr_64 cublasZhpr_v2_64
|
| 427 |
+
|
| 428 |
+
#define cublasSsyr2_64 cublasSsyr2_v2_64
|
| 429 |
+
#define cublasDsyr2_64 cublasDsyr2_v2_64
|
| 430 |
+
#define cublasCsyr2_64 cublasCsyr2_v2_64
|
| 431 |
+
#define cublasZsyr2_64 cublasZsyr2_v2_64
|
| 432 |
+
#define cublasCher2_64 cublasCher2_v2_64
|
| 433 |
+
#define cublasZher2_64 cublasZher2_v2_64
|
| 434 |
+
|
| 435 |
+
#define cublasSspr2_64 cublasSspr2_v2_64
|
| 436 |
+
#define cublasDspr2_64 cublasDspr2_v2_64
|
| 437 |
+
#define cublasChpr2_64 cublasChpr2_v2_64
|
| 438 |
+
#define cublasZhpr2_64 cublasZhpr2_v2_64
|
| 439 |
+
|
| 440 |
+
/* Blas3 Routines */
|
| 441 |
+
|
| 442 |
+
#define cublasSgemm_64 cublasSgemm_v2_64
|
| 443 |
+
#define cublasDgemm_64 cublasDgemm_v2_64
|
| 444 |
+
#define cublasCgemm_64 cublasCgemm_v2_64
|
| 445 |
+
#define cublasZgemm_64 cublasZgemm_v2_64
|
| 446 |
+
|
| 447 |
+
#define cublasSsyrk_64 cublasSsyrk_v2_64
|
| 448 |
+
#define cublasDsyrk_64 cublasDsyrk_v2_64
|
| 449 |
+
#define cublasCsyrk_64 cublasCsyrk_v2_64
|
| 450 |
+
#define cublasZsyrk_64 cublasZsyrk_v2_64
|
| 451 |
+
#define cublasCherk_64 cublasCherk_v2_64
|
| 452 |
+
#define cublasZherk_64 cublasZherk_v2_64
|
| 453 |
+
|
| 454 |
+
#define cublasSsyr2k_64 cublasSsyr2k_v2_64
|
| 455 |
+
#define cublasDsyr2k_64 cublasDsyr2k_v2_64
|
| 456 |
+
#define cublasCsyr2k_64 cublasCsyr2k_v2_64
|
| 457 |
+
#define cublasZsyr2k_64 cublasZsyr2k_v2_64
|
| 458 |
+
#define cublasCher2k_64 cublasCher2k_v2_64
|
| 459 |
+
#define cublasZher2k_64 cublasZher2k_v2_64
|
| 460 |
+
|
| 461 |
+
#define cublasSsymm_64 cublasSsymm_v2_64
|
| 462 |
+
#define cublasDsymm_64 cublasDsymm_v2_64
|
| 463 |
+
#define cublasCsymm_64 cublasCsymm_v2_64
|
| 464 |
+
#define cublasZsymm_64 cublasZsymm_v2_64
|
| 465 |
+
#define cublasChemm_64 cublasChemm_v2_64
|
| 466 |
+
#define cublasZhemm_64 cublasZhemm_v2_64
|
| 467 |
+
|
| 468 |
+
#define cublasStrsm_64 cublasStrsm_v2_64
|
| 469 |
+
#define cublasDtrsm_64 cublasDtrsm_v2_64
|
| 470 |
+
#define cublasCtrsm_64 cublasCtrsm_v2_64
|
| 471 |
+
#define cublasZtrsm_64 cublasZtrsm_v2_64
|
| 472 |
+
|
| 473 |
+
#define cublasStrmm_64 cublasStrmm_v2_64
|
| 474 |
+
#define cublasDtrmm_64 cublasDtrmm_v2_64
|
| 475 |
+
#define cublasCtrmm_64 cublasCtrmm_v2_64
|
| 476 |
+
#define cublasZtrmm_64 cublasZtrmm_v2_64
|
| 477 |
+
|
| 478 |
+
#endif /* !defined(CUBLAS_V2_H_) */
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/nvblas.h
ADDED
|
@@ -0,0 +1,824 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(NVBLAS_H_)
|
| 51 |
+
#define NVBLAS_H_
|
| 52 |
+
|
| 53 |
+
#include "driver_types.h"
|
| 54 |
+
#include "cuComplex.h" /* import complex data type */
|
| 55 |
+
|
| 56 |
+
#if defined(__cplusplus)
|
| 57 |
+
extern "C" {
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
/* GEMM */
|
| 61 |
+
void sgemm_(const char* transa,
|
| 62 |
+
const char* transb,
|
| 63 |
+
const int* m,
|
| 64 |
+
const int* n,
|
| 65 |
+
const int* k,
|
| 66 |
+
const float* alpha,
|
| 67 |
+
const float* a,
|
| 68 |
+
const int* lda,
|
| 69 |
+
const float* b,
|
| 70 |
+
const int* ldb,
|
| 71 |
+
const float* beta,
|
| 72 |
+
float* c,
|
| 73 |
+
const int* ldc);
|
| 74 |
+
|
| 75 |
+
void dgemm_(const char* transa,
|
| 76 |
+
const char* transb,
|
| 77 |
+
const int* m,
|
| 78 |
+
const int* n,
|
| 79 |
+
const int* k,
|
| 80 |
+
const double* alpha,
|
| 81 |
+
const double* a,
|
| 82 |
+
const int* lda,
|
| 83 |
+
const double* b,
|
| 84 |
+
const int* ldb,
|
| 85 |
+
const double* beta,
|
| 86 |
+
double* c,
|
| 87 |
+
const int* ldc);
|
| 88 |
+
|
| 89 |
+
void cgemm_(const char* transa,
|
| 90 |
+
const char* transb,
|
| 91 |
+
const int* m,
|
| 92 |
+
const int* n,
|
| 93 |
+
const int* k,
|
| 94 |
+
const cuComplex* alpha,
|
| 95 |
+
const cuComplex* a,
|
| 96 |
+
const int* lda,
|
| 97 |
+
const cuComplex* b,
|
| 98 |
+
const int* ldb,
|
| 99 |
+
const cuComplex* beta,
|
| 100 |
+
cuComplex* c,
|
| 101 |
+
const int* ldc);
|
| 102 |
+
|
| 103 |
+
void zgemm_(const char* transa,
|
| 104 |
+
const char* transb,
|
| 105 |
+
const int* m,
|
| 106 |
+
const int* n,
|
| 107 |
+
const int* k,
|
| 108 |
+
const cuDoubleComplex* alpha,
|
| 109 |
+
const cuDoubleComplex* a,
|
| 110 |
+
const int* lda,
|
| 111 |
+
const cuDoubleComplex* b,
|
| 112 |
+
const int* ldb,
|
| 113 |
+
const cuDoubleComplex* beta,
|
| 114 |
+
cuDoubleComplex* c,
|
| 115 |
+
const int* ldc);
|
| 116 |
+
|
| 117 |
+
void sgemm(const char* transa,
|
| 118 |
+
const char* transb,
|
| 119 |
+
const int* m,
|
| 120 |
+
const int* n,
|
| 121 |
+
const int* k,
|
| 122 |
+
const float* alpha,
|
| 123 |
+
const float* a,
|
| 124 |
+
const int* lda,
|
| 125 |
+
const float* b,
|
| 126 |
+
const int* ldb,
|
| 127 |
+
const float* beta,
|
| 128 |
+
float* c,
|
| 129 |
+
const int* ldc);
|
| 130 |
+
|
| 131 |
+
void dgemm(const char* transa,
|
| 132 |
+
const char* transb,
|
| 133 |
+
const int* m,
|
| 134 |
+
const int* n,
|
| 135 |
+
const int* k,
|
| 136 |
+
const double* alpha,
|
| 137 |
+
const double* a,
|
| 138 |
+
const int* lda,
|
| 139 |
+
const double* b,
|
| 140 |
+
const int* ldb,
|
| 141 |
+
const double* beta,
|
| 142 |
+
double* c,
|
| 143 |
+
const int* ldc);
|
| 144 |
+
|
| 145 |
+
void cgemm(const char* transa,
|
| 146 |
+
const char* transb,
|
| 147 |
+
const int* m,
|
| 148 |
+
const int* n,
|
| 149 |
+
const int* k,
|
| 150 |
+
const cuComplex* alpha,
|
| 151 |
+
const cuComplex* a,
|
| 152 |
+
const int* lda,
|
| 153 |
+
const cuComplex* b,
|
| 154 |
+
const int* ldb,
|
| 155 |
+
const cuComplex* beta,
|
| 156 |
+
cuComplex* c,
|
| 157 |
+
const int* ldc);
|
| 158 |
+
|
| 159 |
+
void zgemm(const char* transa,
|
| 160 |
+
const char* transb,
|
| 161 |
+
const int* m,
|
| 162 |
+
const int* n,
|
| 163 |
+
const int* k,
|
| 164 |
+
const cuDoubleComplex* alpha,
|
| 165 |
+
const cuDoubleComplex* a,
|
| 166 |
+
const int* lda,
|
| 167 |
+
const cuDoubleComplex* b,
|
| 168 |
+
const int* ldb,
|
| 169 |
+
const cuDoubleComplex* beta,
|
| 170 |
+
cuDoubleComplex* c,
|
| 171 |
+
const int* ldc);
|
| 172 |
+
|
| 173 |
+
/* SYRK */
|
| 174 |
+
void ssyrk_(const char* uplo,
|
| 175 |
+
const char* trans,
|
| 176 |
+
const int* n,
|
| 177 |
+
const int* k,
|
| 178 |
+
const float* alpha,
|
| 179 |
+
const float* a,
|
| 180 |
+
const int* lda,
|
| 181 |
+
const float* beta,
|
| 182 |
+
float* c,
|
| 183 |
+
const int* ldc);
|
| 184 |
+
|
| 185 |
+
void dsyrk_(const char* uplo,
|
| 186 |
+
const char* trans,
|
| 187 |
+
const int* n,
|
| 188 |
+
const int* k,
|
| 189 |
+
const double* alpha,
|
| 190 |
+
const double* a,
|
| 191 |
+
const int* lda,
|
| 192 |
+
const double* beta,
|
| 193 |
+
double* c,
|
| 194 |
+
const int* ldc);
|
| 195 |
+
|
| 196 |
+
void csyrk_(const char* uplo,
|
| 197 |
+
const char* trans,
|
| 198 |
+
const int* n,
|
| 199 |
+
const int* k,
|
| 200 |
+
const cuComplex* alpha,
|
| 201 |
+
const cuComplex* a,
|
| 202 |
+
const int* lda,
|
| 203 |
+
const cuComplex* beta,
|
| 204 |
+
cuComplex* c,
|
| 205 |
+
const int* ldc);
|
| 206 |
+
|
| 207 |
+
void zsyrk_(const char* uplo,
|
| 208 |
+
const char* trans,
|
| 209 |
+
const int* n,
|
| 210 |
+
const int* k,
|
| 211 |
+
const cuDoubleComplex* alpha,
|
| 212 |
+
const cuDoubleComplex* a,
|
| 213 |
+
const int* lda,
|
| 214 |
+
const cuDoubleComplex* beta,
|
| 215 |
+
cuDoubleComplex* c,
|
| 216 |
+
const int* ldc);
|
| 217 |
+
|
| 218 |
+
void ssyrk(const char* uplo,
|
| 219 |
+
const char* trans,
|
| 220 |
+
const int* n,
|
| 221 |
+
const int* k,
|
| 222 |
+
const float* alpha,
|
| 223 |
+
const float* a,
|
| 224 |
+
const int* lda,
|
| 225 |
+
const float* beta,
|
| 226 |
+
float* c,
|
| 227 |
+
const int* ldc);
|
| 228 |
+
|
| 229 |
+
void dsyrk(const char* uplo,
|
| 230 |
+
const char* trans,
|
| 231 |
+
const int* n,
|
| 232 |
+
const int* k,
|
| 233 |
+
const double* alpha,
|
| 234 |
+
const double* a,
|
| 235 |
+
const int* lda,
|
| 236 |
+
const double* beta,
|
| 237 |
+
double* c,
|
| 238 |
+
const int* ldc);
|
| 239 |
+
|
| 240 |
+
void csyrk(const char* uplo,
|
| 241 |
+
const char* trans,
|
| 242 |
+
const int* n,
|
| 243 |
+
const int* k,
|
| 244 |
+
const cuComplex* alpha,
|
| 245 |
+
const cuComplex* a,
|
| 246 |
+
const int* lda,
|
| 247 |
+
const cuComplex* beta,
|
| 248 |
+
cuComplex* c,
|
| 249 |
+
const int* ldc);
|
| 250 |
+
|
| 251 |
+
void zsyrk(const char* uplo,
|
| 252 |
+
const char* trans,
|
| 253 |
+
const int* n,
|
| 254 |
+
const int* k,
|
| 255 |
+
const cuDoubleComplex* alpha,
|
| 256 |
+
const cuDoubleComplex* a,
|
| 257 |
+
const int* lda,
|
| 258 |
+
const cuDoubleComplex* beta,
|
| 259 |
+
cuDoubleComplex* c,
|
| 260 |
+
const int* ldc);
|
| 261 |
+
|
| 262 |
+
/* HERK */
|
| 263 |
+
void cherk_(const char* uplo,
|
| 264 |
+
const char* trans,
|
| 265 |
+
const int* n,
|
| 266 |
+
const int* k,
|
| 267 |
+
const float* alpha,
|
| 268 |
+
const cuComplex* a,
|
| 269 |
+
const int* lda,
|
| 270 |
+
const float* beta,
|
| 271 |
+
cuComplex* c,
|
| 272 |
+
const int* ldc);
|
| 273 |
+
|
| 274 |
+
void zherk_(const char* uplo,
|
| 275 |
+
const char* trans,
|
| 276 |
+
const int* n,
|
| 277 |
+
const int* k,
|
| 278 |
+
const double* alpha,
|
| 279 |
+
const cuDoubleComplex* a,
|
| 280 |
+
const int* lda,
|
| 281 |
+
const double* beta,
|
| 282 |
+
cuDoubleComplex* c,
|
| 283 |
+
const int* ldc);
|
| 284 |
+
|
| 285 |
+
void cherk(const char* uplo,
|
| 286 |
+
const char* trans,
|
| 287 |
+
const int* n,
|
| 288 |
+
const int* k,
|
| 289 |
+
const float* alpha,
|
| 290 |
+
const cuComplex* a,
|
| 291 |
+
const int* lda,
|
| 292 |
+
const float* beta,
|
| 293 |
+
cuComplex* c,
|
| 294 |
+
const int* ldc);
|
| 295 |
+
|
| 296 |
+
void zherk(const char* uplo,
|
| 297 |
+
const char* trans,
|
| 298 |
+
const int* n,
|
| 299 |
+
const int* k,
|
| 300 |
+
const double* alpha,
|
| 301 |
+
const cuDoubleComplex* a,
|
| 302 |
+
const int* lda,
|
| 303 |
+
const double* beta,
|
| 304 |
+
cuDoubleComplex* c,
|
| 305 |
+
const int* ldc);
|
| 306 |
+
|
| 307 |
+
/* TRSM */
|
| 308 |
+
void strsm_(const char* side,
|
| 309 |
+
const char* uplo,
|
| 310 |
+
const char* transa,
|
| 311 |
+
const char* diag,
|
| 312 |
+
const int* m,
|
| 313 |
+
const int* n,
|
| 314 |
+
const float* alpha,
|
| 315 |
+
const float* a,
|
| 316 |
+
const int* lda,
|
| 317 |
+
float* b,
|
| 318 |
+
const int* ldb);
|
| 319 |
+
|
| 320 |
+
void dtrsm_(const char* side,
|
| 321 |
+
const char* uplo,
|
| 322 |
+
const char* transa,
|
| 323 |
+
const char* diag,
|
| 324 |
+
const int* m,
|
| 325 |
+
const int* n,
|
| 326 |
+
const double* alpha,
|
| 327 |
+
const double* a,
|
| 328 |
+
const int* lda,
|
| 329 |
+
double* b,
|
| 330 |
+
const int* ldb);
|
| 331 |
+
|
| 332 |
+
void ctrsm_(const char* side,
|
| 333 |
+
const char* uplo,
|
| 334 |
+
const char* transa,
|
| 335 |
+
const char* diag,
|
| 336 |
+
const int* m,
|
| 337 |
+
const int* n,
|
| 338 |
+
const cuComplex* alpha,
|
| 339 |
+
const cuComplex* a,
|
| 340 |
+
const int* lda,
|
| 341 |
+
cuComplex* b,
|
| 342 |
+
const int* ldb);
|
| 343 |
+
|
| 344 |
+
void ztrsm_(const char* side,
|
| 345 |
+
const char* uplo,
|
| 346 |
+
const char* transa,
|
| 347 |
+
const char* diag,
|
| 348 |
+
const int* m,
|
| 349 |
+
const int* n,
|
| 350 |
+
const cuDoubleComplex* alpha,
|
| 351 |
+
const cuDoubleComplex* a,
|
| 352 |
+
const int* lda,
|
| 353 |
+
cuDoubleComplex* b,
|
| 354 |
+
const int* ldb);
|
| 355 |
+
|
| 356 |
+
void strsm(const char* side,
|
| 357 |
+
const char* uplo,
|
| 358 |
+
const char* transa,
|
| 359 |
+
const char* diag,
|
| 360 |
+
const int* m,
|
| 361 |
+
const int* n,
|
| 362 |
+
const float* alpha,
|
| 363 |
+
const float* a,
|
| 364 |
+
const int* lda,
|
| 365 |
+
float* b,
|
| 366 |
+
const int* ldb);
|
| 367 |
+
|
| 368 |
+
void dtrsm(const char* side,
|
| 369 |
+
const char* uplo,
|
| 370 |
+
const char* transa,
|
| 371 |
+
const char* diag,
|
| 372 |
+
const int* m,
|
| 373 |
+
const int* n,
|
| 374 |
+
const double* alpha,
|
| 375 |
+
const double* a,
|
| 376 |
+
const int* lda,
|
| 377 |
+
double* b,
|
| 378 |
+
const int* ldb);
|
| 379 |
+
|
| 380 |
+
void ctrsm(const char* side,
|
| 381 |
+
const char* uplo,
|
| 382 |
+
const char* transa,
|
| 383 |
+
const char* diag,
|
| 384 |
+
const int* m,
|
| 385 |
+
const int* n,
|
| 386 |
+
const cuComplex* alpha,
|
| 387 |
+
const cuComplex* a,
|
| 388 |
+
const int* lda,
|
| 389 |
+
cuComplex* b,
|
| 390 |
+
const int* ldb);
|
| 391 |
+
|
| 392 |
+
void ztrsm(const char* side,
|
| 393 |
+
const char* uplo,
|
| 394 |
+
const char* transa,
|
| 395 |
+
const char* diag,
|
| 396 |
+
const int* m,
|
| 397 |
+
const int* n,
|
| 398 |
+
const cuDoubleComplex* alpha,
|
| 399 |
+
const cuDoubleComplex* a,
|
| 400 |
+
const int* lda,
|
| 401 |
+
cuDoubleComplex* b,
|
| 402 |
+
const int* ldb);
|
| 403 |
+
|
| 404 |
+
/* SYMM */
|
| 405 |
+
void ssymm_(const char* side,
|
| 406 |
+
const char* uplo,
|
| 407 |
+
const int* m,
|
| 408 |
+
const int* n,
|
| 409 |
+
const float* alpha,
|
| 410 |
+
const float* a,
|
| 411 |
+
const int* lda,
|
| 412 |
+
const float* b,
|
| 413 |
+
const int* ldb,
|
| 414 |
+
const float* beta,
|
| 415 |
+
float* c,
|
| 416 |
+
const int* ldc);
|
| 417 |
+
|
| 418 |
+
void dsymm_(const char* side,
|
| 419 |
+
const char* uplo,
|
| 420 |
+
const int* m,
|
| 421 |
+
const int* n,
|
| 422 |
+
const double* alpha,
|
| 423 |
+
const double* a,
|
| 424 |
+
const int* lda,
|
| 425 |
+
const double* b,
|
| 426 |
+
const int* ldb,
|
| 427 |
+
const double* beta,
|
| 428 |
+
double* c,
|
| 429 |
+
const int* ldc);
|
| 430 |
+
|
| 431 |
+
void csymm_(const char* side,
|
| 432 |
+
const char* uplo,
|
| 433 |
+
const int* m,
|
| 434 |
+
const int* n,
|
| 435 |
+
const cuComplex* alpha,
|
| 436 |
+
const cuComplex* a,
|
| 437 |
+
const int* lda,
|
| 438 |
+
const cuComplex* b,
|
| 439 |
+
const int* ldb,
|
| 440 |
+
const cuComplex* beta,
|
| 441 |
+
cuComplex* c,
|
| 442 |
+
const int* ldc);
|
| 443 |
+
|
| 444 |
+
void zsymm_(const char* side,
|
| 445 |
+
const char* uplo,
|
| 446 |
+
const int* m,
|
| 447 |
+
const int* n,
|
| 448 |
+
const cuDoubleComplex* alpha,
|
| 449 |
+
const cuDoubleComplex* a,
|
| 450 |
+
const int* lda,
|
| 451 |
+
const cuDoubleComplex* b,
|
| 452 |
+
const int* ldb,
|
| 453 |
+
const cuDoubleComplex* beta,
|
| 454 |
+
cuDoubleComplex* c,
|
| 455 |
+
const int* ldc);
|
| 456 |
+
|
| 457 |
+
void ssymm(const char* side,
|
| 458 |
+
const char* uplo,
|
| 459 |
+
const int* m,
|
| 460 |
+
const int* n,
|
| 461 |
+
const float* alpha,
|
| 462 |
+
const float* a,
|
| 463 |
+
const int* lda,
|
| 464 |
+
const float* b,
|
| 465 |
+
const int* ldb,
|
| 466 |
+
const float* beta,
|
| 467 |
+
float* c,
|
| 468 |
+
const int* ldc);
|
| 469 |
+
|
| 470 |
+
void dsymm(const char* side,
|
| 471 |
+
const char* uplo,
|
| 472 |
+
const int* m,
|
| 473 |
+
const int* n,
|
| 474 |
+
const double* alpha,
|
| 475 |
+
const double* a,
|
| 476 |
+
const int* lda,
|
| 477 |
+
const double* b,
|
| 478 |
+
const int* ldb,
|
| 479 |
+
const double* beta,
|
| 480 |
+
double* c,
|
| 481 |
+
const int* ldc);
|
| 482 |
+
|
| 483 |
+
void csymm(const char* side,
|
| 484 |
+
const char* uplo,
|
| 485 |
+
const int* m,
|
| 486 |
+
const int* n,
|
| 487 |
+
const cuComplex* alpha,
|
| 488 |
+
const cuComplex* a,
|
| 489 |
+
const int* lda,
|
| 490 |
+
const cuComplex* b,
|
| 491 |
+
const int* ldb,
|
| 492 |
+
const cuComplex* beta,
|
| 493 |
+
cuComplex* c,
|
| 494 |
+
const int* ldc);
|
| 495 |
+
|
| 496 |
+
void zsymm(const char* side,
|
| 497 |
+
const char* uplo,
|
| 498 |
+
const int* m,
|
| 499 |
+
const int* n,
|
| 500 |
+
const cuDoubleComplex* alpha,
|
| 501 |
+
const cuDoubleComplex* a,
|
| 502 |
+
const int* lda,
|
| 503 |
+
const cuDoubleComplex* b,
|
| 504 |
+
const int* ldb,
|
| 505 |
+
const cuDoubleComplex* beta,
|
| 506 |
+
cuDoubleComplex* c,
|
| 507 |
+
const int* ldc);
|
| 508 |
+
|
| 509 |
+
/* HEMM */
|
| 510 |
+
void chemm_(const char* side,
|
| 511 |
+
const char* uplo,
|
| 512 |
+
const int* m,
|
| 513 |
+
const int* n,
|
| 514 |
+
const cuComplex* alpha,
|
| 515 |
+
const cuComplex* a,
|
| 516 |
+
const int* lda,
|
| 517 |
+
const cuComplex* b,
|
| 518 |
+
const int* ldb,
|
| 519 |
+
const cuComplex* beta,
|
| 520 |
+
cuComplex* c,
|
| 521 |
+
const int* ldc);
|
| 522 |
+
|
| 523 |
+
void zhemm_(const char* side,
|
| 524 |
+
const char* uplo,
|
| 525 |
+
const int* m,
|
| 526 |
+
const int* n,
|
| 527 |
+
const cuDoubleComplex* alpha,
|
| 528 |
+
const cuDoubleComplex* a,
|
| 529 |
+
const int* lda,
|
| 530 |
+
const cuDoubleComplex* b,
|
| 531 |
+
const int* ldb,
|
| 532 |
+
const cuDoubleComplex* beta,
|
| 533 |
+
cuDoubleComplex* c,
|
| 534 |
+
const int* ldc);
|
| 535 |
+
|
| 536 |
+
/* HEMM with no underscore*/
|
| 537 |
+
void chemm(const char* side,
|
| 538 |
+
const char* uplo,
|
| 539 |
+
const int* m,
|
| 540 |
+
const int* n,
|
| 541 |
+
const cuComplex* alpha,
|
| 542 |
+
const cuComplex* a,
|
| 543 |
+
const int* lda,
|
| 544 |
+
const cuComplex* b,
|
| 545 |
+
const int* ldb,
|
| 546 |
+
const cuComplex* beta,
|
| 547 |
+
cuComplex* c,
|
| 548 |
+
const int* ldc);
|
| 549 |
+
|
| 550 |
+
void zhemm(const char* side,
|
| 551 |
+
const char* uplo,
|
| 552 |
+
const int* m,
|
| 553 |
+
const int* n,
|
| 554 |
+
const cuDoubleComplex* alpha,
|
| 555 |
+
const cuDoubleComplex* a,
|
| 556 |
+
const int* lda,
|
| 557 |
+
const cuDoubleComplex* b,
|
| 558 |
+
const int* ldb,
|
| 559 |
+
const cuDoubleComplex* beta,
|
| 560 |
+
cuDoubleComplex* c,
|
| 561 |
+
const int* ldc);
|
| 562 |
+
|
| 563 |
+
/* SYR2K */
|
| 564 |
+
void ssyr2k_(const char* uplo,
|
| 565 |
+
const char* trans,
|
| 566 |
+
const int* n,
|
| 567 |
+
const int* k,
|
| 568 |
+
const float* alpha,
|
| 569 |
+
const float* a,
|
| 570 |
+
const int* lda,
|
| 571 |
+
const float* b,
|
| 572 |
+
const int* ldb,
|
| 573 |
+
const float* beta,
|
| 574 |
+
float* c,
|
| 575 |
+
const int* ldc);
|
| 576 |
+
|
| 577 |
+
void dsyr2k_(const char* uplo,
|
| 578 |
+
const char* trans,
|
| 579 |
+
const int* n,
|
| 580 |
+
const int* k,
|
| 581 |
+
const double* alpha,
|
| 582 |
+
const double* a,
|
| 583 |
+
const int* lda,
|
| 584 |
+
const double* b,
|
| 585 |
+
const int* ldb,
|
| 586 |
+
const double* beta,
|
| 587 |
+
double* c,
|
| 588 |
+
const int* ldc);
|
| 589 |
+
|
| 590 |
+
void csyr2k_(const char* uplo,
|
| 591 |
+
const char* trans,
|
| 592 |
+
const int* n,
|
| 593 |
+
const int* k,
|
| 594 |
+
const cuComplex* alpha,
|
| 595 |
+
const cuComplex* a,
|
| 596 |
+
const int* lda,
|
| 597 |
+
const cuComplex* b,
|
| 598 |
+
const int* ldb,
|
| 599 |
+
const cuComplex* beta,
|
| 600 |
+
cuComplex* c,
|
| 601 |
+
const int* ldc);
|
| 602 |
+
|
| 603 |
+
void zsyr2k_(const char* uplo,
|
| 604 |
+
const char* trans,
|
| 605 |
+
const int* n,
|
| 606 |
+
const int* k,
|
| 607 |
+
const cuDoubleComplex* alpha,
|
| 608 |
+
const cuDoubleComplex* a,
|
| 609 |
+
const int* lda,
|
| 610 |
+
const cuDoubleComplex* b,
|
| 611 |
+
const int* ldb,
|
| 612 |
+
const cuDoubleComplex* beta,
|
| 613 |
+
cuDoubleComplex* c,
|
| 614 |
+
const int* ldc);
|
| 615 |
+
|
| 616 |
+
/* SYR2K with no underscore */
|
| 617 |
+
void ssyr2k(const char* uplo,
|
| 618 |
+
const char* trans,
|
| 619 |
+
const int* n,
|
| 620 |
+
const int* k,
|
| 621 |
+
const float* alpha,
|
| 622 |
+
const float* a,
|
| 623 |
+
const int* lda,
|
| 624 |
+
const float* b,
|
| 625 |
+
const int* ldb,
|
| 626 |
+
const float* beta,
|
| 627 |
+
float* c,
|
| 628 |
+
const int* ldc);
|
| 629 |
+
|
| 630 |
+
void dsyr2k(const char* uplo,
|
| 631 |
+
const char* trans,
|
| 632 |
+
const int* n,
|
| 633 |
+
const int* k,
|
| 634 |
+
const double* alpha,
|
| 635 |
+
const double* a,
|
| 636 |
+
const int* lda,
|
| 637 |
+
const double* b,
|
| 638 |
+
const int* ldb,
|
| 639 |
+
const double* beta,
|
| 640 |
+
double* c,
|
| 641 |
+
const int* ldc);
|
| 642 |
+
|
| 643 |
+
void csyr2k(const char* uplo,
|
| 644 |
+
const char* trans,
|
| 645 |
+
const int* n,
|
| 646 |
+
const int* k,
|
| 647 |
+
const cuComplex* alpha,
|
| 648 |
+
const cuComplex* a,
|
| 649 |
+
const int* lda,
|
| 650 |
+
const cuComplex* b,
|
| 651 |
+
const int* ldb,
|
| 652 |
+
const cuComplex* beta,
|
| 653 |
+
cuComplex* c,
|
| 654 |
+
const int* ldc);
|
| 655 |
+
|
| 656 |
+
void zsyr2k(const char* uplo,
|
| 657 |
+
const char* trans,
|
| 658 |
+
const int* n,
|
| 659 |
+
const int* k,
|
| 660 |
+
const cuDoubleComplex* alpha,
|
| 661 |
+
const cuDoubleComplex* a,
|
| 662 |
+
const int* lda,
|
| 663 |
+
const cuDoubleComplex* b,
|
| 664 |
+
const int* ldb,
|
| 665 |
+
const cuDoubleComplex* beta,
|
| 666 |
+
cuDoubleComplex* c,
|
| 667 |
+
const int* ldc);
|
| 668 |
+
|
| 669 |
+
/* HER2K */
|
| 670 |
+
void cher2k_(const char* uplo,
|
| 671 |
+
const char* trans,
|
| 672 |
+
const int* n,
|
| 673 |
+
const int* k,
|
| 674 |
+
const cuComplex* alpha,
|
| 675 |
+
const cuComplex* a,
|
| 676 |
+
const int* lda,
|
| 677 |
+
const cuComplex* b,
|
| 678 |
+
const int* ldb,
|
| 679 |
+
const float* beta,
|
| 680 |
+
cuComplex* c,
|
| 681 |
+
const int* ldc);
|
| 682 |
+
|
| 683 |
+
void zher2k_(const char* uplo,
|
| 684 |
+
const char* trans,
|
| 685 |
+
const int* n,
|
| 686 |
+
const int* k,
|
| 687 |
+
const cuDoubleComplex* alpha,
|
| 688 |
+
const cuDoubleComplex* a,
|
| 689 |
+
const int* lda,
|
| 690 |
+
const cuDoubleComplex* b,
|
| 691 |
+
const int* ldb,
|
| 692 |
+
const double* beta,
|
| 693 |
+
cuDoubleComplex* c,
|
| 694 |
+
const int* ldc);
|
| 695 |
+
|
| 696 |
+
/* HER2K with no underscore */
|
| 697 |
+
void cher2k(const char* uplo,
|
| 698 |
+
const char* trans,
|
| 699 |
+
const int* n,
|
| 700 |
+
const int* k,
|
| 701 |
+
const cuComplex* alpha,
|
| 702 |
+
const cuComplex* a,
|
| 703 |
+
const int* lda,
|
| 704 |
+
const cuComplex* b,
|
| 705 |
+
const int* ldb,
|
| 706 |
+
const float* beta,
|
| 707 |
+
cuComplex* c,
|
| 708 |
+
const int* ldc);
|
| 709 |
+
|
| 710 |
+
void zher2k(const char* uplo,
|
| 711 |
+
const char* trans,
|
| 712 |
+
const int* n,
|
| 713 |
+
const int* k,
|
| 714 |
+
const cuDoubleComplex* alpha,
|
| 715 |
+
const cuDoubleComplex* a,
|
| 716 |
+
const int* lda,
|
| 717 |
+
const cuDoubleComplex* b,
|
| 718 |
+
const int* ldb,
|
| 719 |
+
const double* beta,
|
| 720 |
+
cuDoubleComplex* c,
|
| 721 |
+
const int* ldc);
|
| 722 |
+
|
| 723 |
+
/* TRMM */
|
| 724 |
+
void strmm_(const char* side,
|
| 725 |
+
const char* uplo,
|
| 726 |
+
const char* transa,
|
| 727 |
+
const char* diag,
|
| 728 |
+
const int* m,
|
| 729 |
+
const int* n,
|
| 730 |
+
const float* alpha,
|
| 731 |
+
const float* a,
|
| 732 |
+
const int* lda,
|
| 733 |
+
float* b,
|
| 734 |
+
const int* ldb);
|
| 735 |
+
|
| 736 |
+
void dtrmm_(const char* side,
|
| 737 |
+
const char* uplo,
|
| 738 |
+
const char* transa,
|
| 739 |
+
const char* diag,
|
| 740 |
+
const int* m,
|
| 741 |
+
const int* n,
|
| 742 |
+
const double* alpha,
|
| 743 |
+
const double* a,
|
| 744 |
+
const int* lda,
|
| 745 |
+
double* b,
|
| 746 |
+
const int* ldb);
|
| 747 |
+
|
| 748 |
+
void ctrmm_(const char* side,
|
| 749 |
+
const char* uplo,
|
| 750 |
+
const char* transa,
|
| 751 |
+
const char* diag,
|
| 752 |
+
const int* m,
|
| 753 |
+
const int* n,
|
| 754 |
+
const cuComplex* alpha,
|
| 755 |
+
const cuComplex* a,
|
| 756 |
+
const int* lda,
|
| 757 |
+
cuComplex* b,
|
| 758 |
+
const int* ldb);
|
| 759 |
+
|
| 760 |
+
void ztrmm_(const char* side,
|
| 761 |
+
const char* uplo,
|
| 762 |
+
const char* transa,
|
| 763 |
+
const char* diag,
|
| 764 |
+
const int* m,
|
| 765 |
+
const int* n,
|
| 766 |
+
const cuDoubleComplex* alpha,
|
| 767 |
+
const cuDoubleComplex* a,
|
| 768 |
+
const int* lda,
|
| 769 |
+
cuDoubleComplex* b,
|
| 770 |
+
const int* ldb);
|
| 771 |
+
|
| 772 |
+
void strmm(const char* side,
|
| 773 |
+
const char* uplo,
|
| 774 |
+
const char* transa,
|
| 775 |
+
const char* diag,
|
| 776 |
+
const int* m,
|
| 777 |
+
const int* n,
|
| 778 |
+
const float* alpha,
|
| 779 |
+
const float* a,
|
| 780 |
+
const int* lda,
|
| 781 |
+
float* b,
|
| 782 |
+
const int* ldb);
|
| 783 |
+
|
| 784 |
+
void dtrmm(const char* side,
|
| 785 |
+
const char* uplo,
|
| 786 |
+
const char* transa,
|
| 787 |
+
const char* diag,
|
| 788 |
+
const int* m,
|
| 789 |
+
const int* n,
|
| 790 |
+
const double* alpha,
|
| 791 |
+
const double* a,
|
| 792 |
+
const int* lda,
|
| 793 |
+
double* b,
|
| 794 |
+
const int* ldb);
|
| 795 |
+
|
| 796 |
+
void ctrmm(const char* side,
|
| 797 |
+
const char* uplo,
|
| 798 |
+
const char* transa,
|
| 799 |
+
const char* diag,
|
| 800 |
+
const int* m,
|
| 801 |
+
const int* n,
|
| 802 |
+
const cuComplex* alpha,
|
| 803 |
+
const cuComplex* a,
|
| 804 |
+
const int* lda,
|
| 805 |
+
cuComplex* b,
|
| 806 |
+
const int* ldb);
|
| 807 |
+
|
| 808 |
+
void ztrmm(const char* side,
|
| 809 |
+
const char* uplo,
|
| 810 |
+
const char* transa,
|
| 811 |
+
const char* diag,
|
| 812 |
+
const int* m,
|
| 813 |
+
const int* n,
|
| 814 |
+
const cuDoubleComplex* alpha,
|
| 815 |
+
const cuDoubleComplex* a,
|
| 816 |
+
const int* lda,
|
| 817 |
+
cuDoubleComplex* b,
|
| 818 |
+
const int* ldb);
|
| 819 |
+
|
| 820 |
+
#if defined(__cplusplus)
|
| 821 |
+
}
|
| 822 |
+
#endif /* __cplusplus */
|
| 823 |
+
|
| 824 |
+
#endif /* !defined(NVBLAS_H_) */
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/lib/__init__.py
ADDED
|
File without changes
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (223 Bytes). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_cupti/__init__.py
ADDED
|
File without changes
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/__init__.py
ADDED
|
File without changes
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (223 Bytes). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/include/__init__.py
ADDED
|
File without changes
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (231 Bytes). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h
ADDED
|
@@ -0,0 +1,1141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// NVIDIA_COPYRIGHT_BEGIN
|
| 3 |
+
//
|
| 4 |
+
// Copyright (c) 2014-2024, NVIDIA CORPORATION. All rights reserved.
|
| 5 |
+
//
|
| 6 |
+
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 7 |
+
// and proprietary rights in and to this software, related documentation
|
| 8 |
+
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 9 |
+
// distribution of this software and related documentation without an express
|
| 10 |
+
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 11 |
+
//
|
| 12 |
+
// NVIDIA_COPYRIGHT_END
|
| 13 |
+
//
|
| 14 |
+
|
| 15 |
+
#ifndef __NVRTC_H__
|
| 16 |
+
#define __NVRTC_H__
|
| 17 |
+
|
| 18 |
+
#ifdef __cplusplus
|
| 19 |
+
extern "C" {
|
| 20 |
+
#endif /* __cplusplus */
|
| 21 |
+
|
| 22 |
+
#include <stdlib.h>
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
/*************************************************************************//**
|
| 26 |
+
*
|
| 27 |
+
* \defgroup error Error Handling
|
| 28 |
+
*
|
| 29 |
+
* NVRTC defines the following enumeration type and function for API call
|
| 30 |
+
* error handling.
|
| 31 |
+
*
|
| 32 |
+
****************************************************************************/
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
/**
|
| 36 |
+
* \ingroup error
|
| 37 |
+
* \brief The enumerated type nvrtcResult defines API call result codes.
|
| 38 |
+
* NVRTC API functions return nvrtcResult to indicate the call
|
| 39 |
+
* result.
|
| 40 |
+
*/
|
| 41 |
+
typedef enum {
|
| 42 |
+
NVRTC_SUCCESS = 0,
|
| 43 |
+
NVRTC_ERROR_OUT_OF_MEMORY = 1,
|
| 44 |
+
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
|
| 45 |
+
NVRTC_ERROR_INVALID_INPUT = 3,
|
| 46 |
+
NVRTC_ERROR_INVALID_PROGRAM = 4,
|
| 47 |
+
NVRTC_ERROR_INVALID_OPTION = 5,
|
| 48 |
+
NVRTC_ERROR_COMPILATION = 6,
|
| 49 |
+
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
|
| 50 |
+
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
|
| 51 |
+
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
|
| 52 |
+
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
|
| 53 |
+
NVRTC_ERROR_INTERNAL_ERROR = 11,
|
| 54 |
+
NVRTC_ERROR_TIME_FILE_WRITE_FAILED = 12,
|
| 55 |
+
NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED = 13,
|
| 56 |
+
NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED = 14,
|
| 57 |
+
NVRTC_ERROR_PCH_CREATE = 15,
|
| 58 |
+
NVRTC_ERROR_CANCELLED = 16
|
| 59 |
+
} nvrtcResult;
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
/**
|
| 63 |
+
* \ingroup error
|
| 64 |
+
* \brief nvrtcGetErrorString is a helper function that returns a string
|
| 65 |
+
* describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
|
| 66 |
+
* \c "NVRTC_SUCCESS".
|
| 67 |
+
* For unrecognized enumeration values, it returns
|
| 68 |
+
* \c "NVRTC_ERROR unknown".
|
| 69 |
+
*
|
| 70 |
+
* \param [in] result CUDA Runtime Compilation API result code.
|
| 71 |
+
* \return Message string for the given #nvrtcResult code.
|
| 72 |
+
*/
|
| 73 |
+
const char *nvrtcGetErrorString(nvrtcResult result);
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
/*************************************************************************//**
|
| 77 |
+
*
|
| 78 |
+
* \defgroup query General Information Query
|
| 79 |
+
*
|
| 80 |
+
* NVRTC defines the following function for general information query.
|
| 81 |
+
*
|
| 82 |
+
****************************************************************************/
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
/**
|
| 86 |
+
* \ingroup query
|
| 87 |
+
* \brief nvrtcVersion sets the output parameters \p major and \p minor
|
| 88 |
+
* with the CUDA Runtime Compilation version number.
|
| 89 |
+
*
|
| 90 |
+
* \param [out] major CUDA Runtime Compilation major version number.
|
| 91 |
+
* \param [out] minor CUDA Runtime Compilation minor version number.
|
| 92 |
+
* \return
|
| 93 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 94 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 95 |
+
*
|
| 96 |
+
*/
|
| 97 |
+
nvrtcResult nvrtcVersion(int *major, int *minor);
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
/**
|
| 101 |
+
* \ingroup query
|
| 102 |
+
* \brief nvrtcGetNumSupportedArchs sets the output parameter \p numArchs
|
| 103 |
+
* with the number of architectures supported by NVRTC. This can
|
| 104 |
+
* then be used to pass an array to ::nvrtcGetSupportedArchs to
|
| 105 |
+
* get the supported architectures.
|
| 106 |
+
*
|
| 107 |
+
* \param [out] numArchs number of supported architectures.
|
| 108 |
+
* \return
|
| 109 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 110 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 111 |
+
*
|
| 112 |
+
* \see ::nvrtcGetSupportedArchs
|
| 113 |
+
*/
|
| 114 |
+
nvrtcResult nvrtcGetNumSupportedArchs(int* numArchs);
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
/**
|
| 118 |
+
* \ingroup query
|
| 119 |
+
* \brief nvrtcGetSupportedArchs populates the array passed via the output parameter
|
| 120 |
+
* \p supportedArchs with the architectures supported by NVRTC. The array is
|
| 121 |
+
* sorted in ascending order. The size of the array to be passed can be
|
| 122 |
+
* determined using ::nvrtcGetNumSupportedArchs.
|
| 123 |
+
*
|
| 124 |
+
* \param [out] supportedArchs sorted array of supported architectures.
|
| 125 |
+
* \return
|
| 126 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 127 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 128 |
+
*
|
| 129 |
+
* \see ::nvrtcGetNumSupportedArchs
|
| 130 |
+
*/
|
| 131 |
+
nvrtcResult nvrtcGetSupportedArchs(int* supportedArchs);
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
/*************************************************************************//**
|
| 135 |
+
*
|
| 136 |
+
* \defgroup compilation Compilation
|
| 137 |
+
*
|
| 138 |
+
* NVRTC defines the following type and functions for actual compilation.
|
| 139 |
+
*
|
| 140 |
+
****************************************************************************/
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
/**
|
| 144 |
+
* \ingroup compilation
|
| 145 |
+
* \brief nvrtcProgram is the unit of compilation, and an opaque handle for
|
| 146 |
+
* a program.
|
| 147 |
+
*
|
| 148 |
+
* To compile a CUDA program string, an instance of nvrtcProgram must be
|
| 149 |
+
* created first with ::nvrtcCreateProgram, then compiled with
|
| 150 |
+
* ::nvrtcCompileProgram.
|
| 151 |
+
*/
|
| 152 |
+
typedef struct _nvrtcProgram *nvrtcProgram;
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
/**
|
| 156 |
+
* \ingroup compilation
|
| 157 |
+
* \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the
|
| 158 |
+
* given input parameters, and sets the output parameter \p prog with
|
| 159 |
+
* it.
|
| 160 |
+
*
|
| 161 |
+
* \param [out] prog CUDA Runtime Compilation program.
|
| 162 |
+
* \param [in] src CUDA program source.
|
| 163 |
+
* \param [in] name CUDA program name.\n
|
| 164 |
+
* \p name can be \c NULL; \c "default_program" is
|
| 165 |
+
* used when \p name is \c NULL or "".
|
| 166 |
+
* \param [in] numHeaders Number of headers used.\n
|
| 167 |
+
* \p numHeaders must be greater than or equal to 0.
|
| 168 |
+
* \param [in] headers Sources of the headers.\n
|
| 169 |
+
* \p headers can be \c NULL when \p numHeaders is
|
| 170 |
+
* 0.
|
| 171 |
+
* \param [in] includeNames Name of each header by which they can be
|
| 172 |
+
* included in the CUDA program source.\n
|
| 173 |
+
* \p includeNames can be \c NULL when \p numHeaders
|
| 174 |
+
* is 0. These headers must be included with the exact
|
| 175 |
+
* names specified here.
|
| 176 |
+
* \return
|
| 177 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 178 |
+
* - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
|
| 179 |
+
* - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
|
| 180 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 181 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 182 |
+
*
|
| 183 |
+
* \see ::nvrtcDestroyProgram
|
| 184 |
+
*/
|
| 185 |
+
nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
|
| 186 |
+
const char *src,
|
| 187 |
+
const char *name,
|
| 188 |
+
int numHeaders,
|
| 189 |
+
const char * const *headers,
|
| 190 |
+
const char * const *includeNames);
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
/**
|
| 194 |
+
* \ingroup compilation
|
| 195 |
+
* \brief nvrtcDestroyProgram destroys the given program.
|
| 196 |
+
*
|
| 197 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 198 |
+
* \return
|
| 199 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 200 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 201 |
+
*
|
| 202 |
+
* \see ::nvrtcCreateProgram
|
| 203 |
+
*/
|
| 204 |
+
nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
/**
|
| 208 |
+
* \ingroup compilation
|
| 209 |
+
* \brief nvrtcCompileProgram compiles the given program.
|
| 210 |
+
*
|
| 211 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 212 |
+
* \param [in] numOptions Number of compiler options passed.
|
| 213 |
+
* \param [in] options Compiler options in the form of C string array.\n
|
| 214 |
+
* \p options can be \c NULL when \p numOptions is 0.
|
| 215 |
+
*
|
| 216 |
+
* \return
|
| 217 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 218 |
+
* - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
|
| 219 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 220 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 221 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
|
| 222 |
+
* - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
|
| 223 |
+
* - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
|
| 224 |
+
* - \link #nvrtcResult NVRTC_ERROR_TIME_FILE_WRITE_FAILED \endlink
|
| 225 |
+
* - \link #nvrtcResult NVRTC_ERROR_CANCELLED \endlink
|
| 226 |
+
*
|
| 227 |
+
* It supports compile options listed in \ref options.
|
| 228 |
+
*/
|
| 229 |
+
nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
|
| 230 |
+
int numOptions, const char * const *options);
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
/**
|
| 234 |
+
* \ingroup compilation
|
| 235 |
+
* \brief nvrtcGetPTXSize sets the value of \p ptxSizeRet with the size of the PTX
|
| 236 |
+
* generated by the previous compilation of \p prog (including the
|
| 237 |
+
* trailing \c NUL terminator).
|
| 238 |
+
*
|
| 239 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 240 |
+
* \param [out] ptxSizeRet Size of the generated PTX (including the trailing
|
| 241 |
+
* \c NUL terminator).
|
| 242 |
+
* \return
|
| 243 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 244 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 245 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 246 |
+
*
|
| 247 |
+
* \see ::nvrtcGetPTX
|
| 248 |
+
*/
|
| 249 |
+
nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
/**
|
| 253 |
+
* \ingroup compilation
|
| 254 |
+
* \brief nvrtcGetPTX stores the PTX generated by the previous compilation
|
| 255 |
+
* of \p prog in the memory pointed to by \p ptx.
|
| 256 |
+
*
|
| 257 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 258 |
+
* \param [out] ptx Compiled result.
|
| 259 |
+
* \return
|
| 260 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 261 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 262 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 263 |
+
*
|
| 264 |
+
* \see ::nvrtcGetPTXSize
|
| 265 |
+
*/
|
| 266 |
+
nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
/**
|
| 270 |
+
* \ingroup compilation
|
| 271 |
+
* \brief nvrtcGetCUBINSize sets the value of \p cubinSizeRet with the size of the cubin
|
| 272 |
+
* generated by the previous compilation of \p prog. The value of
|
| 273 |
+
* cubinSizeRet is set to 0 if the value specified to \c -arch is a
|
| 274 |
+
* virtual architecture instead of an actual architecture.
|
| 275 |
+
*
|
| 276 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 277 |
+
* \param [out] cubinSizeRet Size of the generated cubin.
|
| 278 |
+
* \return
|
| 279 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 280 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 281 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 282 |
+
*
|
| 283 |
+
* \see ::nvrtcGetCUBIN
|
| 284 |
+
*/
|
| 285 |
+
nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t *cubinSizeRet);
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
/**
|
| 289 |
+
* \ingroup compilation
|
| 290 |
+
* \brief nvrtcGetCUBIN stores the cubin generated by the previous compilation
|
| 291 |
+
* of \p prog in the memory pointed to by \p cubin. No cubin is available
|
| 292 |
+
* if the value specified to \c -arch is a virtual architecture instead
|
| 293 |
+
* of an actual architecture.
|
| 294 |
+
*
|
| 295 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 296 |
+
* \param [out] cubin Compiled and assembled result.
|
| 297 |
+
* \return
|
| 298 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 299 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 300 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 301 |
+
*
|
| 302 |
+
* \see ::nvrtcGetCUBINSize
|
| 303 |
+
*/
|
| 304 |
+
nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
#if defined(_WIN32)
|
| 308 |
+
# define __DEPRECATED__(msg) __declspec(deprecated(msg))
|
| 309 |
+
#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
|
| 310 |
+
# define __DEPRECATED__(msg) __attribute__((deprecated))
|
| 311 |
+
#elif (defined(__GNUC__))
|
| 312 |
+
# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
|
| 313 |
+
#else
|
| 314 |
+
# define __DEPRECATED__(msg)
|
| 315 |
+
#endif
|
| 316 |
+
|
| 317 |
+
/**
|
| 318 |
+
* \ingroup compilation
|
| 319 |
+
* \brief
|
| 320 |
+
* DEPRECATION NOTICE: This function will be removed in a future release. Please use
|
| 321 |
+
* nvrtcGetLTOIRSize (and nvrtcGetLTOIR) instead.
|
| 322 |
+
*/
|
| 323 |
+
__DEPRECATED__("This function will be removed in a future release. Please use nvrtcGetLTOIRSize instead")
|
| 324 |
+
nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t *nvvmSizeRet);
|
| 325 |
+
|
| 326 |
+
/**
|
| 327 |
+
* \ingroup compilation
|
| 328 |
+
* \brief
|
| 329 |
+
* DEPRECATION NOTICE: This function will be removed in a future release. Please use
|
| 330 |
+
* nvrtcGetLTOIR (and nvrtcGetLTOIRSize) instead.
|
| 331 |
+
*/
|
| 332 |
+
__DEPRECATED__("This function will be removed in a future release. Please use nvrtcGetLTOIR instead")
|
| 333 |
+
nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char *nvvm);
|
| 334 |
+
|
| 335 |
+
#undef __DEPRECATED__
|
| 336 |
+
|
| 337 |
+
/**
|
| 338 |
+
* \ingroup compilation
|
| 339 |
+
* \brief nvrtcGetLTOIRSize sets the value of \p LTOIRSizeRet with the size of the LTO IR
|
| 340 |
+
* generated by the previous compilation of \p prog. The value of
|
| 341 |
+
* LTOIRSizeRet is set to 0 if the program was not compiled with
|
| 342 |
+
* \c -dlto.
|
| 343 |
+
*
|
| 344 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 345 |
+
* \param [out] LTOIRSizeRet Size of the generated LTO IR.
|
| 346 |
+
* \return
|
| 347 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 348 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 349 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 350 |
+
*
|
| 351 |
+
* \see ::nvrtcGetLTOIR
|
| 352 |
+
*/
|
| 353 |
+
nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *LTOIRSizeRet);
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
/**
|
| 357 |
+
* \ingroup compilation
|
| 358 |
+
* \brief nvrtcGetLTOIR stores the LTO IR generated by the previous compilation
|
| 359 |
+
* of \p prog in the memory pointed by \p LTOIR. No LTO IR is available
|
| 360 |
+
* if the program was compiled without \c -dlto.
|
| 361 |
+
*
|
| 362 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 363 |
+
* \param [out] LTOIR Compiled result.
|
| 364 |
+
* \return
|
| 365 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 366 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 367 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 368 |
+
*
|
| 369 |
+
* \see ::nvrtcGetLTOIRSize
|
| 370 |
+
*/
|
| 371 |
+
nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *LTOIR);
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
/**
|
| 375 |
+
* \ingroup compilation
|
| 376 |
+
* \brief nvrtcGetOptiXIRSize sets the value of \p optixirSizeRet with the size of the OptiX IR
|
| 377 |
+
* generated by the previous compilation of \p prog. The value of
|
| 378 |
+
* \p optixirSizeRet is set to 0 if the program was compiled with
|
| 379 |
+
* options incompatible with OptiX IR generation.
|
| 380 |
+
*
|
| 381 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 382 |
+
* \param [out] optixirSizeRet Size of the generated OptiX IR.
|
| 383 |
+
* \return
|
| 384 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 385 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 386 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 387 |
+
*
|
| 388 |
+
* \see ::nvrtcGetOptiXIR
|
| 389 |
+
*/
|
| 390 |
+
nvrtcResult nvrtcGetOptiXIRSize(nvrtcProgram prog, size_t *optixirSizeRet);
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
/**
|
| 394 |
+
* \ingroup compilation
|
| 395 |
+
* \brief nvrtcGetOptiXIR stores the OptiX IR generated by the previous compilation
|
| 396 |
+
* of \p prog in the memory pointed by \p optixir. No OptiX IR is available
|
| 397 |
+
* if the program was compiled with options incompatible with OptiX IR generation.
|
| 398 |
+
*
|
| 399 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 400 |
+
* \param [out] optixir OptiX IR compiled result.
|
| 401 |
+
* \return
|
| 402 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 403 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 404 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 405 |
+
*
|
| 406 |
+
* \see ::nvrtcGetOptiXIRSize
|
| 407 |
+
*/
|
| 408 |
+
nvrtcResult nvrtcGetOptiXIR(nvrtcProgram prog, char *optixir);
|
| 409 |
+
|
| 410 |
+
/**
|
| 411 |
+
* \ingroup compilation
|
| 412 |
+
* \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
|
| 413 |
+
* log generated by the previous compilation of \p prog (including the
|
| 414 |
+
* trailing \c NULL).
|
| 415 |
+
*
|
| 416 |
+
* Note that compilation log may be generated with warnings and informative
|
| 417 |
+
* messages, even when the compilation of \p prog succeeds.
|
| 418 |
+
*
|
| 419 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 420 |
+
* \param [out] logSizeRet Size of the compilation log
|
| 421 |
+
* (including the trailing \c NULL).
|
| 422 |
+
* \return
|
| 423 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 424 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 425 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 426 |
+
*
|
| 427 |
+
* \see ::nvrtcGetProgramLog
|
| 428 |
+
*/
|
| 429 |
+
nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
/**
|
| 433 |
+
* \ingroup compilation
|
| 434 |
+
* \brief nvrtcGetProgramLog stores the log generated by the previous
|
| 435 |
+
* compilation of \p prog in the memory pointed by \p log.
|
| 436 |
+
*
|
| 437 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 438 |
+
* \param [out] log Compilation log.
|
| 439 |
+
* \return
|
| 440 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 441 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 442 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 443 |
+
*
|
| 444 |
+
* \see ::nvrtcGetProgramLogSize
|
| 445 |
+
*/
|
| 446 |
+
nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
/**
|
| 450 |
+
* \ingroup compilation
|
| 451 |
+
* \brief nvrtcAddNameExpression notes the given name expression
|
| 452 |
+
* denoting the address of a __global__ function
|
| 453 |
+
* or __device__/__constant__ variable.
|
| 454 |
+
*
|
| 455 |
+
* The identical name expression string must be provided on a subsequent
|
| 456 |
+
* call to nvrtcGetLoweredName to extract the lowered name.
|
| 457 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 458 |
+
* \param [in] name_expression constant expression denoting the address of
|
| 459 |
+
* a __global__ function or __device__/__constant__ variable.
|
| 460 |
+
* \return
|
| 461 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 462 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 463 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 464 |
+
* - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
|
| 465 |
+
*
|
| 466 |
+
* \see ::nvrtcGetLoweredName
|
| 467 |
+
*/
|
| 468 |
+
nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
|
| 469 |
+
const char * const name_expression);
|
| 470 |
+
|
| 471 |
+
/**
|
| 472 |
+
* \ingroup compilation
|
| 473 |
+
* \brief nvrtcGetLoweredName extracts the lowered (mangled) name
|
| 474 |
+
* for a __global__ function or __device__/__constant__ variable,
|
| 475 |
+
* and updates *lowered_name to point to it. The memory containing
|
| 476 |
+
* the name is released when the NVRTC program is destroyed by
|
| 477 |
+
* nvrtcDestroyProgram.
|
| 478 |
+
* The identical name expression must have been previously
|
| 479 |
+
* provided to nvrtcAddNameExpression.
|
| 480 |
+
*
|
| 481 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 482 |
+
* \param [in] name_expression constant expression denoting the address of
|
| 483 |
+
* a __global__ function or __device__/__constant__ variable.
|
| 484 |
+
* \param [out] lowered_name initialized by the function to point to a
|
| 485 |
+
* C string containing the lowered (mangled)
|
| 486 |
+
* name corresponding to the provided name expression.
|
| 487 |
+
* \return
|
| 488 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 489 |
+
* - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
|
| 490 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 491 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 492 |
+
* - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
|
| 493 |
+
*
|
| 494 |
+
* \see ::nvrtcAddNameExpression
|
| 495 |
+
*/
|
| 496 |
+
nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
|
| 497 |
+
const char *const name_expression,
|
| 498 |
+
const char** lowered_name);
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
/*************************************************************************//**
|
| 502 |
+
*
|
| 503 |
+
* \defgroup precompiled_header Precompiled header (PCH) (CUDA 12.8+)
|
| 504 |
+
*
|
| 505 |
+
* NVRTC defines the following function related to PCH. Also see PCH related
|
| 506 |
+
* flags passed to nvrtcCompileProgram.
|
| 507 |
+
****************************************************************************/
|
| 508 |
+
|
| 509 |
+
|
| 510 |
+
/**
|
| 511 |
+
* \ingroup precompiled_header
|
| 512 |
+
* \brief retrieve the current size of the PCH Heap.
|
| 513 |
+
*
|
| 514 |
+
* \param [out] ret pointer to location where the size of the PCH Heap
|
| 515 |
+
* will be stored
|
| 516 |
+
* \return
|
| 517 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 518 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 519 |
+
*
|
| 520 |
+
*/
|
| 521 |
+
nvrtcResult nvrtcGetPCHHeapSize(size_t* ret);
|
| 522 |
+
|
| 523 |
+
/**
|
| 524 |
+
* \ingroup precompiled_header
|
| 525 |
+
* \brief set the size of the PCH Heap.
|
| 526 |
+
*
|
| 527 |
+
* \param [in] size requested size of the PCH Heap, in bytes
|
| 528 |
+
*
|
| 529 |
+
* \return
|
| 530 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 531 |
+
*
|
| 532 |
+
* The requested size may be rounded up to a platform dependent
|
| 533 |
+
* alignment (e.g. page size). If the PCH Heap has already been allocated,
|
| 534 |
+
* the heap memory will be freed and a new PCH Heap will be allocated.
|
| 535 |
+
*/
|
| 536 |
+
nvrtcResult nvrtcSetPCHHeapSize(size_t size);
|
| 537 |
+
|
| 538 |
+
/**
|
| 539 |
+
* \ingroup precompiled_header
|
| 540 |
+
* \brief returns the PCH creation status.
|
| 541 |
+
*
|
| 542 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 543 |
+
*
|
| 544 |
+
* \return
|
| 545 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 546 |
+
* - \link #nvrtcResult NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED \endlink
|
| 547 |
+
* - \link #nvrtcResult NVRTC_ERROR_PCH_CREATE \endlink
|
| 548 |
+
* - \link #nvrtcResult NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED \endlink
|
| 549 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 550 |
+
*
|
| 551 |
+
* NVRTC_SUCCESS indicates that the PCH was successfully created.
|
| 552 |
+
* NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED indicates that no PCH creation
|
| 553 |
+
* was attempted, either because PCH functionality was not requested during
|
| 554 |
+
* the preceding nvrtcCompileProgram call, or automatic PCH processing was
|
| 555 |
+
* requested, and compiler chose not to create a PCH file.
|
| 556 |
+
* NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED indicates that a PCH file could
|
| 557 |
+
* potentially have been created, but the compiler ran out of space in the PCH
|
| 558 |
+
* heap. In this scenario, nvrtcGetPCHHeapSizeRequired() can be used to
|
| 559 |
+
* query the required heap size, the heap can be reallocated for this size with
|
| 560 |
+
* nvrtcSetPCHHeapSize() and PCH creation may be reattempted by invoking
|
| 561 |
+
* nvrtcCompileProgram() with a new NVRTC program instance.
|
| 562 |
+
* NVRTC_ERROR_PCH_CREATE indicates that an error condition prevented the
|
| 563 |
+
* PCH file from being created.
|
| 564 |
+
*/
|
| 565 |
+
nvrtcResult nvrtcGetPCHCreateStatus(nvrtcProgram prog);
|
| 566 |
+
|
| 567 |
+
/**
|
| 568 |
+
* \ingroup precompiled_header
|
| 569 |
+
* \brief retrieve the required size of the PCH heap required to compile
|
| 570 |
+
* the given program.
|
| 571 |
+
*
|
| 572 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 573 |
+
* \param [out] size pointer to location where the required size of the PCH Heap
|
| 574 |
+
* will be stored
|
| 575 |
+
*
|
| 576 |
+
* \return
|
| 577 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 578 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 579 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 580 |
+
* The size retrieved using this function is only valid if nvrtcGetPCHCreateStatus()
|
| 581 |
+
* returned NVRTC_SUCCESS or NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED
|
| 582 |
+
*/
|
| 583 |
+
nvrtcResult nvrtcGetPCHHeapSizeRequired(nvrtcProgram prog, size_t* size);
|
| 584 |
+
|
| 585 |
+
/**
|
| 586 |
+
* \ingroup compilation
|
| 587 |
+
* \brief nvrtcSetFlowCallback registers a callback function that the compiler
|
| 588 |
+
* will invoke at different points during a call to nvrtcCompileProgram,
|
| 589 |
+
* and the callback function can decide whether to cancel compilation by
|
| 590 |
+
* returning specific values.
|
| 591 |
+
*
|
| 592 |
+
* The callback function must satisfy the following constraints:
|
| 593 |
+
*
|
| 594 |
+
* (1) Its signature should be:
|
| 595 |
+
* @code
|
| 596 |
+
* int callback(void* param1, void* param2);
|
| 597 |
+
* @endcode
|
| 598 |
+
* When invoking the callback, the compiler will always pass \p payload to
|
| 599 |
+
* param1 so that the callback may make decisions based on \p payload . It'll
|
| 600 |
+
* always pass NULL to param2 for now which is reserved for future extensions.
|
| 601 |
+
*
|
| 602 |
+
* (2) It must return 1 to cancel compilation or 0 to continue.
|
| 603 |
+
* Other return values are reserved for future use.
|
| 604 |
+
*
|
| 605 |
+
* (3) It must return consistent values. Once it returns 1 at one point, it must
|
| 606 |
+
* return 1 in all following invocations during the current nvrtcCompileProgram
|
| 607 |
+
* call in progress.
|
| 608 |
+
*
|
| 609 |
+
* (4) It must be thread-safe.
|
| 610 |
+
*
|
| 611 |
+
* (5) It must not invoke any nvrtc/libnvvm/ptx APIs.
|
| 612 |
+
*
|
| 613 |
+
* \param [in] prog CUDA Runtime Compilation program.
|
| 614 |
+
* \param [in] callback the callback that issues cancellation signal.
|
| 615 |
+
* \param [in] payload to be passed as a parameter when invoking the callback.
|
| 616 |
+
* \return
|
| 617 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 618 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
|
| 619 |
+
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
|
| 620 |
+
*/
|
| 621 |
+
nvrtcResult nvrtcSetFlowCallback(nvrtcProgram prog, int (*callback)(void*, void*), void *payload);
|
| 622 |
+
|
| 623 |
+
/**
|
| 624 |
+
* \defgroup options Supported Compile Options
|
| 625 |
+
*
|
| 626 |
+
* NVRTC supports the compile options below.
|
| 627 |
+
* Option names with two preceding dashes (\c --) are long option names and
|
| 628 |
+
* option names with one preceding dash (\c -) are short option names.
|
| 629 |
+
* Short option names can be used instead of long option names.
|
| 630 |
+
* When a compile option takes an argument, an assignment operator (\c =)
|
| 631 |
+
* is used to separate the compile option argument from the compile option
|
| 632 |
+
* name, e.g., \c "--gpu-architecture=compute_60".
|
| 633 |
+
* Alternatively, the compile option name and the argument can be specified in
|
| 634 |
+
* separate strings without an assignment operator, e.g.,
|
| 635 |
+
* \c "--gpu-architecture" \c "compute_60".
|
| 636 |
+
* Single-character short option names, such as \c -D, \c -U, and \c -I, do
|
| 637 |
+
* not require an assignment operator, and the compile option name and the
|
| 638 |
+
* argument can be present in the same string with or without spaces between
|
| 639 |
+
* them.
|
| 640 |
+
* For instance, \c "-D=<def>", \c "-D<def>", and \c "-D <def>" are all
|
| 641 |
+
* supported.
|
| 642 |
+
*
|
| 643 |
+
* The valid compiler options are:
|
| 644 |
+
*
|
| 645 |
+
* - Compilation targets
|
| 646 |
+
* - \c --gpu-architecture=\<arch\> (\c -arch)
|
| 647 |
+
*
|
| 648 |
+
* Specify the name of the class of GPU architectures for which the
|
| 649 |
+
* input must be compiled.\n
|
| 650 |
+
* - Valid <c>\<arch\></c>s:
|
| 651 |
+
* - \c compute_50
|
| 652 |
+
* - \c compute_52
|
| 653 |
+
* - \c compute_53
|
| 654 |
+
* - \c compute_60
|
| 655 |
+
* - \c compute_61
|
| 656 |
+
* - \c compute_62
|
| 657 |
+
* - \c compute_70
|
| 658 |
+
* - \c compute_72
|
| 659 |
+
* - \c compute_75
|
| 660 |
+
* - \c compute_80
|
| 661 |
+
* - \c compute_87
|
| 662 |
+
* - \c compute_89
|
| 663 |
+
* - \c compute_90
|
| 664 |
+
* - \c compute_90a
|
| 665 |
+
* - \c compute_100
|
| 666 |
+
* - \c compute_100a
|
| 667 |
+
* - \c sm_50
|
| 668 |
+
* - \c sm_52
|
| 669 |
+
* - \c sm_53
|
| 670 |
+
* - \c sm_60
|
| 671 |
+
* - \c sm_61
|
| 672 |
+
* - \c sm_62
|
| 673 |
+
* - \c sm_70
|
| 674 |
+
* - \c sm_72
|
| 675 |
+
* - \c sm_75
|
| 676 |
+
* - \c sm_80
|
| 677 |
+
* - \c sm_87
|
| 678 |
+
* - \c sm_89
|
| 679 |
+
* - \c sm_90
|
| 680 |
+
* - \c sm_90a
|
| 681 |
+
* - \c sm_100
|
| 682 |
+
* - \c sm_100a
|
| 683 |
+
* - Default: \c compute_52
|
| 684 |
+
* - Separate compilation / whole-program compilation
|
| 685 |
+
* - \c --device-c (\c -dc)
|
| 686 |
+
*
|
| 687 |
+
* Generate relocatable code that can be linked with other relocatable
|
| 688 |
+
* device code. It is equivalent to \c --relocatable-device-code=true.
|
| 689 |
+
* - \c --device-w (\c -dw)
|
| 690 |
+
*
|
| 691 |
+
* Generate non-relocatable code. It is equivalent to \c --relocatable-device-code=false.
|
| 692 |
+
* - \c --relocatable-device-code={true|false} (\c -rdc)
|
| 693 |
+
*
|
| 694 |
+
* Enable (disable) the generation of relocatable device code.
|
| 695 |
+
* - Default: \c false
|
| 696 |
+
* - \c --extensible-whole-program (\c -ewp)
|
| 697 |
+
*
|
| 698 |
+
* Do extensible whole program compilation of device code.
|
| 699 |
+
* - Default: \c false
|
| 700 |
+
* - Debugging support
|
| 701 |
+
* - \c --device-debug (\c -G)
|
| 702 |
+
*
|
| 703 |
+
* Generate debug information. If \c --dopt is not specified, then turns off all optimizations.
|
| 704 |
+
* - \c --generate-line-info (\c -lineinfo)
|
| 705 |
+
*
|
| 706 |
+
* Generate line-number information.
|
| 707 |
+
* - Code generation
|
| 708 |
+
* - \c --dopt \c on (\c -dopt)
|
| 709 |
+
*
|
| 710 |
+
* - \c --dopt=on
|
| 711 |
+
*
|
| 712 |
+
* Enable device code optimization. When specified along with \c -G, enables
|
| 713 |
+
* limited debug information generation for optimized device code (currently,
|
| 714 |
+
* only line number information). When \c -G is not specified, \c -dopt=on is implicit.
|
| 715 |
+
*
|
| 716 |
+
* - \c --ptxas-options \<options\> (\c -Xptxas)
|
| 717 |
+
*
|
| 718 |
+
* - \c --ptxas-options=\<options\>
|
| 719 |
+
*
|
| 720 |
+
* Specify options directly to ptxas, the PTX optimizing assembler.
|
| 721 |
+
* - \c --maxrregcount=\<N\> (\c -maxrregcount)
|
| 722 |
+
*
|
| 723 |
+
* Specify the maximum amount of registers that GPU functions can use.
|
| 724 |
+
* Up to a function-specific limit, a higher value will generally
|
| 725 |
+
* increase the performance of individual GPU threads that execute this
|
| 726 |
+
* function. However, because thread registers are allocated from a
|
| 727 |
+
* global register pool on each GPU, a higher value of this option will
|
| 728 |
+
* also reduce the maximum thread block size, thereby reducing the amount
|
| 729 |
+
* of thread parallelism. Hence, a good maxrregcount value is the result
|
| 730 |
+
* of a trade-off. If this option is not specified, then no maximum is
|
| 731 |
+
* assumed. Value less than the minimum registers required by ABI will
|
| 732 |
+
* be bumped up by the compiler to ABI minimum limit.
|
| 733 |
+
*
|
| 734 |
+
* - \c --ftz={true|false} (\c -ftz)
|
| 735 |
+
*
|
| 736 |
+
* When performing single-precision floating-point operations, flush
|
| 737 |
+
* denormal values to zero or preserve denormal values.
|
| 738 |
+
*
|
| 739 |
+
* \c --use_fast_math implies \c --ftz=true.
|
| 740 |
+
* - Default: \c false
|
| 741 |
+
*
|
| 742 |
+
* - \c --prec-sqrt={true|false} (\c -prec-sqrt)
|
| 743 |
+
*
|
| 744 |
+
* For single-precision floating-point square root, use IEEE
|
| 745 |
+
* round-to-nearest mode or use a faster approximation.
|
| 746 |
+
* \c --use_fast_math implies \c --prec-sqrt=false.
|
| 747 |
+
* - Default: \c true
|
| 748 |
+
*
|
| 749 |
+
* - \c --prec-div={true|false} (\c -prec-div)
|
| 750 |
+
* For single-precision floating-point division and reciprocals, use IEEE
|
| 751 |
+
* round-to-nearest mode or use a faster approximation.
|
| 752 |
+
* \c --use_fast_math implies \c --prec-div=false.
|
| 753 |
+
* - Default: \c true
|
| 754 |
+
*
|
| 755 |
+
* - \c --fmad={true|false} (\c -fmad)
|
| 756 |
+
*
|
| 757 |
+
* Enables (disables) the contraction of floating-point multiplies and
|
| 758 |
+
* adds/subtracts into floating-point multiply-add operations (FMAD,
|
| 759 |
+
* FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true.
|
| 760 |
+
* - Default: \c true
|
| 761 |
+
*
|
| 762 |
+
* - \c --use_fast_math (\c -use_fast_math)
|
| 763 |
+
*
|
| 764 |
+
* Make use of fast math operations.
|
| 765 |
+
* \c --use_fast_math implies \c --ftz=true \c --prec-div=false
|
| 766 |
+
* \c --prec-sqrt=false \c --fmad=true.
|
| 767 |
+
*
|
| 768 |
+
* - \c --extra-device-vectorization (\c -extra-device-vectorization)
|
| 769 |
+
*
|
| 770 |
+
* Enables more aggressive device code vectorization in the NVVM optimizer.
|
| 771 |
+
*
|
| 772 |
+
* - \c --modify-stack-limit={true|false} (\c -modify-stack-limit)
|
| 773 |
+
*
|
| 774 |
+
* On Linux, during compilation, use \c setrlimit() to increase stack size
|
| 775 |
+
* to maximum allowed. The limit is reset to the previous value at the
|
| 776 |
+
* end of compilation.
|
| 777 |
+
* Note: \c setrlimit() changes the value for the entire process.
|
| 778 |
+
* - Default: \c true
|
| 779 |
+
*
|
| 780 |
+
* - \c --dlink-time-opt (\c -dlto)
|
| 781 |
+
*
|
| 782 |
+
* Generate intermediate code for later link-time optimization.
|
| 783 |
+
* It implies \c -rdc=true.
|
| 784 |
+
* Note: when this option is used the \c nvrtcGetLTOIR API should be used,
|
| 785 |
+
* as PTX or Cubin will not be generated.
|
| 786 |
+
*
|
| 787 |
+
* - \c --gen-opt-lto (\c -gen-opt-lto)
|
| 788 |
+
*
|
| 789 |
+
* Run the optimizer passes before generating the LTO IR.
|
| 790 |
+
*
|
| 791 |
+
* - \c --optix-ir (\c -optix-ir)
|
| 792 |
+
*
|
| 793 |
+
* Generate OptiX IR. The OptiX IR is only intended for consumption by OptiX
|
| 794 |
+
* through appropriate APIs. This feature is not supported with
|
| 795 |
+
* link-time-optimization (\c -dlto).
|
| 796 |
+
*
|
| 797 |
+
* Note: when this option is used the nvrtcGetOptiXIR API should be used,
|
| 798 |
+
* as PTX or Cubin will not be generated.
|
| 799 |
+
*
|
| 800 |
+
* - \c --jump-table-density=[0-101] (\c -jtd)
|
| 801 |
+
*
|
| 802 |
+
* Specify the case density percentage in switch statements, and use it as
|
| 803 |
+
* a minimal threshold to determine whether a jump table (brx.idx instruction)
|
| 804 |
+
* will be used to implement a switch statement. Default value is 101. The
|
| 805 |
+
* percentage ranges from 0 to 101 inclusively.
|
| 806 |
+
*
|
| 807 |
+
* - \c --device-stack-protector={true|false} (\c -device-stack-protector)
|
| 808 |
+
*
|
| 809 |
+
* Enable (disable) the generation of stack canaries in device code.
|
| 810 |
+
*
|
| 811 |
+
* - Default: \c false
|
| 812 |
+
*
|
| 813 |
+
* - Preprocessing
|
| 814 |
+
* - \c --define-macro=\<def\> (\c -D)
|
| 815 |
+
*
|
| 816 |
+
* \c \<def\> can be either \c \<name\> or \c \<name\>=\<definition\>.
|
| 817 |
+
* - \c \<name\>
|
| 818 |
+
*
|
| 819 |
+
* Predefine \c \<name\> as a macro with definition \c 1.
|
| 820 |
+
* - \c \<name\>=\<definition\>
|
| 821 |
+
*
|
| 822 |
+
* The contents of \c \<definition\> are tokenized and preprocessed
|
| 823 |
+
* as if they appeared during translation phase three in a \c \#define
|
| 824 |
+
* directive. In particular, the definition will be truncated by
|
| 825 |
+
* embedded new line characters.
|
| 826 |
+
*
|
| 827 |
+
* - \c --undefine-macro=\<def\> (\c -U)
|
| 828 |
+
*
|
| 829 |
+
* Cancel any previous definition of \c \<def\>.
|
| 830 |
+
*
|
| 831 |
+
* - \c --include-path=\<dir\> (\c -I)
|
| 832 |
+
*
|
| 833 |
+
* Add the directory \c \<dir\> to the list of directories to be
|
| 834 |
+
* searched for headers. These paths are searched after the list of
|
| 835 |
+
* headers given to ::nvrtcCreateProgram.
|
| 836 |
+
*
|
| 837 |
+
* - \c --pre-include=\<header\> (\c -include)
|
| 838 |
+
*
|
| 839 |
+
* Preinclude \c \<header\> during preprocessing.
|
| 840 |
+
*
|
| 841 |
+
* - \c --no-source-include (\c -no-source-include)
|
| 842 |
+
*
|
| 843 |
+
* The preprocessor by default adds the directory of each input source
|
| 844 |
+
* to the include path. This option disables this feature and only
|
| 845 |
+
* considers the path specified explicitly.
|
| 846 |
+
*
|
| 847 |
+
* - Language Dialect
|
| 848 |
+
* - \c --std={c++03|c++11|c++14|c++17|c++20} (\c -std)
|
| 849 |
+
*
|
| 850 |
+
* Set language dialect to C++03, C++11, C++14, C++17 or C++20
|
| 851 |
+
* - Default: \c c++17
|
| 852 |
+
*
|
| 853 |
+
* - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)
|
| 854 |
+
*
|
| 855 |
+
* Provide builtin definitions of \c std::move and \c std::forward,
|
| 856 |
+
* when C++11 or later language dialect is selected.
|
| 857 |
+
* - Default: \c true
|
| 858 |
+
*
|
| 859 |
+
* - \c --builtin-initializer-list={true|false}
|
| 860 |
+
* (\c -builtin-initializer-list)
|
| 861 |
+
*
|
| 862 |
+
* Provide builtin definitions of \c std::initializer_list class and
|
| 863 |
+
* member functions when C++11 or later language dialect is selected.
|
| 864 |
+
* - Default: \c true
|
| 865 |
+
*
|
| 866 |
+
* - Precompiled header support (CUDA 12.8+)
|
| 867 |
+
* - \c --pch (\c -pch)
|
| 868 |
+
*
|
| 869 |
+
* Enable automatic PCH processing.
|
| 870 |
+
*
|
| 871 |
+
* - \c --create-pch=<file-name> (\c -create-pch)
|
| 872 |
+
*
|
| 873 |
+
* Create a PCH file.
|
| 874 |
+
*
|
| 875 |
+
* - \c --use-pch=<file-name> (\c -use-pch)
|
| 876 |
+
*
|
| 877 |
+
* Use the specified PCH file.
|
| 878 |
+
*
|
| 879 |
+
* - \c --pch-dir=<directory-name> (\c -pch-dir)
|
| 880 |
+
*
|
| 881 |
+
* When using automatic PCH (\c -pch), look for and create PCH files in the
|
| 882 |
+
* specified directory. When using explicit PCH (\c -create-pch or \c -use-pch),
|
| 883 |
+
* the directory name is prefixed before the specified file name, unless
|
| 884 |
+
* the file name is an absolute path name.
|
| 885 |
+
*
|
| 886 |
+
* - \c --pch-verbose={true|false} (\c -pch-verbose)
|
| 887 |
+
*
|
| 888 |
+
* In automatic PCH mode, for each PCH file that could not be used in current
|
| 889 |
+
* compilation, print the reason in the compilation log.
|
| 890 |
+
* - Default: \c true
|
| 891 |
+
*
|
| 892 |
+
* - \c --pch-messages={true|false} (\c -pch-messages)
|
| 893 |
+
*
|
| 894 |
+
* Print a message in the compilation log, if a PCH file was created or used
|
| 895 |
+
* in the current compilation.
|
| 896 |
+
* - Default: \c true
|
| 897 |
+
*
|
| 898 |
+
* - \c --instantiate-templates-in-pch={true|false} (\c -instantiate-templates-in-pch)
|
| 899 |
+
*
|
| 900 |
+
* Enable or disable instantiation of templates before PCH creation. Instantiating
|
| 901 |
+
* templates may increase the size of the PCH file, while reducing the compilation
|
| 902 |
+
* cost when using the PCH file (since some template instantiations can be skipped).
|
| 903 |
+
* - Default: \c true
|
| 904 |
+
*
|
| 905 |
+
* - Misc.
|
| 906 |
+
* - \c --disable-warnings (\c -w)
|
| 907 |
+
*
|
| 908 |
+
* Inhibit all warning messages.
|
| 909 |
+
*
|
| 910 |
+
* - \c --restrict (\c -restrict)
|
| 911 |
+
*
|
| 912 |
+
* Programmer assertion that all kernel pointer parameters are restrict
|
| 913 |
+
* pointers.
|
| 914 |
+
*
|
| 915 |
+
* - \c --device-as-default-execution-space
|
| 916 |
+
* (\c -default-device)
|
| 917 |
+
*
|
| 918 |
+
* Treat entities with no execution space annotation as \c __device__
|
| 919 |
+
* entities.
|
| 920 |
+
*
|
| 921 |
+
* - \c --device-int128 (\c -device-int128)
|
| 922 |
+
*
|
| 923 |
+
* Allow the \c __int128 type in device code. Also causes the macro \c __CUDACC_RTC_INT128__
|
| 924 |
+
* to be defined.
|
| 925 |
+
*
|
| 926 |
+
* - \c --device-float128 (\c -device-float128)
|
| 927 |
+
*
|
| 928 |
+
* Allow the \c __float128 and \c _Float128 types in device code. Also
|
| 929 |
+
* causes the macro \c __CUDACC_RTC_FLOAT128__ to be defined.
|
| 930 |
+
*
|
| 931 |
+
* - \c --optimization-info=\<kind\> (\c -opt-info)
|
| 932 |
+
*
|
| 933 |
+
* Provide optimization reports for the specified kind of optimization.
|
| 934 |
+
* The following kind tags are supported:
|
| 935 |
+
* - \c inline : emit a remark when a function is inlined.
|
| 936 |
+
*
|
| 937 |
+
* - \c --display-error-number (\c -err-no)
|
| 938 |
+
*
|
| 939 |
+
* Display diagnostic number for warning messages. (Default)
|
| 940 |
+
*
|
| 941 |
+
* - \c --no-display-error-number (\c -no-err-no)
|
| 942 |
+
*
|
| 943 |
+
* Disables the display of a diagnostic number for warning messages.
|
| 944 |
+
*
|
| 945 |
+
* - \c --diag-error=<error-number>,... (\c -diag-error)
|
| 946 |
+
*
|
| 947 |
+
* Emit error for specified diagnostic message number(s). Message numbers can be separated by comma.
|
| 948 |
+
*
|
| 949 |
+
* - \c --diag-suppress=<error-number>,... (\c -diag-suppress)
|
| 950 |
+
*
|
| 951 |
+
* Suppress specified diagnostic message number(s). Message numbers can be separated by comma.
|
| 952 |
+
*
|
| 953 |
+
* - \c --diag-warn=<error-number>,... (\c -diag-warn)
|
| 954 |
+
*
|
| 955 |
+
* Emit warning for specified diagnostic message number(s). Message numbers can be separated by comma.
|
| 956 |
+
*
|
| 957 |
+
* - \c --brief-diagnostics={true|false} (\c -brief-diag)
|
| 958 |
+
*
|
| 959 |
+
* This option disables or enables showing source line and column info
|
| 960 |
+
* in a diagnostic.
|
| 961 |
+
* The \c --brief-diagnostics=true will not show the source line and column info.
|
| 962 |
+
* - Default: \c false
|
| 963 |
+
*
|
| 964 |
+
* - \c --time=<file-name> (\c -time)
|
| 965 |
+
*
|
| 966 |
+
* Generate a comma separated value table with the time taken by each compilation
|
| 967 |
+
* phase, and append it at the end of the file given as the option argument.
|
| 968 |
+
* If the file does not exist, the column headings are generated in the first row
|
| 969 |
+
* of the table. If the file name is '-', the timing data is written to the compilation log.
|
| 970 |
+
*
|
| 971 |
+
* - \c --split-compile=<number-of-threads> (\c -split-compile=<number-of-threads>)
|
| 972 |
+
*
|
| 973 |
+
* Perform compiler optimizations in parallel.
|
| 974 |
+
* Split compilation attempts to reduce compile time by enabling the compiler to run certain
|
| 975 |
+
* optimization passes concurrently. This option accepts a numerical value that specifies the
|
| 976 |
+
* maximum number of threads the compiler can use. One can also allow the compiler to use the maximum
|
| 977 |
+
* threads available on the system by setting \c --split-compile=0.
|
| 978 |
+
* Setting \c --split-compile=1 will cause this option to be ignored.
|
| 979 |
+
*
|
| 980 |
+
* - \c --fdevice-syntax-only (\c -fdevice-syntax-only)
|
| 981 |
+
*
|
| 982 |
+
* Ends device compilation after front-end syntax checking. This option does not generate valid
|
| 983 |
+
* device code.
|
| 984 |
+
*
|
| 985 |
+
* - \c --minimal (\c -minimal)
|
| 986 |
+
*
|
| 987 |
+
* Omit certain language features to reduce compile time for small programs.
|
| 988 |
+
* In particular, the following are omitted:
|
| 989 |
+
* - Texture and surface functions and associated types, e.g., \c cudaTextureObject_t.
|
| 990 |
+
* - CUDA Runtime Functions that are provided by the cudadevrt device code library,
|
| 991 |
+
* typically named with prefix "cuda", e.g., \c cudaMalloc.
|
| 992 |
+
* - Kernel launch from device code.
|
| 993 |
+
* - Types and macros associated with CUDA Runtime and Driver APIs,
|
| 994 |
+
* provided by \c cuda/tools/cudart/driver_types.h, typically named with prefix "cuda", e.g., \c cudaError_t.
|
| 995 |
+
*
|
| 996 |
+
* - \c --device-stack-protector (\c -device-stack-protector)
|
| 997 |
+
*
|
| 998 |
+
* Enable stack canaries in device code.
|
| 999 |
+
* Stack canaries make it more difficult to exploit certain types of memory safety bugs involving
|
| 1000 |
+
* stack-local variables. The compiler uses heuristics to assess the risk of such a bug in each function.
|
| 1001 |
+
* Only those functions which are deemed high-risk make use of a stack canary.
|
| 1002 |
+
*
|
| 1003 |
+
* - \c --fdevice-time-trace=<file-name> (\c -fdevice-time-trace=<file-name>)
|
| 1004 |
+
* Enables the time profiler, outputting a JSON file based on given <file-name>. Results can be analyzed on
|
| 1005 |
+
* chrome://tracing for a flamegraph visualization.
|
| 1006 |
+
*
|
| 1007 |
+
*/
|
| 1008 |
+
|
| 1009 |
+
#ifdef __cplusplus
|
| 1010 |
+
}
|
| 1011 |
+
#endif /* __cplusplus */
|
| 1012 |
+
|
| 1013 |
+
|
| 1014 |
+
/* The utility function 'nvrtcGetTypeName' is not available by default. Define
|
| 1015 |
+
the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
|
| 1016 |
+
*/
|
| 1017 |
+
|
| 1018 |
+
#if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
|
| 1019 |
+
|
| 1020 |
+
#if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
|
| 1021 |
+
#include <cxxabi.h>
|
| 1022 |
+
#include <cstdlib>
|
| 1023 |
+
|
| 1024 |
+
#elif defined(_WIN32)
|
| 1025 |
+
#include <Windows.h>
|
| 1026 |
+
#include <DbgHelp.h>
|
| 1027 |
+
#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
|
| 1028 |
+
|
| 1029 |
+
|
| 1030 |
+
#include <string>
|
| 1031 |
+
#include <typeinfo>
|
| 1032 |
+
|
| 1033 |
+
/* Empty tag template. nvrtcGetTypeName<T>() takes typeid() of this wrapper
   instantiated with T and then extracts the text between the wrapper's angle
   brackets from the demangled name. */
template <typename T> struct __nvrtcGetTypeName_helper_t { };
|
| 1034 |
+
|
| 1035 |
+
/*************************************************************************//**
|
| 1036 |
+
*
|
| 1037 |
+
* \defgroup hosthelper Host Helper
|
| 1038 |
+
*
|
| 1039 |
+
* NVRTC defines the following functions for easier interaction with host code.
|
| 1040 |
+
*
|
| 1041 |
+
****************************************************************************/
|
| 1042 |
+
|
| 1043 |
+
/**
|
| 1044 |
+
* \ingroup hosthelper
|
| 1045 |
+
* \brief nvrtcGetTypeName stores the source level name of a type in the given
|
| 1046 |
+
* std::string location.
|
| 1047 |
+
*
|
| 1048 |
+
* This function is only provided when the macro NVRTC_GET_TYPE_NAME is
|
| 1049 |
+
* defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
|
| 1050 |
+
* function calls to extract the type name, when using gcc/clang or cl.exe compilers,
|
| 1051 |
+
* respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
|
| 1052 |
+
* otherwise *result is initialized with the extracted name.
|
| 1053 |
+
*
|
| 1054 |
+
* Windows-specific notes:
|
| 1055 |
+
* - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
|
| 1056 |
+
* which is not multi-thread safe.
|
| 1057 |
+
* - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
|
| 1058 |
+
*
|
| 1059 |
+
* \param [in] tinfo: reference to object of type std::type_info for a given type.
|
| 1060 |
+
* \param [in] result: pointer to std::string in which to store the type name.
|
| 1061 |
+
* \return
|
| 1062 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 1063 |
+
* - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
|
| 1064 |
+
*
|
| 1065 |
+
*/
|
| 1066 |
+
inline nvrtcResult nvrtcGetTypeName(const std::type_info &tinfo, std::string *result)
|
| 1067 |
+
{
|
| 1068 |
+
#if USE_CXXABI || __clang__ || __GNUC__
|
| 1069 |
+
const char *name = tinfo.name();
|
| 1070 |
+
int status;
|
| 1071 |
+
char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
|
| 1072 |
+
if (status == 0) {
|
| 1073 |
+
*result = undecorated_name;
|
| 1074 |
+
free(undecorated_name);
|
| 1075 |
+
return NVRTC_SUCCESS;
|
| 1076 |
+
}
|
| 1077 |
+
#elif defined(_WIN32)
|
| 1078 |
+
const char *name = tinfo.raw_name();
|
| 1079 |
+
if (!name || *name != '.') {
|
| 1080 |
+
return NVRTC_ERROR_INTERNAL_ERROR;
|
| 1081 |
+
}
|
| 1082 |
+
char undecorated_name[4096];
|
| 1083 |
+
//name+1 skips over the '.' prefix
|
| 1084 |
+
if(UnDecorateSymbolName(name+1, undecorated_name,
|
| 1085 |
+
sizeof(undecorated_name) / sizeof(*undecorated_name),
|
| 1086 |
+
//note: doesn't seem to work correctly without UNDNAME_NO_ARGUMENTS.
|
| 1087 |
+
UNDNAME_NO_ARGUMENTS | UNDNAME_NAME_ONLY ) ) {
|
| 1088 |
+
*result = undecorated_name;
|
| 1089 |
+
return NVRTC_SUCCESS;
|
| 1090 |
+
}
|
| 1091 |
+
#endif /* USE_CXXABI || __clang__ || __GNUC__ */
|
| 1092 |
+
|
| 1093 |
+
return NVRTC_ERROR_INTERNAL_ERROR;
|
| 1094 |
+
}
|
| 1095 |
+
|
| 1096 |
+
/**
|
| 1097 |
+
* \ingroup hosthelper
|
| 1098 |
+
* \brief nvrtcGetTypeName stores the source level name of the template type argument
|
| 1099 |
+
* T in the given std::string location.
|
| 1100 |
+
*
|
| 1101 |
+
* This function is only provided when the macro NVRTC_GET_TYPE_NAME is
|
| 1102 |
+
* defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
|
| 1103 |
+
* function calls to extract the type name, when using gcc/clang or cl.exe compilers,
|
| 1104 |
+
* respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
|
| 1105 |
+
* otherwise *result is initialized with the extracted name.
|
| 1106 |
+
*
|
| 1107 |
+
* Windows-specific notes:
|
| 1108 |
+
* - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
|
| 1109 |
+
* which is not multi-thread safe.
|
| 1110 |
+
* - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
|
| 1111 |
+
*
|
| 1112 |
+
* \param [in] result: pointer to std::string in which to store the type name.
|
| 1113 |
+
* \return
|
| 1114 |
+
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
|
| 1115 |
+
* - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
|
| 1116 |
+
*
|
| 1117 |
+
*/
|
| 1118 |
+
|
| 1119 |
+
template <typename T>
|
| 1120 |
+
nvrtcResult nvrtcGetTypeName(std::string *result)
|
| 1121 |
+
{
|
| 1122 |
+
nvrtcResult res = nvrtcGetTypeName(typeid(__nvrtcGetTypeName_helper_t<T>),
|
| 1123 |
+
result);
|
| 1124 |
+
if (res != NVRTC_SUCCESS)
|
| 1125 |
+
return res;
|
| 1126 |
+
|
| 1127 |
+
std::string repr = *result;
|
| 1128 |
+
std::size_t idx = repr.find("__nvrtcGetTypeName_helper_t");
|
| 1129 |
+
idx = (idx != std::string::npos) ? repr.find("<", idx) : idx;
|
| 1130 |
+
std::size_t last_idx = repr.find_last_of('>');
|
| 1131 |
+
if (idx == std::string::npos || last_idx == std::string::npos) {
|
| 1132 |
+
return NVRTC_ERROR_INTERNAL_ERROR;
|
| 1133 |
+
}
|
| 1134 |
+
++idx;
|
| 1135 |
+
*result = repr.substr(idx, last_idx - idx);
|
| 1136 |
+
return NVRTC_SUCCESS;
|
| 1137 |
+
}
|
| 1138 |
+
|
| 1139 |
+
#endif /* NVRTC_GET_TYPE_NAME */
|
| 1140 |
+
|
| 1141 |
+
#endif /* __NVRTC_H__ */
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/lib/__init__.py
ADDED
|
File without changes
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (227 Bytes). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/__init__.py
ADDED
|
File without changes
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (225 Bytes). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/__init__.py
ADDED
|
File without changes
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (233 Bytes). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/builtin_types.h
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/*******************************************************************************
|
| 51 |
+
* *
|
| 52 |
+
* *
|
| 53 |
+
* *
|
| 54 |
+
*******************************************************************************/
|
| 55 |
+
|
| 56 |
+
#include "device_types.h"
|
| 57 |
+
#if !defined(__CUDACC_RTC__)
|
| 58 |
+
#define EXCLUDE_FROM_RTC
|
| 59 |
+
#include "driver_types.h"
|
| 60 |
+
#undef EXCLUDE_FROM_RTC
|
| 61 |
+
#endif /* !__CUDACC_RTC__ */
|
| 62 |
+
#include "surface_types.h"
|
| 63 |
+
#include "texture_types.h"
|
| 64 |
+
#include "vector_types.h"
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h
ADDED
|
@@ -0,0 +1,597 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CHANNEL_DESCRIPTOR_H__)
|
| 51 |
+
#define __CHANNEL_DESCRIPTOR_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__cplusplus)
|
| 54 |
+
|
| 55 |
+
/*******************************************************************************
|
| 56 |
+
* *
|
| 57 |
+
* *
|
| 58 |
+
* *
|
| 59 |
+
*******************************************************************************/
|
| 60 |
+
|
| 61 |
+
#include "cuda_runtime_api.h"
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
/**
|
| 70 |
+
* \addtogroup CUDART_HIGHLEVEL
|
| 71 |
+
*
|
| 72 |
+
* @{
|
| 73 |
+
*/
|
| 74 |
+
|
| 75 |
+
/**
|
| 76 |
+
* \brief \hl Returns a channel descriptor using the specified format
|
| 77 |
+
*
|
| 78 |
+
* Returns a channel descriptor with format \p f and number of bits of each
|
| 79 |
+
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
|
| 80 |
+
* defined as:
|
| 81 |
+
* \code
|
| 82 |
+
struct cudaChannelFormatDesc {
|
| 83 |
+
int x, y, z, w;
|
| 84 |
+
enum cudaChannelFormatKind f;
|
| 85 |
+
};
|
| 86 |
+
* \endcode
|
| 87 |
+
*
|
| 88 |
+
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
|
| 89 |
+
* ::cudaChannelFormatKindUnsigned, ::cudaChannelFormatKindFloat,
|
| 90 |
+
* ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
|
| 91 |
+
* ::cudaChannelFormatKindSignedNormalized8X4,
|
| 92 |
+
* ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
|
| 93 |
+
* ::cudaChannelFormatKindUnsignedNormalized8X4,
|
| 94 |
+
* ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
|
| 95 |
+
* ::cudaChannelFormatKindSignedNormalized16X4,
|
| 96 |
+
* ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
|
| 97 |
+
* ::cudaChannelFormatKindUnsignedNormalized16X4,
|
| 98 |
+
* ::cudaChannelFormatKindUnsignedNormalized1010102
|
| 99 |
+
* or ::cudaChannelFormatKindNV12.
|
| 100 |
+
*
|
| 101 |
+
* The format is specified by the template specialization.
|
| 102 |
+
*
|
| 103 |
+
* The template function specializes for the following scalar types:
|
| 104 |
+
* char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
|
| 105 |
+
* The template function specializes for the following vector types:
|
| 106 |
+
* char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}.
|
| 107 |
+
* The template function specializes for the following cudaChannelFormatKind enum values:
|
| 108 |
+
* ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4},
|
| 109 |
+
* ::cudaChannelFormatKindUnsignedNormalized1010102
|
| 110 |
+
* and ::cudaChannelFormatKindNV12.
|
| 111 |
+
*
|
| 112 |
+
* Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone.
|
| 113 |
+
*
|
| 114 |
+
* \return
|
| 115 |
+
* Channel descriptor with format \p f
|
| 116 |
+
*
|
| 117 |
+
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
|
| 118 |
+
* ::cudaGetChannelDesc
|
| 119 |
+
*/
|
| 120 |
+
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
  /* Fallback for any T without an explicit specialization: every component
     width is zero and the kind is cudaChannelFormatKindNone. */
  const cudaChannelFormatKind kind = cudaChannelFormatKindNone;
  return cudaCreateChannelDesc(0, 0, 0, 0, kind);
}
|
| 124 |
+
|
| 125 |
+
/* Float-kind descriptor with a single x component of sizeof(unsigned short) * 8 bits. */
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
{
  const int bits = (int)sizeof(unsigned short) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
}
|
| 131 |
+
|
| 132 |
+
/* Float-kind descriptor, x only, sizeof(unsigned short) * 8 bits wide
   (same result as cudaCreateChannelDescHalf). */
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
{
  const int bits = (int)sizeof(unsigned short) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
}
|
| 138 |
+
|
| 139 |
+
/* Float-kind descriptor with x and y components, each sizeof(unsigned short) * 8 bits. */
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
{
  const int bits = (int)sizeof(unsigned short) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
}
|
| 145 |
+
|
| 146 |
+
/* Float-kind descriptor with all four components sizeof(unsigned short) * 8 bits wide. */
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
{
  const int bits = (int)sizeof(unsigned short) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
}
|
| 152 |
+
|
| 153 |
+
/* Specialization for plain char: one x component of sizeof(char) * 8 bits. The
   kind is unsigned when the compiler defines _CHAR_UNSIGNED or
   __CHAR_UNSIGNED__, and signed otherwise. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
{
  const int bits = (int)sizeof(char) * 8;

#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
  const cudaChannelFormatKind kind = cudaChannelFormatKindUnsigned;
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
  const cudaChannelFormatKind kind = cudaChannelFormatKindSigned;
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */

  return cudaCreateChannelDesc(bits, 0, 0, 0, kind);
}
|
| 163 |
+
|
| 164 |
+
/* Specialization for signed char: signed kind, x only, sizeof(signed char) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
{
  const int bits = (int)sizeof(signed char) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
| 170 |
+
|
| 171 |
+
/* Specialization for unsigned char: unsigned kind, x only, sizeof(unsigned char) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
{
  const int bits = (int)sizeof(unsigned char) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
| 177 |
+
|
| 178 |
+
/* Specialization for char1: signed kind, x only, sizeof(signed char) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
{
  const int bits = (int)sizeof(signed char) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
| 184 |
+
|
| 185 |
+
/* Specialization for uchar1: unsigned kind, x only, sizeof(unsigned char) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
{
  const int bits = (int)sizeof(unsigned char) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
| 191 |
+
|
| 192 |
+
/* Specialization for char2: signed kind, x and y each sizeof(signed char) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
{
  const int bits = (int)sizeof(signed char) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}
|
| 198 |
+
|
| 199 |
+
/* Specialization for uchar2: unsigned kind, x and y each sizeof(unsigned char) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
{
  const int bits = (int)sizeof(unsigned char) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}
|
| 205 |
+
|
| 206 |
+
/* Specialization for char4: signed kind, all four components sizeof(signed char) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
{
  const int bits = (int)sizeof(signed char) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}
|
| 212 |
+
|
| 213 |
+
/* Specialization for uchar4: unsigned kind, all four components sizeof(unsigned char) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
{
  const int bits = (int)sizeof(unsigned char) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
|
| 219 |
+
|
| 220 |
+
/* Specialization for short: signed kind, x only, sizeof(short) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
{
  const int bits = (int)sizeof(short) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
| 226 |
+
|
| 227 |
+
/* Specialization for unsigned short: unsigned kind, x only, sizeof(unsigned short) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
{
  const int bits = (int)sizeof(unsigned short) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
| 233 |
+
|
| 234 |
+
/* Specialization for short1: signed kind, x only, sizeof(short) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
{
  const int bits = (int)sizeof(short) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
| 240 |
+
|
| 241 |
+
/* Specialization for ushort1: unsigned kind, x only, sizeof(unsigned short) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
{
  const int bits = (int)sizeof(unsigned short) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
| 247 |
+
|
| 248 |
+
/* Specialization for short2: signed kind, x and y each sizeof(short) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
{
  const int bits = (int)sizeof(short) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}
|
| 254 |
+
|
| 255 |
+
/* Specialization for ushort2: unsigned kind, x and y each sizeof(unsigned short) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
{
  const int bits = (int)sizeof(unsigned short) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}
|
| 261 |
+
|
| 262 |
+
/* Specialization for short4: signed kind, all four components sizeof(short) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
{
  const int bits = (int)sizeof(short) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}
|
| 268 |
+
|
| 269 |
+
/* Specialization for ushort4: unsigned kind, all four components sizeof(unsigned short) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
{
  const int bits = (int)sizeof(unsigned short) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
|
| 275 |
+
|
| 276 |
+
/* Specialization for int: signed kind, x only, sizeof(int) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
{
  const int bits = (int)sizeof(int) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
| 282 |
+
|
| 283 |
+
/* Specialization for unsigned int: unsigned kind, x only, sizeof(unsigned int) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
{
  const int bits = (int)sizeof(unsigned int) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
| 289 |
+
|
| 290 |
+
/* Specialization for int1: signed kind, x only, sizeof(int) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
{
  const int bits = (int)sizeof(int) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
| 296 |
+
|
| 297 |
+
/* Specialization for uint1: unsigned kind, x only, sizeof(unsigned int) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
{
  const int bits = (int)sizeof(unsigned int) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
| 303 |
+
|
| 304 |
+
/* Specialization for int2: signed kind, x and y each sizeof(int) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
{
  const int bits = (int)sizeof(int) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}
|
| 310 |
+
|
| 311 |
+
/* Specialization for uint2: unsigned kind, x and y each sizeof(unsigned int) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
{
  const int bits = (int)sizeof(unsigned int) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}
|
| 317 |
+
|
| 318 |
+
/* Specialization for int4: signed kind, all four components sizeof(int) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
{
  const int bits = (int)sizeof(int) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}
|
| 324 |
+
|
| 325 |
+
/* Specialization for uint4: unsigned kind, all four components sizeof(unsigned int) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
{
  const int bits = (int)sizeof(unsigned int) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
|
| 331 |
+
|
| 332 |
+
#if !defined(__LP64__)
|
| 333 |
+
|
| 334 |
+
/* Specialization for long (compiled only when __LP64__ is not defined):
   signed kind, x only, sizeof(long) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
{
  const int bits = (int)sizeof(long) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
| 340 |
+
|
| 341 |
+
/* Specialization for unsigned long (compiled only when __LP64__ is not defined):
   unsigned kind, x only, sizeof(unsigned long) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
{
  const int bits = (int)sizeof(unsigned long) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
| 347 |
+
|
| 348 |
+
/* Specialization for long1 (compiled only when __LP64__ is not defined):
   signed kind, x only, sizeof(long) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
{
  const int bits = (int)sizeof(long) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
| 354 |
+
|
| 355 |
+
/* Specialization for ulong1 (compiled only when __LP64__ is not defined):
   unsigned kind, x only, sizeof(unsigned long) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
{
  const int bits = (int)sizeof(unsigned long) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
| 361 |
+
|
| 362 |
+
/* Specialization for long2 (compiled only when __LP64__ is not defined):
   signed kind, x and y each sizeof(long) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
{
  const int bits = (int)sizeof(long) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}
|
| 368 |
+
|
| 369 |
+
/* Specialization for ulong2 (compiled only when __LP64__ is not defined):
   unsigned kind, x and y each sizeof(unsigned long) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
{
  const int bits = (int)sizeof(unsigned long) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}
|
| 375 |
+
|
| 376 |
+
/* Specialization for long4 (compiled only when __LP64__ is not defined):
   signed kind, all four components sizeof(long) * 8 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
{
  const int bits = (int)sizeof(long) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}
|
| 382 |
+
|
| 383 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
|
| 384 |
+
{
|
| 385 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 386 |
+
|
| 387 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 388 |
+
}
|
| 389 |
+
|
| 390 |
+
#endif /* !__LP64__ */
|
| 391 |
+
|
| 392 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
|
| 393 |
+
{
|
| 394 |
+
int e = (int)sizeof(float) * 8;
|
| 395 |
+
|
| 396 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 397 |
+
}
|
| 398 |
+
|
| 399 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
|
| 400 |
+
{
|
| 401 |
+
int e = (int)sizeof(float) * 8;
|
| 402 |
+
|
| 403 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 404 |
+
}
|
| 405 |
+
|
| 406 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
|
| 407 |
+
{
|
| 408 |
+
int e = (int)sizeof(float) * 8;
|
| 409 |
+
|
| 410 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
|
| 411 |
+
}
|
| 412 |
+
|
| 413 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
|
| 414 |
+
{
|
| 415 |
+
int e = (int)sizeof(float) * 8;
|
| 416 |
+
|
| 417 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
|
| 418 |
+
}
|
| 419 |
+
|
| 420 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
|
| 421 |
+
{
|
| 422 |
+
int e = (int)sizeof(char) * 8;
|
| 423 |
+
|
| 424 |
+
return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12);
|
| 425 |
+
}
|
| 426 |
+
|
| 427 |
+
template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
|
| 428 |
+
{
|
| 429 |
+
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
|
| 430 |
+
}
|
| 431 |
+
|
| 432 |
+
/* Signed 8-bit normalized integer formats */
|
| 433 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
|
| 434 |
+
{
|
| 435 |
+
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
|
| 436 |
+
}
|
| 437 |
+
|
| 438 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
|
| 439 |
+
{
|
| 440 |
+
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
|
| 444 |
+
{
|
| 445 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
+
/* Unsigned 8-bit normalized integer formats */
|
| 449 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
|
| 450 |
+
{
|
| 451 |
+
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
|
| 455 |
+
{
|
| 456 |
+
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
|
| 460 |
+
{
|
| 461 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
|
| 462 |
+
}
|
| 463 |
+
|
| 464 |
+
/* Signed 16-bit normalized integer formats */
|
| 465 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
|
| 466 |
+
{
|
| 467 |
+
return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
|
| 468 |
+
}
|
| 469 |
+
|
| 470 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
|
| 471 |
+
{
|
| 472 |
+
return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
|
| 473 |
+
}
|
| 474 |
+
|
| 475 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
|
| 476 |
+
{
|
| 477 |
+
return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
|
| 478 |
+
}
|
| 479 |
+
|
| 480 |
+
/* Unsigned 16-bit normalized integer formats */
|
| 481 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
|
| 482 |
+
{
|
| 483 |
+
return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
|
| 484 |
+
}
|
| 485 |
+
|
| 486 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
|
| 487 |
+
{
|
| 488 |
+
return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
|
| 489 |
+
}
|
| 490 |
+
|
| 491 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
|
| 492 |
+
{
|
| 493 |
+
return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
/* NV12 format */
|
| 497 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
|
| 498 |
+
{
|
| 499 |
+
return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
/* Unsigned normalized 10:10:10:2 format (cudaChannelFormatKindUnsignedNormalized1010102) */
|
| 503 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized1010102>(void)
|
| 504 |
+
{
|
| 505 |
+
return cudaCreateChannelDesc(10, 10, 10, 2, cudaChannelFormatKindUnsignedNormalized1010102);
|
| 506 |
+
}
|
| 507 |
+
|
| 508 |
+
/* BC1 format */
|
| 509 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
|
| 510 |
+
{
|
| 511 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
|
| 512 |
+
}
|
| 513 |
+
|
| 514 |
+
/* BC1sRGB format */
|
| 515 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
|
| 516 |
+
{
|
| 517 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
|
| 518 |
+
}
|
| 519 |
+
|
| 520 |
+
/* BC2 format */
|
| 521 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
|
| 522 |
+
{
|
| 523 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
|
| 524 |
+
}
|
| 525 |
+
|
| 526 |
+
/* BC2sRGB format */
|
| 527 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
|
| 528 |
+
{
|
| 529 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
+
/* BC3 format */
|
| 533 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
|
| 534 |
+
{
|
| 535 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
|
| 536 |
+
}
|
| 537 |
+
|
| 538 |
+
/* BC3sRGB format */
|
| 539 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
|
| 540 |
+
{
|
| 541 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
|
| 542 |
+
}
|
| 543 |
+
|
| 544 |
+
/* BC4 unsigned format */
|
| 545 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
|
| 546 |
+
{
|
| 547 |
+
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
|
| 548 |
+
}
|
| 549 |
+
|
| 550 |
+
/* BC4 signed format */
|
| 551 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
|
| 552 |
+
{
|
| 553 |
+
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
|
| 554 |
+
}
|
| 555 |
+
|
| 556 |
+
/* BC5 unsigned format */
|
| 557 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
|
| 558 |
+
{
|
| 559 |
+
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
|
| 560 |
+
}
|
| 561 |
+
|
| 562 |
+
/* BC5 signed format */
|
| 563 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
|
| 564 |
+
{
|
| 565 |
+
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
|
| 566 |
+
}
|
| 567 |
+
|
| 568 |
+
/* BC6H unsigned format */
|
| 569 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
|
| 570 |
+
{
|
| 571 |
+
return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
|
| 572 |
+
}
|
| 573 |
+
|
| 574 |
+
/* BC6H signed format */
|
| 575 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
|
| 576 |
+
{
|
| 577 |
+
return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
|
| 578 |
+
}
|
| 579 |
+
|
| 580 |
+
/* BC7 format */
|
| 581 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
|
| 582 |
+
{
|
| 583 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
|
| 584 |
+
}
|
| 585 |
+
|
| 586 |
+
/* BC7sRGB format */
|
| 587 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
|
| 588 |
+
{
|
| 589 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
|
| 590 |
+
}
|
| 591 |
+
|
| 592 |
+
#endif /* __cplusplus */
|
| 593 |
+
|
| 594 |
+
/** @} */
|
| 595 |
+
/** @} */ /* END CUDART_TEXTURE_HL */
|
| 596 |
+
|
| 597 |
+
#endif /* !__CHANNEL_DESCRIPTOR_H__ */
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/common_functions.h
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#include "crt/common_functions.h"
|
| 61 |
+
|
| 62 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__)
|
| 63 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 64 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
|
| 65 |
+
#endif
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h
ADDED
|
@@ -0,0 +1,1743 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _COOPERATIVE_GROUPS_H_
|
| 51 |
+
#define _COOPERATIVE_GROUPS_H_
|
| 52 |
+
|
| 53 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 54 |
+
|
| 55 |
+
#include "cooperative_groups/details/info.h"
|
| 56 |
+
#include "cooperative_groups/details/driver_abi.h"
|
| 57 |
+
#include "cooperative_groups/details/helpers.h"
|
| 58 |
+
#include "cooperative_groups/details/memory.h"
|
| 59 |
+
|
| 60 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 61 |
+
#include <cuda/atomic>
|
| 62 |
+
#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
|
| 63 |
+
#else
|
| 64 |
+
#define _CG_THREAD_SCOPE(scope)
|
| 65 |
+
#endif
|
| 66 |
+
|
| 67 |
+
_CG_BEGIN_NAMESPACE
|
| 68 |
+
|
| 69 |
+
namespace details {
|
| 70 |
+
_CG_CONST_DECL unsigned int coalesced_group_id = 1;
|
| 71 |
+
_CG_CONST_DECL unsigned int multi_grid_group_id = 2;
|
| 72 |
+
_CG_CONST_DECL unsigned int grid_group_id = 3;
|
| 73 |
+
_CG_CONST_DECL unsigned int thread_block_id = 4;
|
| 74 |
+
_CG_CONST_DECL unsigned int multi_tile_group_id = 5;
|
| 75 |
+
_CG_CONST_DECL unsigned int cluster_group_id = 6;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
/**
|
| 79 |
+
* class thread_group;
|
| 80 |
+
*
|
| 81 |
+
* Generic thread group type, into which all groups are convertible.
|
| 82 |
+
* It acts as a container for all storage necessary for the derived groups,
|
| 83 |
+
* and will dispatch the API calls to the correct derived group. This means
|
| 84 |
+
* that all derived groups must implement the same interface as thread_group.
|
| 85 |
+
*/
|
| 86 |
+
class thread_group
|
| 87 |
+
{
|
| 88 |
+
protected:
|
| 89 |
+
struct group_data {
|
| 90 |
+
unsigned int _unused : 1;
|
| 91 |
+
unsigned int type : 7, : 0;
|
| 92 |
+
};
|
| 93 |
+
|
| 94 |
+
struct gg_data {
|
| 95 |
+
details::grid_workspace *gridWs;
|
| 96 |
+
};
|
| 97 |
+
|
| 98 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 99 |
+
struct mg_data {
|
| 100 |
+
unsigned long long _unused : 1;
|
| 101 |
+
unsigned long long type : 7;
|
| 102 |
+
unsigned long long handle : 56;
|
| 103 |
+
const details::multi_grid::multi_grid_functions *functions;
|
| 104 |
+
};
|
| 105 |
+
#endif
|
| 106 |
+
|
| 107 |
+
struct tg_data {
|
| 108 |
+
unsigned int is_tiled : 1;
|
| 109 |
+
unsigned int type : 7;
|
| 110 |
+
unsigned int size : 24;
|
| 111 |
+
// packed to 4b
|
| 112 |
+
unsigned int metaGroupSize : 16;
|
| 113 |
+
unsigned int metaGroupRank : 16;
|
| 114 |
+
// packed to 8b
|
| 115 |
+
unsigned int mask;
|
| 116 |
+
// packed to 12b
|
| 117 |
+
unsigned int _res;
|
| 118 |
+
};
|
| 119 |
+
|
| 120 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
|
| 121 |
+
friend class thread_block;
|
| 122 |
+
|
| 123 |
+
union __align__(8) {
|
| 124 |
+
group_data group;
|
| 125 |
+
tg_data coalesced;
|
| 126 |
+
gg_data grid;
|
| 127 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 128 |
+
mg_data multi_grid;
|
| 129 |
+
#endif
|
| 130 |
+
} _data;
|
| 131 |
+
|
| 132 |
+
_CG_QUALIFIER thread_group operator=(const thread_group& src);
|
| 133 |
+
|
| 134 |
+
_CG_QUALIFIER thread_group(unsigned int type) {
|
| 135 |
+
_data.group.type = type;
|
| 136 |
+
_data.group._unused = false;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
#ifdef _CG_CPP11_FEATURES
|
| 140 |
+
static_assert(sizeof(tg_data) <= 16, "Failed size check");
|
| 141 |
+
static_assert(sizeof(gg_data) <= 16, "Failed size check");
|
| 142 |
+
# ifdef _CG_ABI_EXPERIMENTAL
|
| 143 |
+
static_assert(sizeof(mg_data) <= 16, "Failed size check");
|
| 144 |
+
# endif
|
| 145 |
+
#endif
|
| 146 |
+
|
| 147 |
+
public:
|
| 148 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
|
| 149 |
+
|
| 150 |
+
_CG_QUALIFIER unsigned long long size() const;
|
| 151 |
+
_CG_QUALIFIER unsigned long long num_threads() const;
|
| 152 |
+
_CG_QUALIFIER unsigned long long thread_rank() const;
|
| 153 |
+
_CG_QUALIFIER void sync() const;
|
| 154 |
+
_CG_QUALIFIER unsigned int get_type() const {
|
| 155 |
+
return _data.group.type;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
};
|
| 159 |
+
|
| 160 |
+
template <unsigned int TyId>
|
| 161 |
+
struct thread_group_base : public thread_group {
|
| 162 |
+
_CG_QUALIFIER thread_group_base() : thread_group(TyId) {}
|
| 163 |
+
_CG_STATIC_CONST_DECL unsigned int id = TyId;
|
| 164 |
+
};
|
| 165 |
+
|
| 166 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP)
|
| 167 |
+
|
| 168 |
+
/**
|
| 169 |
+
* class multi_grid_group;
|
| 170 |
+
*
|
| 171 |
+
* Threads within this group are guaranteed to be co-resident on the
|
| 172 |
+
* same system, on multiple devices within the same launched kernels.
|
| 173 |
+
* To use this group, the kernel must have been launched with
|
| 174 |
+
* cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
|
| 175 |
+
* and the device must support it (queryable device attribute).
|
| 176 |
+
*
|
| 177 |
+
* Constructed via this_multi_grid();
|
| 178 |
+
*/
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 182 |
+
class multi_grid_group;
|
| 183 |
+
|
| 184 |
+
// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
|
| 185 |
+
template <typename = void>
|
| 186 |
+
__device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
|
| 187 |
+
|
| 188 |
+
class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
|
| 189 |
+
{
|
| 190 |
+
private:
|
| 191 |
+
template <typename = void>
|
| 192 |
+
_CG_QUALIFIER multi_grid_group() {
|
| 193 |
+
_data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
|
| 194 |
+
_data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
friend multi_grid_group this_multi_grid<void>();
|
| 198 |
+
|
| 199 |
+
public:
|
| 200 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
|
| 201 |
+
|
| 202 |
+
_CG_QUALIFIER bool is_valid() const {
|
| 203 |
+
return (_data.multi_grid.handle != 0);
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
_CG_QUALIFIER void sync() const {
|
| 207 |
+
if (!is_valid()) {
|
| 208 |
+
_CG_ABORT();
|
| 209 |
+
}
|
| 210 |
+
_data.multi_grid.functions->sync(_data.multi_grid.handle);
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
_CG_QUALIFIER unsigned long long num_threads() const {
|
| 214 |
+
_CG_ASSERT(is_valid());
|
| 215 |
+
return _data.multi_grid.functions->size(_data.multi_grid.handle);
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
_CG_QUALIFIER unsigned long long size() const {
|
| 219 |
+
return num_threads();
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
_CG_QUALIFIER unsigned long long thread_rank() const {
|
| 223 |
+
_CG_ASSERT(is_valid());
|
| 224 |
+
return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
_CG_QUALIFIER unsigned int grid_rank() const {
|
| 228 |
+
_CG_ASSERT(is_valid());
|
| 229 |
+
return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
_CG_QUALIFIER unsigned int num_grids() const {
|
| 233 |
+
_CG_ASSERT(is_valid());
|
| 234 |
+
return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
|
| 235 |
+
}
|
| 236 |
+
};
|
| 237 |
+
# else
|
| 238 |
+
class multi_grid_group
|
| 239 |
+
{
|
| 240 |
+
private:
|
| 241 |
+
unsigned long long _handle;
|
| 242 |
+
unsigned int _size;
|
| 243 |
+
unsigned int _rank;
|
| 244 |
+
|
| 245 |
+
friend _CG_QUALIFIER multi_grid_group this_multi_grid();
|
| 246 |
+
|
| 247 |
+
_CG_QUALIFIER multi_grid_group() {
|
| 248 |
+
_handle = details::multi_grid::get_intrinsic_handle();
|
| 249 |
+
_size = details::multi_grid::size(_handle);
|
| 250 |
+
_rank = details::multi_grid::thread_rank(_handle);
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
public:
|
| 254 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
|
| 255 |
+
|
| 256 |
+
_CG_QUALIFIER _CG_DEPRECATED bool is_valid() const {
|
| 257 |
+
return (_handle != 0);
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
_CG_QUALIFIER _CG_DEPRECATED void sync() const {
|
| 261 |
+
if (!is_valid()) {
|
| 262 |
+
_CG_ABORT();
|
| 263 |
+
}
|
| 264 |
+
details::multi_grid::sync(_handle);
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const {
|
| 268 |
+
_CG_ASSERT(is_valid());
|
| 269 |
+
return _size;
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const {
|
| 273 |
+
return num_threads();
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const {
|
| 277 |
+
_CG_ASSERT(is_valid());
|
| 278 |
+
return _rank;
|
| 279 |
+
}
|
| 280 |
+
|
| 281 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const {
|
| 282 |
+
_CG_ASSERT(is_valid());
|
| 283 |
+
return (details::multi_grid::grid_rank(_handle));
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const {
|
| 287 |
+
_CG_ASSERT(is_valid());
|
| 288 |
+
return (details::multi_grid::num_grids(_handle));
|
| 289 |
+
}
|
| 290 |
+
};
|
| 291 |
+
# endif
|
| 292 |
+
|
| 293 |
+
/**
|
| 294 |
+
* multi_grid_group this_multi_grid()
|
| 295 |
+
*
|
| 296 |
+
* Constructs a multi_grid_group
|
| 297 |
+
*/
|
| 298 |
+
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 299 |
+
template <typename>
|
| 300 |
+
__device__
|
| 301 |
+
#else
|
| 302 |
+
_CG_QUALIFIER
|
| 303 |
+
# endif
|
| 304 |
+
_CG_DEPRECATED
|
| 305 |
+
multi_grid_group this_multi_grid()
|
| 306 |
+
{
|
| 307 |
+
return multi_grid_group();
|
| 308 |
+
}
|
| 309 |
+
#endif
|
| 310 |
+
|
| 311 |
+
/**
|
| 312 |
+
* class grid_group;
|
| 313 |
+
*
|
| 314 |
+
* Threads within this this group are guaranteed to be co-resident on the
|
| 315 |
+
* same device within the same launched kernel. To use this group, the kernel
|
| 316 |
+
* must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
|
| 317 |
+
* and the device must support it (queryable device attribute).
|
| 318 |
+
*
|
| 319 |
+
* Constructed via this_grid();
|
| 320 |
+
*/
|
| 321 |
+
class grid_group : public thread_group_base<details::grid_group_id>
|
| 322 |
+
{
|
| 323 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
|
| 324 |
+
friend _CG_QUALIFIER grid_group this_grid();
|
| 325 |
+
|
| 326 |
+
private:
|
| 327 |
+
_CG_QUALIFIER grid_group(details::grid_workspace *gridWs) {
|
| 328 |
+
_data.grid.gridWs = gridWs;
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
public:
|
| 332 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
|
| 333 |
+
|
| 334 |
+
_CG_QUALIFIER bool is_valid() const {
|
| 335 |
+
return (_data.grid.gridWs != NULL);
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
_CG_QUALIFIER void sync() const {
|
| 339 |
+
if (!is_valid()) {
|
| 340 |
+
_CG_ABORT();
|
| 341 |
+
}
|
| 342 |
+
details::grid::sync(&_data.grid.gridWs->barrier);
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 346 |
+
using arrival_token = unsigned int;
|
| 347 |
+
|
| 348 |
+
_CG_QUALIFIER arrival_token barrier_arrive() const {
|
| 349 |
+
if (!is_valid()) {
|
| 350 |
+
_CG_ABORT();
|
| 351 |
+
}
|
| 352 |
+
return details::grid::barrier_arrive(&_data.grid.gridWs->barrier);
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
_CG_QUALIFIER void barrier_wait(arrival_token&& token) const {
|
| 356 |
+
details::grid::barrier_wait(token, &_data.grid.gridWs->barrier);
|
| 357 |
+
}
|
| 358 |
+
#endif
|
| 359 |
+
|
| 360 |
+
_CG_STATIC_QUALIFIER unsigned long long size() {
|
| 361 |
+
return details::grid::size();
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
_CG_STATIC_QUALIFIER dim3 group_dim() {
|
| 365 |
+
return details::grid::grid_dim();
|
| 366 |
+
}
|
| 367 |
+
|
| 368 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads() {
|
| 369 |
+
return details::grid::dim_threads();
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
_CG_STATIC_QUALIFIER unsigned long long num_threads() {
|
| 373 |
+
return details::grid::num_threads();
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
_CG_STATIC_QUALIFIER dim3 thread_index() {
|
| 377 |
+
return details::grid::thread_index();
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
_CG_STATIC_QUALIFIER unsigned long long thread_rank() {
|
| 381 |
+
return details::grid::thread_rank();
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks() {
|
| 385 |
+
return details::grid::dim_blocks();
|
| 386 |
+
}
|
| 387 |
+
|
| 388 |
+
_CG_STATIC_QUALIFIER unsigned long long num_blocks() {
|
| 389 |
+
return details::grid::num_blocks();
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
_CG_STATIC_QUALIFIER dim3 block_index() {
|
| 393 |
+
return details::grid::block_index();
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
_CG_STATIC_QUALIFIER unsigned long long block_rank() {
|
| 397 |
+
return details::grid::block_rank();
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
# if defined(_CG_HAS_CLUSTER_GROUP)
|
| 401 |
+
_CG_STATIC_QUALIFIER dim3 dim_clusters() {
|
| 402 |
+
return details::grid::dim_clusters();
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
_CG_STATIC_QUALIFIER unsigned long long num_clusters() {
|
| 406 |
+
return details::grid::num_clusters();
|
| 407 |
+
}
|
| 408 |
+
|
| 409 |
+
_CG_STATIC_QUALIFIER dim3 cluster_index() {
|
| 410 |
+
return details::grid::cluster_index();
|
| 411 |
+
}
|
| 412 |
+
|
| 413 |
+
_CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
|
| 414 |
+
return details::grid::cluster_rank();
|
| 415 |
+
}
|
| 416 |
+
# endif
|
| 417 |
+
};
|
| 418 |
+
|
| 419 |
+
/**
 * grid_group this_grid()
 *
 * Constructs a grid_group around the workspace returned by
 * details::get_grid_workspace(). In _CG_DEBUG builds the new group is
 * synchronised once before it is returned.
 */
_CG_QUALIFIER grid_group this_grid() {
    // Load a workspace from the driver
    details::grid_workspace *workspace = details::get_grid_workspace();
    grid_group grid(workspace);
#ifdef _CG_DEBUG
    // *all* threads must be available to synchronize
    grid.sync();
#endif // _CG_DEBUG
    return grid;
}
|
| 428 |
+
|
| 429 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 430 |
+
/**
|
| 431 |
+
* class cluster_group
|
| 432 |
+
*
|
| 433 |
+
* Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
|
| 434 |
+
* divided along all dimensions to form groups of blocks, each group of which is
|
| 435 |
+
* a block cluster. Clustered grids are subject to various restrictions and
|
| 436 |
+
* limitations. Primarily, a cluster consists of at most 8 blocks by default
|
| 437 |
+
* (although the user is allowed to opt-in to non-standard sizes,) and clustered
|
| 438 |
+
* grids are subject to additional occupancy limitations due to per-cluster
|
| 439 |
+
* hardware resource consumption. In exchange, a block cluster is guaranteed to
|
| 440 |
+
* be a cooperative group, with access to all cooperative group capabilities, as
|
| 441 |
+
* well as cluster specific capabilities and accelerations. A cluster_group
|
| 442 |
+
* represents a block cluster.
|
| 443 |
+
*
|
| 444 |
+
* Constructed via this_cluster_group();
|
| 445 |
+
*/
|
| 446 |
+
class cluster_group : public thread_group_base<details::cluster_group_id>
|
| 447 |
+
{
|
| 448 |
+
// Friends
|
| 449 |
+
friend _CG_QUALIFIER cluster_group this_cluster();
|
| 450 |
+
|
| 451 |
+
// Disable constructor
|
| 452 |
+
_CG_QUALIFIER cluster_group()
|
| 453 |
+
{
|
| 454 |
+
}
|
| 455 |
+
|
| 456 |
+
public:
|
| 457 |
+
//_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
|
| 458 |
+
|
| 459 |
+
using arrival_token = struct {};
|
| 460 |
+
|
| 461 |
+
// Functionality exposed by the group
|
| 462 |
+
_CG_STATIC_QUALIFIER void sync()
|
| 463 |
+
{
|
| 464 |
+
return details::cluster::sync();
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
_CG_STATIC_QUALIFIER arrival_token barrier_arrive()
|
| 468 |
+
{
|
| 469 |
+
details::cluster::barrier_arrive();
|
| 470 |
+
return arrival_token();
|
| 471 |
+
}
|
| 472 |
+
|
| 473 |
+
_CG_STATIC_QUALIFIER void barrier_wait()
|
| 474 |
+
{
|
| 475 |
+
return details::cluster::barrier_wait();
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
_CG_STATIC_QUALIFIER void barrier_wait(arrival_token&&)
|
| 479 |
+
{
|
| 480 |
+
return details::cluster::barrier_wait();
|
| 481 |
+
}
|
| 482 |
+
|
| 483 |
+
_CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
|
| 484 |
+
{
|
| 485 |
+
return details::cluster::query_shared_rank(addr);
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
template <typename T>
|
| 489 |
+
_CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
|
| 490 |
+
{
|
| 491 |
+
return details::cluster::map_shared_rank(addr, rank);
|
| 492 |
+
}
|
| 493 |
+
|
| 494 |
+
_CG_STATIC_QUALIFIER dim3 block_index()
|
| 495 |
+
{
|
| 496 |
+
return details::cluster::block_index();
|
| 497 |
+
}
|
| 498 |
+
|
| 499 |
+
_CG_STATIC_QUALIFIER unsigned int block_rank()
|
| 500 |
+
{
|
| 501 |
+
return details::cluster::block_rank();
|
| 502 |
+
}
|
| 503 |
+
|
| 504 |
+
_CG_STATIC_QUALIFIER dim3 thread_index()
|
| 505 |
+
{
|
| 506 |
+
return details::cluster::thread_index();
|
| 507 |
+
}
|
| 508 |
+
|
| 509 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank()
|
| 510 |
+
{
|
| 511 |
+
return details::cluster::thread_rank();
|
| 512 |
+
}
|
| 513 |
+
|
| 514 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks()
|
| 515 |
+
{
|
| 516 |
+
return details::cluster::dim_blocks();
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
_CG_STATIC_QUALIFIER unsigned int num_blocks()
|
| 520 |
+
{
|
| 521 |
+
return details::cluster::num_blocks();
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads()
|
| 525 |
+
{
|
| 526 |
+
return details::cluster::dim_threads();
|
| 527 |
+
}
|
| 528 |
+
|
| 529 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads()
|
| 530 |
+
{
|
| 531 |
+
return details::cluster::num_threads();
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
+
// Legacy aliases
|
| 535 |
+
_CG_STATIC_QUALIFIER unsigned int size()
|
| 536 |
+
{
|
| 537 |
+
return num_threads();
|
| 538 |
+
}
|
| 539 |
+
};
|
| 540 |
+
|
| 541 |
+
/*
|
| 542 |
+
* cluster_group this_cluster()
|
| 543 |
+
*
|
| 544 |
+
* Constructs a cluster_group
|
| 545 |
+
*/
|
| 546 |
+
_CG_QUALIFIER cluster_group this_cluster()
|
| 547 |
+
{
|
| 548 |
+
cluster_group cg;
|
| 549 |
+
#ifdef _CG_DEBUG
|
| 550 |
+
cg.sync();
|
| 551 |
+
#endif
|
| 552 |
+
return cg;
|
| 553 |
+
}
|
| 554 |
+
#endif
|
| 555 |
+
|
| 556 |
+
#if defined(_CG_CPP11_FEATURES)
// Forward declarations: thread_block befriends the scratch-taking
// this_thread_block overload, so both names must be visible beforehand.
class thread_block;

template <unsigned int MaxBlockSize>
_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
#endif
|
| 561 |
+
|
| 562 |
+
/**
|
| 563 |
+
* class thread_block
|
| 564 |
+
*
|
| 565 |
+
* Every GPU kernel is executed by a grid of thread blocks, and threads within
|
| 566 |
+
* each block are guaranteed to reside on the same streaming multiprocessor.
|
| 567 |
+
* A thread_block represents a thread block whose dimensions are not known until runtime.
|
| 568 |
+
*
|
| 569 |
+
* Constructed via this_thread_block();
|
| 570 |
+
*/
|
| 571 |
+
class thread_block : public thread_group_base<details::thread_block_id>
|
| 572 |
+
{
|
| 573 |
+
// Friends
|
| 574 |
+
friend _CG_QUALIFIER thread_block this_thread_block();
|
| 575 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
|
| 576 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
|
| 577 |
+
|
| 578 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 579 |
+
template <unsigned int MaxBlockSize>
|
| 580 |
+
friend _CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
|
| 581 |
+
template <unsigned int Size>
|
| 582 |
+
friend class __static_size_multi_warp_tile_base;
|
| 583 |
+
|
| 584 |
+
details::multi_warp_scratch* const tile_memory;
|
| 585 |
+
|
| 586 |
+
template <unsigned int MaxBlockSize>
|
| 587 |
+
_CG_QUALIFIER thread_block(block_tile_memory<MaxBlockSize>& scratch) :
|
| 588 |
+
tile_memory(details::get_scratch_ptr(&scratch)) {
|
| 589 |
+
#ifdef _CG_DEBUG
|
| 590 |
+
if (num_threads() > MaxBlockSize) {
|
| 591 |
+
details::abort();
|
| 592 |
+
}
|
| 593 |
+
#endif
|
| 594 |
+
|
| 595 |
+
|
| 596 |
+
#if defined(_CG_USER_PROVIDED_SHARED_MEMORY)
|
| 597 |
+
#define _CG_SKIP_BARRIER_INIT_TARGET NV_NO_TARGET
|
| 598 |
+
#else
|
| 599 |
+
#define _CG_SKIP_BARRIER_INIT_TARGET NV_PROVIDES_SM_80
|
| 600 |
+
#endif
|
| 601 |
+
NV_IF_ELSE_TARGET(
|
| 602 |
+
_CG_SKIP_BARRIER_INIT_TARGET,
|
| 603 |
+
// skip if clause
|
| 604 |
+
,
|
| 605 |
+
(tile_memory->init_barriers(thread_rank());
|
| 606 |
+
sync();)
|
| 607 |
+
)
|
| 608 |
+
}
|
| 609 |
+
#endif
|
| 610 |
+
#undef _CG_SKIP_BARRIER_INIT_TARGET
|
| 611 |
+
|
| 612 |
+
// Disable constructor
|
| 613 |
+
_CG_QUALIFIER thread_block()
|
| 614 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 615 |
+
: tile_memory(details::get_scratch_ptr(NULL))
|
| 616 |
+
#endif
|
| 617 |
+
{ }
|
| 618 |
+
|
| 619 |
+
// Internal Use
|
| 620 |
+
_CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
|
| 621 |
+
const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
|
| 622 |
+
|
| 623 |
+
// Invalid, immediately fail
|
| 624 |
+
if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
|
| 625 |
+
details::abort();
|
| 626 |
+
return (thread_block());
|
| 627 |
+
}
|
| 628 |
+
|
| 629 |
+
unsigned int mask;
|
| 630 |
+
unsigned int base_offset = thread_rank() & (~(tilesz - 1));
|
| 631 |
+
unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
|
| 632 |
+
|
| 633 |
+
mask = (unsigned int)(-1) >> (32 - masklength);
|
| 634 |
+
mask <<= (details::laneid() & ~(tilesz - 1));
|
| 635 |
+
thread_group tile = thread_group(details::coalesced_group_id);
|
| 636 |
+
tile._data.coalesced.mask = mask;
|
| 637 |
+
tile._data.coalesced.size = __popc(mask);
|
| 638 |
+
tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz;
|
| 639 |
+
tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
|
| 640 |
+
tile._data.coalesced.is_tiled = true;
|
| 641 |
+
return (tile);
|
| 642 |
+
}
|
| 643 |
+
|
| 644 |
+
public:
|
| 645 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
|
| 646 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
|
| 647 |
+
|
| 648 |
+
_CG_STATIC_QUALIFIER void sync() {
|
| 649 |
+
details::cta::sync();
|
| 650 |
+
}
|
| 651 |
+
|
| 652 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 653 |
+
struct arrival_token {};
|
| 654 |
+
|
| 655 |
+
_CG_QUALIFIER arrival_token barrier_arrive() const {
|
| 656 |
+
return arrival_token();
|
| 657 |
+
}
|
| 658 |
+
|
| 659 |
+
_CG_QUALIFIER void barrier_wait(arrival_token&&) const {
|
| 660 |
+
details::cta::sync();
|
| 661 |
+
}
|
| 662 |
+
#endif
|
| 663 |
+
|
| 664 |
+
_CG_STATIC_QUALIFIER unsigned int size() {
|
| 665 |
+
return details::cta::size();
|
| 666 |
+
}
|
| 667 |
+
|
| 668 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank() {
|
| 669 |
+
return details::cta::thread_rank();
|
| 670 |
+
}
|
| 671 |
+
|
| 672 |
+
// Additional functionality exposed by the group
|
| 673 |
+
_CG_STATIC_QUALIFIER dim3 group_index() {
|
| 674 |
+
return details::cta::group_index();
|
| 675 |
+
}
|
| 676 |
+
|
| 677 |
+
_CG_STATIC_QUALIFIER dim3 thread_index() {
|
| 678 |
+
return details::cta::thread_index();
|
| 679 |
+
}
|
| 680 |
+
|
| 681 |
+
_CG_STATIC_QUALIFIER dim3 group_dim() {
|
| 682 |
+
return details::cta::block_dim();
|
| 683 |
+
}
|
| 684 |
+
|
| 685 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads() {
|
| 686 |
+
return details::cta::dim_threads();
|
| 687 |
+
}
|
| 688 |
+
|
| 689 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads() {
|
| 690 |
+
return details::cta::num_threads();
|
| 691 |
+
}
|
| 692 |
+
|
| 693 |
+
};
|
| 694 |
+
|
| 695 |
+
/**
|
| 696 |
+
* thread_block this_thread_block()
|
| 697 |
+
*
|
| 698 |
+
* Constructs a thread_block group
|
| 699 |
+
*/
|
| 700 |
+
_CG_QUALIFIER thread_block this_thread_block()
|
| 701 |
+
{
|
| 702 |
+
return (thread_block());
|
| 703 |
+
}
|
| 704 |
+
|
| 705 |
+
#if defined(_CG_CPP11_FEATURES)
/**
 * thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch)
 *
 * Constructs a thread_block group backed by the caller-provided scratch
 * memory.
 */
template <unsigned int MaxBlockSize>
_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch) {
    return thread_block(scratch);
}
#endif
|
| 711 |
+
|
| 712 |
+
/**
|
| 713 |
+
* class coalesced_group
|
| 714 |
+
*
|
| 715 |
+
* A group representing the current set of converged threads in a warp.
|
| 716 |
+
* The size of the group is not guaranteed and it may return a group of
|
| 717 |
+
* only one thread (itself).
|
| 718 |
+
*
|
| 719 |
+
* This group exposes warp-synchronous builtins.
|
| 720 |
+
* Constructed via coalesced_threads();
|
| 721 |
+
*/
|
| 722 |
+
class coalesced_group : public thread_group_base<details::coalesced_group_id>
|
| 723 |
+
{
|
| 724 |
+
private:
|
| 725 |
+
friend _CG_QUALIFIER coalesced_group coalesced_threads();
|
| 726 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
|
| 727 |
+
friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
|
| 728 |
+
friend class details::_coalesced_group_data_access;
|
| 729 |
+
|
| 730 |
+
_CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
|
| 731 |
+
unsigned int member_pack = 0;
|
| 732 |
+
unsigned int member_rank = 0;
|
| 733 |
+
for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
|
| 734 |
+
unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
|
| 735 |
+
if (lane_bit) {
|
| 736 |
+
if (laneMask & lane_bit)
|
| 737 |
+
member_pack |= 1 << member_rank;
|
| 738 |
+
member_rank++;
|
| 739 |
+
}
|
| 740 |
+
}
|
| 741 |
+
return (member_pack);
|
| 742 |
+
}
|
| 743 |
+
|
| 744 |
+
// Internal Use
|
| 745 |
+
_CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
|
| 746 |
+
const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
|
| 747 |
+
|
| 748 |
+
// Invalid, immediately fail
|
| 749 |
+
if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
|
| 750 |
+
details::abort();
|
| 751 |
+
return (coalesced_group(0));
|
| 752 |
+
}
|
| 753 |
+
if (size() <= tilesz) {
|
| 754 |
+
return (*this);
|
| 755 |
+
}
|
| 756 |
+
|
| 757 |
+
if ((_data.coalesced.is_tiled == true) && pow2_tilesz) {
|
| 758 |
+
unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
|
| 759 |
+
unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
|
| 760 |
+
unsigned int mask = (unsigned int)(-1) >> (32 - masklength);
|
| 761 |
+
|
| 762 |
+
mask <<= (details::laneid() & ~(tilesz - 1));
|
| 763 |
+
coalesced_group coalesced_tile = coalesced_group(mask);
|
| 764 |
+
coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz;
|
| 765 |
+
coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
|
| 766 |
+
coalesced_tile._data.coalesced.is_tiled = true;
|
| 767 |
+
return (coalesced_tile);
|
| 768 |
+
}
|
| 769 |
+
else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) {
|
| 770 |
+
unsigned int mask = 0;
|
| 771 |
+
unsigned int member_rank = 0;
|
| 772 |
+
int seen_lanes = (thread_rank() / tilesz) * tilesz;
|
| 773 |
+
for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
|
| 774 |
+
unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
|
| 775 |
+
if (lane_bit) {
|
| 776 |
+
if (seen_lanes <= 0 && member_rank < tilesz) {
|
| 777 |
+
mask |= lane_bit;
|
| 778 |
+
member_rank++;
|
| 779 |
+
}
|
| 780 |
+
seen_lanes--;
|
| 781 |
+
}
|
| 782 |
+
}
|
| 783 |
+
coalesced_group coalesced_tile = coalesced_group(mask);
|
| 784 |
+
// Override parent with the size of this group
|
| 785 |
+
coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz;
|
| 786 |
+
coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
|
| 787 |
+
return coalesced_tile;
|
| 788 |
+
}
|
| 789 |
+
else {
|
| 790 |
+
// None in _CG_VERSION 1000
|
| 791 |
+
details::abort();
|
| 792 |
+
}
|
| 793 |
+
|
| 794 |
+
return (coalesced_group(0));
|
| 795 |
+
}
|
| 796 |
+
|
| 797 |
+
protected:
|
| 798 |
+
_CG_QUALIFIER coalesced_group(unsigned int mask) {
|
| 799 |
+
_data.coalesced.mask = mask;
|
| 800 |
+
_data.coalesced.size = __popc(mask);
|
| 801 |
+
_data.coalesced.metaGroupRank = 0;
|
| 802 |
+
_data.coalesced.metaGroupSize = 1;
|
| 803 |
+
_data.coalesced.is_tiled = false;
|
| 804 |
+
}
|
| 805 |
+
|
| 806 |
+
_CG_QUALIFIER unsigned int get_mask() const {
|
| 807 |
+
return (_data.coalesced.mask);
|
| 808 |
+
}
|
| 809 |
+
|
| 810 |
+
public:
|
| 811 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
|
| 812 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
|
| 813 |
+
|
| 814 |
+
_CG_QUALIFIER unsigned int num_threads() const {
|
| 815 |
+
return _data.coalesced.size;
|
| 816 |
+
}
|
| 817 |
+
|
| 818 |
+
_CG_QUALIFIER unsigned int size() const {
|
| 819 |
+
return num_threads();
|
| 820 |
+
}
|
| 821 |
+
|
| 822 |
+
_CG_QUALIFIER unsigned int thread_rank() const {
|
| 823 |
+
return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
|
| 824 |
+
}
|
| 825 |
+
|
| 826 |
+
// Rank of this group in the upper level of the hierarchy
|
| 827 |
+
_CG_QUALIFIER unsigned int meta_group_rank() const {
|
| 828 |
+
return _data.coalesced.metaGroupRank;
|
| 829 |
+
}
|
| 830 |
+
|
| 831 |
+
// Total num partitions created out of all CTAs when the group was created
|
| 832 |
+
_CG_QUALIFIER unsigned int meta_group_size() const {
|
| 833 |
+
return _data.coalesced.metaGroupSize;
|
| 834 |
+
}
|
| 835 |
+
|
| 836 |
+
_CG_QUALIFIER void sync() const {
|
| 837 |
+
__syncwarp(_data.coalesced.mask);
|
| 838 |
+
}
|
| 839 |
+
|
| 840 |
+
#ifdef _CG_CPP11_FEATURES
|
| 841 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 842 |
+
_CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
|
| 843 |
+
unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 :
|
| 844 |
+
(size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1));
|
| 845 |
+
|
| 846 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 847 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
|
| 848 |
+
}
|
| 849 |
+
|
| 850 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 851 |
+
_CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
|
| 852 |
+
if (size() == 32) {
|
| 853 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_down(
|
| 854 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
|
| 855 |
+
}
|
| 856 |
+
|
| 857 |
+
unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
|
| 858 |
+
|
| 859 |
+
if (lane >= 32)
|
| 860 |
+
lane = details::laneid();
|
| 861 |
+
|
| 862 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 863 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
|
| 864 |
+
}
|
| 865 |
+
|
| 866 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 867 |
+
_CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
|
| 868 |
+
if (size() == 32) {
|
| 869 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_up(
|
| 870 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
|
| 871 |
+
}
|
| 872 |
+
|
| 873 |
+
unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
|
| 874 |
+
if (lane >= 32)
|
| 875 |
+
lane = details::laneid();
|
| 876 |
+
|
| 877 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 878 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
|
| 879 |
+
}
|
| 880 |
+
#else
|
| 881 |
+
template <typename TyIntegral>
|
| 882 |
+
_CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {
|
| 883 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 884 |
+
unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
|
| 885 |
+
(size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
|
| 886 |
+
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
|
| 887 |
+
}
|
| 888 |
+
|
| 889 |
+
template <typename TyIntegral>
|
| 890 |
+
_CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
|
| 891 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 892 |
+
if (size() == 32) {
|
| 893 |
+
return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
|
| 894 |
+
}
|
| 895 |
+
unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
|
| 896 |
+
if (lane >= 32) lane = details::laneid();
|
| 897 |
+
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
|
| 898 |
+
}
|
| 899 |
+
|
| 900 |
+
template <typename TyIntegral>
|
| 901 |
+
_CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
|
| 902 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 903 |
+
if (size() == 32) {
|
| 904 |
+
return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
|
| 905 |
+
}
|
| 906 |
+
unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
|
| 907 |
+
if (lane >= 32) lane = details::laneid();
|
| 908 |
+
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
|
| 909 |
+
}
|
| 910 |
+
#endif
|
| 911 |
+
|
| 912 |
+
_CG_QUALIFIER int any(int predicate) const {
|
| 913 |
+
return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
|
| 914 |
+
}
|
| 915 |
+
_CG_QUALIFIER int all(int predicate) const {
|
| 916 |
+
return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
|
| 917 |
+
}
|
| 918 |
+
_CG_QUALIFIER unsigned int ballot(int predicate) const {
|
| 919 |
+
if (size() == 32) {
|
| 920 |
+
return (__ballot_sync(0xFFFFFFFF, predicate));
|
| 921 |
+
}
|
| 922 |
+
unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
|
| 923 |
+
return (_packLanes(lane_ballot));
|
| 924 |
+
}
|
| 925 |
+
|
| 926 |
+
#ifdef _CG_HAS_MATCH_COLLECTIVE
|
| 927 |
+
|
| 928 |
+
template <typename TyIntegral>
|
| 929 |
+
_CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
|
| 930 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 931 |
+
if (size() == 32) {
|
| 932 |
+
return (__match_any_sync(0xFFFFFFFF, val));
|
| 933 |
+
}
|
| 934 |
+
unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
|
| 935 |
+
return (_packLanes(lane_match));
|
| 936 |
+
}
|
| 937 |
+
|
| 938 |
+
template <typename TyIntegral>
|
| 939 |
+
_CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
|
| 940 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 941 |
+
if (size() == 32) {
|
| 942 |
+
return (__match_all_sync(0xFFFFFFFF, val, &pred));
|
| 943 |
+
}
|
| 944 |
+
unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
|
| 945 |
+
return (_packLanes(lane_match));
|
| 946 |
+
}
|
| 947 |
+
|
| 948 |
+
#endif /* !_CG_HAS_MATCH_COLLECTIVE */
|
| 949 |
+
|
| 950 |
+
};
|
| 951 |
+
|
| 952 |
+
/**
 * coalesced_group coalesced_threads()
 *
 * Constructs a coalesced_group from the mask returned by __activemask().
 */
_CG_QUALIFIER coalesced_group coalesced_threads()
{
    const unsigned int active_lanes = __activemask();
    return coalesced_group(active_lanes);
}
|
| 956 |
+
|
| 957 |
+
namespace details {
    // Tile-size whitelist: only the sizes specialised below provide the nested
    // OK typedef; the primary template is declared but never defined.
    template <unsigned int Size> struct verify_thread_block_tile_size;
    template <> struct verify_thread_block_tile_size<1>  { typedef void OK; };
    template <> struct verify_thread_block_tile_size<2>  { typedef void OK; };
    template <> struct verify_thread_block_tile_size<4>  { typedef void OK; };
    template <> struct verify_thread_block_tile_size<8>  { typedef void OK; };
    template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
    template <> struct verify_thread_block_tile_size<32> { typedef void OK; };

#ifdef _CG_CPP11_FEATURES
    // True when Size has at most one bit set (note: also true for Size == 0).
    template <unsigned int Size>
    using _is_power_of_2 =
        _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>;

    // True for sizes of up to 32 threads.
    template <unsigned int Size>
    using _is_single_warp =
        _CG_STL_NAMESPACE::integral_constant<bool, (Size <= 32)>;

    // True for sizes above 32 and up to 1024 threads.
    template <unsigned int Size>
    using _is_multi_warp =
        _CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;

    // Power-of-two size that fits the single-warp range.
    template <unsigned int Size>
    using _is_valid_single_warp_tile =
        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;

    // Power-of-two size that fits the multi-warp range.
    template <unsigned int Size>
    using _is_valid_multi_warp_tile =
        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
#else
    // Without C++11 support no size is treated as multi-warp.
    template <unsigned int Size>
    struct _is_multi_warp {
        static const bool value = false;
    };
#endif
}
|
| 989 |
+
|
| 990 |
+
template <unsigned int Size>
|
| 991 |
+
class __static_size_tile_base
|
| 992 |
+
{
|
| 993 |
+
protected:
|
| 994 |
+
_CG_STATIC_CONST_DECL unsigned int numThreads = Size;
|
| 995 |
+
|
| 996 |
+
public:
|
| 997 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
|
| 998 |
+
|
| 999 |
+
// Rank of thread within tile
|
| 1000 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank() {
|
| 1001 |
+
return (details::cta::thread_rank() & (numThreads - 1));
|
| 1002 |
+
}
|
| 1003 |
+
|
| 1004 |
+
// Number of threads within tile
|
| 1005 |
+
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
|
| 1006 |
+
return numThreads;
|
| 1007 |
+
}
|
| 1008 |
+
|
| 1009 |
+
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {
|
| 1010 |
+
return num_threads();
|
| 1011 |
+
}
|
| 1012 |
+
};
|
| 1013 |
+
|
| 1014 |
+
template <unsigned int Size>
|
| 1015 |
+
class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
|
| 1016 |
+
{
|
| 1017 |
+
friend class details::_coalesced_group_data_access;
|
| 1018 |
+
typedef details::tile::tile_helpers<Size> th;
|
| 1019 |
+
|
| 1020 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1021 |
+
static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
|
| 1022 |
+
#else
|
| 1023 |
+
typedef typename details::verify_thread_block_tile_size<Size>::OK valid;
|
| 1024 |
+
#endif
|
| 1025 |
+
using __static_size_tile_base<Size>::numThreads;
|
| 1026 |
+
_CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;
|
| 1027 |
+
|
| 1028 |
+
protected:
|
| 1029 |
+
_CG_STATIC_QUALIFIER unsigned int build_mask() {
|
| 1030 |
+
unsigned int mask = fullMask;
|
| 1031 |
+
if (numThreads != 32) {
|
| 1032 |
+
// [0,31] representing the current active thread in the warp
|
| 1033 |
+
unsigned int laneId = details::laneid();
|
| 1034 |
+
// shift mask according to the partition it belongs to
|
| 1035 |
+
mask = th::tileMask << (laneId & ~(th::laneMask));
|
| 1036 |
+
}
|
| 1037 |
+
return (mask);
|
| 1038 |
+
}
|
| 1039 |
+
|
| 1040 |
+
public:
|
| 1041 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
|
| 1042 |
+
|
| 1043 |
+
_CG_STATIC_QUALIFIER void sync() {
|
| 1044 |
+
__syncwarp(build_mask());
|
| 1045 |
+
}
|
| 1046 |
+
|
| 1047 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1048 |
+
// PTX supported collectives
|
| 1049 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1050 |
+
_CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
|
| 1051 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 1052 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
|
| 1053 |
+
}
|
| 1054 |
+
|
| 1055 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1056 |
+
_CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
|
| 1057 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_down(
|
| 1058 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
|
| 1059 |
+
}
|
| 1060 |
+
|
| 1061 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1062 |
+
_CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
|
| 1063 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_up(
|
| 1064 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
|
| 1065 |
+
}
|
| 1066 |
+
|
| 1067 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1068 |
+
_CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
|
| 1069 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
|
| 1070 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
|
| 1071 |
+
}
|
| 1072 |
+
#else
|
| 1073 |
+
template <typename TyIntegral>
|
| 1074 |
+
_CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {
|
| 1075 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1076 |
+
return (__shfl_sync(build_mask(), var, srcRank, numThreads));
|
| 1077 |
+
}
|
| 1078 |
+
|
| 1079 |
+
template <typename TyIntegral>
|
| 1080 |
+
_CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
|
| 1081 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1082 |
+
return (__shfl_down_sync(build_mask(), var, delta, numThreads));
|
| 1083 |
+
}
|
| 1084 |
+
|
| 1085 |
+
template <typename TyIntegral>
|
| 1086 |
+
_CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
|
| 1087 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1088 |
+
return (__shfl_up_sync(build_mask(), var, delta, numThreads));
|
| 1089 |
+
}
|
| 1090 |
+
|
| 1091 |
+
template <typename TyIntegral>
|
| 1092 |
+
_CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
|
| 1093 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1094 |
+
return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
|
| 1095 |
+
}
|
| 1096 |
+
#endif //_CG_CPP11_FEATURES
|
| 1097 |
+
|
| 1098 |
+
_CG_QUALIFIER int any(int predicate) const {
|
| 1099 |
+
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
|
| 1100 |
+
return (lane_ballot != 0);
|
| 1101 |
+
}
|
| 1102 |
+
_CG_QUALIFIER int all(int predicate) const {
|
| 1103 |
+
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
|
| 1104 |
+
return (lane_ballot == build_mask());
|
| 1105 |
+
}
|
| 1106 |
+
_CG_QUALIFIER unsigned int ballot(int predicate) const {
|
| 1107 |
+
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
|
| 1108 |
+
return (lane_ballot >> (details::laneid() & (~(th::laneMask))));
|
| 1109 |
+
}
|
| 1110 |
+
|
| 1111 |
+
#ifdef _CG_HAS_MATCH_COLLECTIVE
|
| 1112 |
+
template <typename TyIntegral>
|
| 1113 |
+
_CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
|
| 1114 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1115 |
+
unsigned int lane_match = __match_any_sync(build_mask(), val);
|
| 1116 |
+
return (lane_match >> (details::laneid() & (~(th::laneMask))));
|
| 1117 |
+
}
|
| 1118 |
+
|
| 1119 |
+
template <typename TyIntegral>
|
| 1120 |
+
_CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
|
| 1121 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1122 |
+
unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
|
| 1123 |
+
return (lane_match >> (details::laneid() & (~(th::laneMask))));
|
| 1124 |
+
}
|
| 1125 |
+
#endif
|
| 1126 |
+
|
| 1127 |
+
};
|
| 1128 |
+
|
| 1129 |
+
template <unsigned int Size, typename ParentT>
|
| 1130 |
+
class __static_parent_thread_block_tile_base
|
| 1131 |
+
{
|
| 1132 |
+
public:
|
| 1133 |
+
// Rank of this group in the upper level of the hierarchy
|
| 1134 |
+
_CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
|
| 1135 |
+
return ParentT::thread_rank() / Size;
|
| 1136 |
+
}
|
| 1137 |
+
|
| 1138 |
+
// Total num partitions created out of all CTAs when the group was created
|
| 1139 |
+
_CG_STATIC_QUALIFIER unsigned int meta_group_size() {
|
| 1140 |
+
return (ParentT::size() + Size - 1) / Size;
|
| 1141 |
+
}
|
| 1142 |
+
};
|
| 1143 |
+
|
| 1144 |
+
/**
|
| 1145 |
+
* class thread_block_tile<unsigned int Size, ParentT = void>
|
| 1146 |
+
*
|
| 1147 |
+
* Statically-sized group type, representing one tile of a thread block.
|
| 1148 |
+
* The only specializations currently supported are those with native
|
| 1149 |
+
* hardware support (1/2/4/8/16/32)
|
| 1150 |
+
*
|
| 1151 |
+
* This group exposes warp-synchronous builtins.
|
| 1152 |
+
* Can only be constructed via tiled_partition<Size>(ParentT&)
|
| 1153 |
+
*/
|
| 1154 |
+
|
| 1155 |
+
template <unsigned int Size, typename ParentT = void>
|
| 1156 |
+
class __single_warp_thread_block_tile :
|
| 1157 |
+
public __static_size_thread_block_tile_base<Size>,
|
| 1158 |
+
public __static_parent_thread_block_tile_base<Size, ParentT>
|
| 1159 |
+
{
|
| 1160 |
+
typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
|
| 1161 |
+
friend class details::_coalesced_group_data_access;
|
| 1162 |
+
|
| 1163 |
+
protected:
|
| 1164 |
+
_CG_QUALIFIER __single_warp_thread_block_tile() { };
|
| 1165 |
+
_CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { };
|
| 1166 |
+
|
| 1167 |
+
_CG_STATIC_QUALIFIER unsigned int get_mask() {
|
| 1168 |
+
return __static_size_thread_block_tile_base<Size>::build_mask();
|
| 1169 |
+
}
|
| 1170 |
+
};
|
| 1171 |
+
|
| 1172 |
+
template <unsigned int Size>
|
| 1173 |
+
class __single_warp_thread_block_tile<Size, void> :
|
| 1174 |
+
public __static_size_thread_block_tile_base<Size>,
|
| 1175 |
+
public thread_group_base<details::coalesced_group_id>
|
| 1176 |
+
{
|
| 1177 |
+
_CG_STATIC_CONST_DECL unsigned int numThreads = Size;
|
| 1178 |
+
|
| 1179 |
+
template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
|
| 1180 |
+
friend class details::_coalesced_group_data_access;
|
| 1181 |
+
|
| 1182 |
+
typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;
|
| 1183 |
+
|
| 1184 |
+
protected:
|
| 1185 |
+
_CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank = 0, unsigned int meta_group_size = 1) {
|
| 1186 |
+
_data.coalesced.mask = staticSizeBaseT::build_mask();
|
| 1187 |
+
_data.coalesced.size = numThreads;
|
| 1188 |
+
_data.coalesced.metaGroupRank = meta_group_rank;
|
| 1189 |
+
_data.coalesced.metaGroupSize = meta_group_size;
|
| 1190 |
+
_data.coalesced.is_tiled = true;
|
| 1191 |
+
}
|
| 1192 |
+
|
| 1193 |
+
_CG_QUALIFIER unsigned int get_mask() const {
|
| 1194 |
+
return (_data.coalesced.mask);
|
| 1195 |
+
}
|
| 1196 |
+
|
| 1197 |
+
public:
|
| 1198 |
+
using staticSizeBaseT::sync;
|
| 1199 |
+
using staticSizeBaseT::size;
|
| 1200 |
+
using staticSizeBaseT::num_threads;
|
| 1201 |
+
using staticSizeBaseT::thread_rank;
|
| 1202 |
+
|
| 1203 |
+
_CG_QUALIFIER unsigned int meta_group_rank() const {
|
| 1204 |
+
return _data.coalesced.metaGroupRank;
|
| 1205 |
+
}
|
| 1206 |
+
|
| 1207 |
+
_CG_QUALIFIER unsigned int meta_group_size() const {
|
| 1208 |
+
return _data.coalesced.metaGroupSize;
|
| 1209 |
+
}
|
| 1210 |
+
};
|
| 1211 |
+
|
| 1212 |
+
/**
|
| 1213 |
+
* Outer level API calls
|
| 1214 |
+
* void sync(GroupT) - see <group_type>.sync()
|
| 1215 |
+
* thread_rank(GroupT) - returns <group_type>.thread_rank()
|
| 1216 |
+
* group_size(GroupT) - returns <group_type>.num_threads()
|
| 1217 |
+
*/
|
| 1218 |
+
template <class GroupT>
|
| 1219 |
+
_CG_QUALIFIER void sync(GroupT const &g)
|
| 1220 |
+
{
|
| 1221 |
+
g.sync();
|
| 1222 |
+
}
|
| 1223 |
+
|
| 1224 |
+
// TODO: Use a static dispatch to determine appropriate return type
|
| 1225 |
+
// C++03 is stuck with unsigned long long for now
|
| 1226 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1227 |
+
template <class GroupT>
|
| 1228 |
+
_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {
|
| 1229 |
+
return g.thread_rank();
|
| 1230 |
+
}
|
| 1231 |
+
|
| 1232 |
+
|
| 1233 |
+
template <class GroupT>
|
| 1234 |
+
_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) {
|
| 1235 |
+
return g.num_threads();
|
| 1236 |
+
}
|
| 1237 |
+
#else
|
| 1238 |
+
template <class GroupT>
|
| 1239 |
+
_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {
|
| 1240 |
+
return static_cast<unsigned long long>(g.thread_rank());
|
| 1241 |
+
}
|
| 1242 |
+
|
| 1243 |
+
|
| 1244 |
+
template <class GroupT>
|
| 1245 |
+
_CG_QUALIFIER unsigned long long group_size(GroupT const &g) {
|
| 1246 |
+
return static_cast<unsigned long long>(g.num_threads());
|
| 1247 |
+
}
|
| 1248 |
+
#endif
|
| 1249 |
+
|
| 1250 |
+
|
| 1251 |
+
/**
|
| 1252 |
+
* tiled_partition
|
| 1253 |
+
*
|
| 1254 |
+
* The tiled_partition(parent, tilesz) method is a collective operation that
|
| 1255 |
+
* partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
|
| 1256 |
+
*
|
| 1257 |
+
* A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
|
| 1258 |
+
* be created where threads having identical k = (thread_rank(parent)/tilesz)
|
| 1259 |
+
* will be members of the same subgroup.
|
| 1260 |
+
*
|
| 1261 |
+
* The implementation may cause the calling thread to wait until all the members
|
| 1262 |
+
* of the parent group have invoked the operation before resuming execution.
|
| 1263 |
+
*
|
| 1264 |
+
* Functionality is limited to power-of-two sized subgroup instances of at most
|
| 1265 |
+
* 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
|
| 1266 |
+
* tiled_partition() in _CG_VERSION 1000.
|
| 1267 |
+
*/
|
| 1268 |
+
// Generic overload: dispatches on the runtime group type. A coalesced parent
// is partitioned as a coalesced_group; every other parent is treated as a
// thread_block.
_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
{
    const bool parent_is_coalesced = (parent.get_type() == details::coalesced_group_id);
    if (!parent_is_coalesced) {
        const thread_block *as_block = static_cast<const thread_block*>(&parent);
        return as_block->_get_tiled_threads(tilesz);
    }
    const coalesced_group *as_coalesced = static_cast<const coalesced_group*>(&parent);
    return as_coalesced->_get_tiled_threads(tilesz);
}
|
| 1279 |
+
|
| 1280 |
+
// Thread block type overload: returns a basic thread_group for now (may be specialized later)
|
| 1281 |
+
_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
|
| 1282 |
+
{
|
| 1283 |
+
return (parent._get_tiled_threads(tilesz));
|
| 1284 |
+
}
|
| 1285 |
+
|
| 1286 |
+
// Coalesced group type overload: retains its ability to stay coalesced
|
| 1287 |
+
_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
|
| 1288 |
+
{
|
| 1289 |
+
return (parent._get_tiled_threads(tilesz));
|
| 1290 |
+
}
|
| 1291 |
+
|
| 1292 |
+
namespace details {
|
| 1293 |
+
template <unsigned int Size, typename ParentT>
|
| 1294 |
+
class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};
|
| 1295 |
+
|
| 1296 |
+
template <unsigned int Size, typename ParentT>
|
| 1297 |
+
_CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {
|
| 1298 |
+
return internal_thread_block_tile<Size, ParentT>();
|
| 1299 |
+
}
|
| 1300 |
+
|
| 1301 |
+
template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
|
| 1302 |
+
_CG_QUALIFIER TyVal multi_warp_collectives_helper(
|
| 1303 |
+
const GroupT& group,
|
| 1304 |
+
WarpLambda warp_lambda,
|
| 1305 |
+
InterWarpLambda inter_warp_lambda) {
|
| 1306 |
+
return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
|
| 1307 |
+
}
|
| 1308 |
+
|
| 1309 |
+
template <typename T, typename GroupT>
|
| 1310 |
+
_CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {
|
| 1311 |
+
return group.template get_scratch_location<T>(warp_id);
|
| 1312 |
+
}
|
| 1313 |
+
|
| 1314 |
+
template <typename GroupT>
|
| 1315 |
+
_CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {
|
| 1316 |
+
return group.get_sync_location();
|
| 1317 |
+
}
|
| 1318 |
+
|
| 1319 |
+
}
|
| 1320 |
+
/**
|
| 1321 |
+
* tiled_partition<tilesz>
|
| 1322 |
+
*
|
| 1323 |
+
* The tiled_partition<tilesz>(parent) method is a collective operation that
|
| 1324 |
+
* partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
|
| 1325 |
+
*
|
| 1326 |
+
* A total of (size(parent)/tilesz) subgroups will be created,
|
| 1327 |
+
* therefore the parent group size must be evenly divisible by the tilesz.
|
| 1328 |
+
* The allowed parent groups are thread_block or thread_block_tile<size>.
|
| 1329 |
+
*
|
| 1330 |
+
* The implementation may cause the calling thread to wait until all the members
|
| 1331 |
+
* of the parent group have invoked the operation before resuming execution.
|
| 1332 |
+
*
|
| 1333 |
+
* Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
|
| 1334 |
+
* The size(parent) must be greater than the template Size parameter
|
| 1335 |
+
* otherwise the results are undefined.
|
| 1336 |
+
*/
|
| 1337 |
+
|
| 1338 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 1339 |
+
template <unsigned int Size>
|
| 1340 |
+
class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
|
| 1341 |
+
{
|
| 1342 |
+
static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512");
|
| 1343 |
+
|
| 1344 |
+
template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
|
| 1345 |
+
friend __device__ TyVal details::multi_warp_collectives_helper(
|
| 1346 |
+
const GroupT& group,
|
| 1347 |
+
WarpLambda warp_lambda,
|
| 1348 |
+
InterWarpLambda inter_warp_lambda);
|
| 1349 |
+
template <typename T, typename GroupT>
|
| 1350 |
+
friend __device__ T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
|
| 1351 |
+
template <typename GroupT>
|
| 1352 |
+
friend __device__ details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
|
| 1353 |
+
template <unsigned int OtherSize>
|
| 1354 |
+
friend class __static_size_multi_warp_tile_base;
|
| 1355 |
+
using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
|
| 1356 |
+
using ThisType = __static_size_multi_warp_tile_base<Size>;
|
| 1357 |
+
_CG_STATIC_CONST_DECL int numWarps = Size / 32;
|
| 1358 |
+
|
| 1359 |
+
protected:
|
| 1360 |
+
details::multi_warp_scratch* const tile_memory;
|
| 1361 |
+
|
| 1362 |
+
template <typename GroupT>
|
| 1363 |
+
_CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : tile_memory(g.tile_memory) {
|
| 1364 |
+
#if !defined(_CG_USER_PROVIDED_SHARED_MEMORY)
|
| 1365 |
+
NV_IF_TARGET(NV_PROVIDES_SM_80,
|
| 1366 |
+
details::sync_warps_reset(get_sync_location(), details::cta::thread_rank());
|
| 1367 |
+
g.sync();
|
| 1368 |
+
)
|
| 1369 |
+
#endif
|
| 1370 |
+
}
|
| 1371 |
+
|
| 1372 |
+
|
| 1373 |
+
private:
|
| 1374 |
+
_CG_QUALIFIER details::barrier_t* get_sync_location() const {
|
| 1375 |
+
// Different group sizes use different barriers, all groups of a given size share one barrier.
|
| 1376 |
+
unsigned int sync_id = details::log2(Size / 64);
|
| 1377 |
+
return &tile_memory->barriers[sync_id];
|
| 1378 |
+
}
|
| 1379 |
+
|
| 1380 |
+
template <typename T>
|
| 1381 |
+
_CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {
|
| 1382 |
+
unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;
|
| 1383 |
+
return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
|
| 1384 |
+
}
|
| 1385 |
+
|
| 1386 |
+
template <typename T>
|
| 1387 |
+
_CG_QUALIFIER T* get_scratch_location() const {
|
| 1388 |
+
unsigned int scratch_id = details::cta::thread_rank() / 32;
|
| 1389 |
+
return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
|
| 1390 |
+
}
|
| 1391 |
+
|
| 1392 |
+
template <typename TyVal>
|
| 1393 |
+
_CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {
|
| 1394 |
+
unsigned int src_warp = src / 32;
|
| 1395 |
+
auto warp = details::tiled_partition_internal<32, ThisType>();
|
| 1396 |
+
details::barrier_t* sync_location = get_sync_location();
|
| 1397 |
+
|
| 1398 |
+
// Get warp slot of the source threads warp.
|
| 1399 |
+
TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
|
| 1400 |
+
|
| 1401 |
+
if (warp.meta_group_rank() == src_warp) {
|
| 1402 |
+
warp.sync();
|
| 1403 |
+
// Put shuffled value into my warp slot and let my warp arrive at the barrier.
|
| 1404 |
+
if (thread_rank() == src) {
|
| 1405 |
+
*warp_scratch_location = val;
|
| 1406 |
+
}
|
| 1407 |
+
details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
|
| 1408 |
+
TyVal result = *warp_scratch_location;
|
| 1409 |
+
details::sync_warps_wait(sync_location, details::cta::thread_rank());
|
| 1410 |
+
return result;
|
| 1411 |
+
}
|
| 1412 |
+
else {
|
| 1413 |
+
// Wait for the source warp to arrive on the barrier.
|
| 1414 |
+
details::sync_warps_wait_for_specific_warp(sync_location,
|
| 1415 |
+
(details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp));
|
| 1416 |
+
TyVal result = *warp_scratch_location;
|
| 1417 |
+
details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
|
| 1418 |
+
return result;
|
| 1419 |
+
}
|
| 1420 |
+
}
|
| 1421 |
+
|
| 1422 |
+
template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
|
| 1423 |
+
_CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
|
| 1424 |
+
static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
|
| 1425 |
+
"Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
|
| 1426 |
+
auto warp = details::tiled_partition_internal<32, ThisType>();
|
| 1427 |
+
details::barrier_t* sync_location = get_sync_location();
|
| 1428 |
+
TyVal* warp_scratch_location = get_scratch_location<TyVal>();
|
| 1429 |
+
|
| 1430 |
+
warp_lambda(warp, warp_scratch_location);
|
| 1431 |
+
|
| 1432 |
+
if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {
|
| 1433 |
+
auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
|
| 1434 |
+
if (subwarp.meta_group_rank() == 0) {
|
| 1435 |
+
TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());
|
| 1436 |
+
inter_warp_lambda(subwarp, thread_scratch_location);
|
| 1437 |
+
}
|
| 1438 |
+
warp.sync();
|
| 1439 |
+
details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
|
| 1440 |
+
}
|
| 1441 |
+
TyVal result = *warp_scratch_location;
|
| 1442 |
+
return result;
|
| 1443 |
+
}
|
| 1444 |
+
|
| 1445 |
+
public:
|
| 1446 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
|
| 1447 |
+
|
| 1448 |
+
using __static_size_tile_base<Size>::thread_rank;
|
| 1449 |
+
|
| 1450 |
+
template <typename TyVal>
|
| 1451 |
+
_CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {
|
| 1452 |
+
static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
|
| 1453 |
+
"Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
|
| 1454 |
+
return shfl_impl(val, src);
|
| 1455 |
+
}
|
| 1456 |
+
|
| 1457 |
+
_CG_QUALIFIER void sync() const {
|
| 1458 |
+
details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
|
| 1459 |
+
}
|
| 1460 |
+
|
| 1461 |
+
_CG_QUALIFIER int any(int predicate) const {
|
| 1462 |
+
auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
|
| 1463 |
+
*warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);
|
| 1464 |
+
};
|
| 1465 |
+
auto inter_warp_lambda =
|
| 1466 |
+
[] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
|
| 1467 |
+
*thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
|
| 1468 |
+
};
|
| 1469 |
+
return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
|
| 1470 |
+
}
|
| 1471 |
+
|
| 1472 |
+
_CG_QUALIFIER int all(int predicate) const {
|
| 1473 |
+
auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
|
| 1474 |
+
*warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);
|
| 1475 |
+
};
|
| 1476 |
+
auto inter_warp_lambda =
|
| 1477 |
+
[] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
|
| 1478 |
+
*thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
|
| 1479 |
+
};
|
| 1480 |
+
return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
|
| 1481 |
+
}
|
| 1482 |
+
};
|
| 1483 |
+
|
| 1484 |
+
|
| 1485 |
+
template <unsigned int Size, typename ParentT = void>
|
| 1486 |
+
class __multi_warp_thread_block_tile :
|
| 1487 |
+
public __static_size_multi_warp_tile_base<Size>,
|
| 1488 |
+
public __static_parent_thread_block_tile_base<Size, ParentT>
|
| 1489 |
+
{
|
| 1490 |
+
typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
|
| 1491 |
+
typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
|
| 1492 |
+
protected:
|
| 1493 |
+
_CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) :
|
| 1494 |
+
__static_size_multi_warp_tile_base<Size>(g) {}
|
| 1495 |
+
};
|
| 1496 |
+
|
| 1497 |
+
template <unsigned int Size>
|
| 1498 |
+
class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
|
| 1499 |
+
{
|
| 1500 |
+
const unsigned int metaGroupRank;
|
| 1501 |
+
const unsigned int metaGroupSize;
|
| 1502 |
+
|
| 1503 |
+
protected:
|
| 1504 |
+
template <unsigned int OtherSize, typename ParentT>
|
| 1505 |
+
_CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :
|
| 1506 |
+
__static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
|
| 1507 |
+
|
| 1508 |
+
public:
|
| 1509 |
+
_CG_QUALIFIER unsigned int meta_group_rank() const {
|
| 1510 |
+
return metaGroupRank;
|
| 1511 |
+
}
|
| 1512 |
+
|
| 1513 |
+
_CG_QUALIFIER unsigned int meta_group_size() const {
|
| 1514 |
+
return metaGroupSize;
|
| 1515 |
+
}
|
| 1516 |
+
};
|
| 1517 |
+
#endif
|
| 1518 |
+
|
| 1519 |
+
template <unsigned int Size, typename ParentT = void>
|
| 1520 |
+
class thread_block_tile;
|
| 1521 |
+
|
| 1522 |
+
namespace details {
|
| 1523 |
+
template <unsigned int Size, typename ParentT, bool IsMultiWarp>
|
| 1524 |
+
class thread_block_tile_impl;
|
| 1525 |
+
|
| 1526 |
+
template <unsigned int Size, typename ParentT>
|
| 1527 |
+
class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
|
| 1528 |
+
{
|
| 1529 |
+
protected:
|
| 1530 |
+
template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
|
| 1531 |
+
_CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :
|
| 1532 |
+
__single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}
|
| 1533 |
+
|
| 1534 |
+
_CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :
|
| 1535 |
+
__single_warp_thread_block_tile<Size, ParentT>() {}
|
| 1536 |
+
};
|
| 1537 |
+
|
| 1538 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 1539 |
+
template <unsigned int Size, typename ParentT>
|
| 1540 |
+
class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
|
| 1541 |
+
{
|
| 1542 |
+
protected:
|
| 1543 |
+
template <typename GroupT>
|
| 1544 |
+
_CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
|
| 1545 |
+
__multi_warp_thread_block_tile<Size, ParentT>(g) {}
|
| 1546 |
+
};
|
| 1547 |
+
#else
|
| 1548 |
+
template <unsigned int Size, typename ParentT>
|
| 1549 |
+
class thread_block_tile_impl<Size, ParentT, true>
|
| 1550 |
+
{
|
| 1551 |
+
protected:
|
| 1552 |
+
template <typename GroupT>
|
| 1553 |
+
_CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
|
| 1554 |
+
};
|
| 1555 |
+
#endif
|
| 1556 |
+
}
|
| 1557 |
+
|
| 1558 |
+
template <unsigned int Size, typename ParentT>
|
| 1559 |
+
class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
|
| 1560 |
+
{
|
| 1561 |
+
friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();
|
| 1562 |
+
|
| 1563 |
+
protected:
|
| 1564 |
+
_CG_QUALIFIER thread_block_tile(const ParentT& g) :
|
| 1565 |
+
details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
|
| 1566 |
+
|
| 1567 |
+
public:
|
| 1568 |
+
_CG_QUALIFIER operator thread_block_tile<Size, void>() const {
|
| 1569 |
+
return thread_block_tile<Size, void>(*this);
|
| 1570 |
+
}
|
| 1571 |
+
};
|
| 1572 |
+
|
| 1573 |
+
template <unsigned int Size>
|
| 1574 |
+
class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
|
| 1575 |
+
{
|
| 1576 |
+
template <unsigned int, typename ParentT>
|
| 1577 |
+
friend class thread_block_tile;
|
| 1578 |
+
|
| 1579 |
+
protected:
|
| 1580 |
+
template <unsigned int OtherSize, typename OtherParentT>
|
| 1581 |
+
_CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :
|
| 1582 |
+
details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
|
| 1583 |
+
|
| 1584 |
+
public:
|
| 1585 |
+
template <typename ParentT>
|
| 1586 |
+
_CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :
|
| 1587 |
+
details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
|
| 1588 |
+
};
|
| 1589 |
+
|
| 1590 |
+
namespace details {
|
| 1591 |
+
template <unsigned int Size, typename ParentT>
|
| 1592 |
+
struct tiled_partition_impl;
|
| 1593 |
+
|
| 1594 |
+
template <unsigned int Size>
|
| 1595 |
+
struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
|
| 1596 |
+
_CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
|
| 1597 |
+
thread_block_tile<Size, thread_block>(g) {}
|
| 1598 |
+
};
|
| 1599 |
+
|
| 1600 |
+
// ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
|
| 1601 |
+
template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
|
| 1602 |
+
struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
|
| 1603 |
+
public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
|
| 1604 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1605 |
+
static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size");
|
| 1606 |
+
#endif
|
| 1607 |
+
_CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
|
| 1608 |
+
thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
|
| 1609 |
+
};
|
| 1610 |
+
|
| 1611 |
+
}
|
| 1612 |
+
|
| 1613 |
+
template <unsigned int Size, typename ParentT>
|
| 1614 |
+
_CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
|
| 1615 |
+
{
|
| 1616 |
+
return details::tiled_partition_impl<Size, ParentT>(g);
|
| 1617 |
+
}
|
| 1618 |
+
|
| 1619 |
+
/**
|
| 1620 |
+
* thread_group this_thread()
|
| 1621 |
+
*
|
| 1622 |
+
* Constructs a generic thread_group containing only the calling thread
|
| 1623 |
+
*/
|
| 1624 |
+
_CG_QUALIFIER thread_block_tile<1, void> this_thread()
|
| 1625 |
+
{
|
| 1626 |
+
// Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its
|
| 1627 |
+
// meta group rank and size set to 0 and 1 respectively.
|
| 1628 |
+
return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block());
|
| 1629 |
+
}
|
| 1630 |
+
|
| 1631 |
+
/**
|
| 1632 |
+
* <group_type>.sync()
|
| 1633 |
+
*
|
| 1634 |
+
* Executes a barrier across the group
|
| 1635 |
+
*
|
| 1636 |
+
* Implements both a compiler fence and an architectural fence to prevent,
|
| 1637 |
+
* memory reordering around the barrier.
|
| 1638 |
+
*/
|
| 1639 |
+
_CG_QUALIFIER void thread_group::sync() const
|
| 1640 |
+
{
|
| 1641 |
+
switch (_data.group.type) {
|
| 1642 |
+
case details::coalesced_group_id:
|
| 1643 |
+
cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
|
| 1644 |
+
break;
|
| 1645 |
+
case details::thread_block_id:
|
| 1646 |
+
cooperative_groups::sync(*static_cast<const thread_block*>(this));
|
| 1647 |
+
break;
|
| 1648 |
+
case details::grid_group_id:
|
| 1649 |
+
cooperative_groups::sync(*static_cast<const grid_group*>(this));
|
| 1650 |
+
break;
|
| 1651 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 1652 |
+
case details::multi_grid_group_id:
|
| 1653 |
+
cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
|
| 1654 |
+
break;
|
| 1655 |
+
#endif
|
| 1656 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 1657 |
+
case details::cluster_group_id:
|
| 1658 |
+
cooperative_groups::sync(*static_cast<const cluster_group*>(this));
|
| 1659 |
+
break;
|
| 1660 |
+
#endif
|
| 1661 |
+
default:
|
| 1662 |
+
break;
|
| 1663 |
+
}
|
| 1664 |
+
}
|
| 1665 |
+
|
| 1666 |
+
/**
|
| 1667 |
+
* <group_type>.size()
|
| 1668 |
+
*
|
| 1669 |
+
* Returns the total number of threads in the group.
|
| 1670 |
+
*/
|
| 1671 |
+
_CG_QUALIFIER unsigned long long thread_group::size() const
|
| 1672 |
+
{
|
| 1673 |
+
unsigned long long size = 0;
|
| 1674 |
+
switch (_data.group.type) {
|
| 1675 |
+
case details::coalesced_group_id:
|
| 1676 |
+
size = cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
|
| 1677 |
+
break;
|
| 1678 |
+
case details::thread_block_id:
|
| 1679 |
+
size = cooperative_groups::group_size(*static_cast<const thread_block*>(this));
|
| 1680 |
+
break;
|
| 1681 |
+
case details::grid_group_id:
|
| 1682 |
+
size = cooperative_groups::group_size(*static_cast<const grid_group*>(this));
|
| 1683 |
+
break;
|
| 1684 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 1685 |
+
case details::multi_grid_group_id:
|
| 1686 |
+
size = cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
|
| 1687 |
+
break;
|
| 1688 |
+
#endif
|
| 1689 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 1690 |
+
case details::cluster_group_id:
|
| 1691 |
+
size = cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
|
| 1692 |
+
break;
|
| 1693 |
+
#endif
|
| 1694 |
+
default:
|
| 1695 |
+
break;
|
| 1696 |
+
}
|
| 1697 |
+
return size;
|
| 1698 |
+
}
|
| 1699 |
+
|
| 1700 |
+
/**
|
| 1701 |
+
* <group_type>.thread_rank()
|
| 1702 |
+
*
|
| 1703 |
+
* Returns the linearized rank of the calling thread along the interval [0, size()).
|
| 1704 |
+
*/
|
| 1705 |
+
_CG_QUALIFIER unsigned long long thread_group::thread_rank() const
|
| 1706 |
+
{
|
| 1707 |
+
unsigned long long rank = 0;
|
| 1708 |
+
switch (_data.group.type) {
|
| 1709 |
+
case details::coalesced_group_id:
|
| 1710 |
+
rank = cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
|
| 1711 |
+
break;
|
| 1712 |
+
case details::thread_block_id:
|
| 1713 |
+
rank = cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
|
| 1714 |
+
break;
|
| 1715 |
+
case details::grid_group_id:
|
| 1716 |
+
rank = cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
|
| 1717 |
+
break;
|
| 1718 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 1719 |
+
case details::multi_grid_group_id:
|
| 1720 |
+
rank = cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
|
| 1721 |
+
break;
|
| 1722 |
+
#endif
|
| 1723 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 1724 |
+
case details::cluster_group_id:
|
| 1725 |
+
rank = cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
|
| 1726 |
+
break;
|
| 1727 |
+
#endif
|
| 1728 |
+
default:
|
| 1729 |
+
break;
|
| 1730 |
+
}
|
| 1731 |
+
return rank;
|
| 1732 |
+
}
|
| 1733 |
+
|
| 1734 |
+
_CG_END_NAMESPACE
|
| 1735 |
+
|
| 1736 |
+
#include <cooperative_groups/details/partitioning.h>
|
| 1737 |
+
#if (!defined(_MSC_VER) || defined(_WIN64))
|
| 1738 |
+
# include <cooperative_groups/details/invoke.h>
|
| 1739 |
+
#endif
|
| 1740 |
+
|
| 1741 |
+
# endif /* ! (__cplusplus, __CUDACC__) */
|
| 1742 |
+
|
| 1743 |
+
#endif /* !_COOPERATIVE_GROUPS_H_ */
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/async.h
ADDED
|
@@ -0,0 +1,452 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_ASYNC_H
|
| 50 |
+
#define _CG_ASYNC_H
|
| 51 |
+
|
| 52 |
+
#include "helpers.h"
|
| 53 |
+
#include "info.h"
|
| 54 |
+
|
| 55 |
+
#include <cuda_pipeline.h>
|
| 56 |
+
|
| 57 |
+
_CG_BEGIN_NAMESPACE
|
| 58 |
+
|
| 59 |
+
namespace details {
|
| 60 |
+
// Groups supported by memcpy_async
|
| 61 |
+
template <class TyGroup>
|
| 62 |
+
struct _async_copy_group_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 63 |
+
|
| 64 |
+
template <unsigned int Sz, typename TyPar>
|
| 65 |
+
struct _async_copy_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>>
|
| 66 |
+
: public _CG_STL_NAMESPACE::true_type {};
|
| 67 |
+
template <>
|
| 68 |
+
struct _async_copy_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
|
| 69 |
+
template <>
|
| 70 |
+
struct _async_copy_group_supported<cooperative_groups::thread_block> : public _CG_STL_NAMESPACE::true_type {};
|
| 71 |
+
|
| 72 |
+
template <class TyGroup>
|
| 73 |
+
using async_copy_group_supported = _async_copy_group_supported<details::remove_qual<TyGroup>>;
|
| 74 |
+
|
| 75 |
+
// Groups that require optimization
|
| 76 |
+
template <class TyGroup>
|
| 77 |
+
struct _async_copy_optimize_tile : public _CG_STL_NAMESPACE::false_type {};
|
| 78 |
+
|
| 79 |
+
template <typename TyPar>
|
| 80 |
+
struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<1, TyPar>>
|
| 81 |
+
: public _CG_STL_NAMESPACE::false_type {};
|
| 82 |
+
|
| 83 |
+
template <unsigned int Sz, typename TyPar>
|
| 84 |
+
struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<Sz, TyPar>>
|
| 85 |
+
: public _CG_STL_NAMESPACE::true_type {};
|
| 86 |
+
|
| 87 |
+
template <class TyGroup>
|
| 88 |
+
using async_copy_optimize_tile = _async_copy_optimize_tile<details::remove_qual<TyGroup>>;
|
| 89 |
+
|
| 90 |
+
// SFINAE helpers for tile optimizations
|
| 91 |
+
template <class TyGroup>
|
| 92 |
+
using enable_tile_optimization =
|
| 93 |
+
typename _CG_STL_NAMESPACE::enable_if<async_copy_optimize_tile<TyGroup>::value, void *>::type;
|
| 94 |
+
|
| 95 |
+
template <class TyGroup>
|
| 96 |
+
using disable_tile_optimization =
|
| 97 |
+
typename _CG_STL_NAMESPACE::enable_if<!async_copy_optimize_tile<TyGroup>::value, void *>::type;
|
| 98 |
+
|
| 99 |
+
// Segment for punning to aligned types
|
| 100 |
+
template <unsigned int N>
|
| 101 |
+
struct _Segment {
|
| 102 |
+
int _seg[N];
|
| 103 |
+
};
|
| 104 |
+
|
| 105 |
+
// Trivial layout guaranteed-aligned copy-async compatible segments
|
| 106 |
+
template <unsigned int N>
|
| 107 |
+
struct Segment;
|
| 108 |
+
template <>
|
| 109 |
+
struct __align__(4) Segment<1> : public _Segment<1>{};
|
| 110 |
+
template <>
|
| 111 |
+
struct __align__(8) Segment<2> : public _Segment<2>{};
|
| 112 |
+
template <>
|
| 113 |
+
struct __align__(16) Segment<4> : public _Segment<4>{};
|
| 114 |
+
|
| 115 |
+
// Interleaved element by element copies from source to dest
|
| 116 |
+
template <typename TyGroup, typename TyElem>
|
| 117 |
+
_CG_STATIC_QUALIFIER void inline_copy(TyGroup &group, TyElem *__restrict__ dst, const TyElem *__restrict__ src,
|
| 118 |
+
size_t count) {
|
| 119 |
+
const unsigned int rank = group.thread_rank();
|
| 120 |
+
const unsigned int stride = group.size();
|
| 121 |
+
|
| 122 |
+
for (size_t idx = rank; idx < count; idx += stride) {
|
| 123 |
+
dst[idx] = src[idx];
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
template <typename TyGroup, typename TyElem, enable_tile_optimization<TyGroup> = nullptr>
|
| 128 |
+
_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
|
| 129 |
+
const TyElem *__restrict__ src, size_t count) {
|
| 130 |
+
static_assert(async_copy_group_supported<TyGroup>::value,
|
| 131 |
+
"Async copy is only supported for groups that represent private shared memory");
|
| 132 |
+
|
| 133 |
+
if (count == 0) {
|
| 134 |
+
return;
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
const bool dstIsNotShared = !__isShared(dst);
|
| 138 |
+
const bool srcIsNotGlobal = !__isGlobal(src);
|
| 139 |
+
|
| 140 |
+
if (dstIsNotShared || srcIsNotGlobal) {
|
| 141 |
+
inline_copy(group, dst, src, count);
|
| 142 |
+
return;
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
const unsigned int stride = group.size();
|
| 146 |
+
const unsigned int rank = group.thread_rank();
|
| 147 |
+
// Efficient copies require warps to operate on the same amount of work at each step.
|
| 148 |
+
// remainders are handled in a separate stage to prevent branching
|
| 149 |
+
const unsigned int subWarpMask = (stride - 1);
|
| 150 |
+
const unsigned int subwarpCopies = (subWarpMask & (unsigned int)count);
|
| 151 |
+
const unsigned int maxSubwarpRank = min(rank, subwarpCopies - 1);
|
| 152 |
+
|
| 153 |
+
const size_t warpCopies = (count & (~subWarpMask));
|
| 154 |
+
|
| 155 |
+
for (size_t idx = 0; idx < warpCopies; idx += stride) {
|
| 156 |
+
size_t _srcIdx = rank + idx;
|
| 157 |
+
size_t _dstIdx = rank + idx;
|
| 158 |
+
__pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
if (subwarpCopies) {
|
| 162 |
+
size_t _srcIdx = warpCopies + maxSubwarpRank;
|
| 163 |
+
size_t _dstIdx = warpCopies + maxSubwarpRank;
|
| 164 |
+
__pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
|
| 165 |
+
}
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
template <typename TyGroup, typename TyElem, disable_tile_optimization<TyGroup> = nullptr>
|
| 169 |
+
_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
|
| 170 |
+
const TyElem *__restrict__ src, size_t count) {
|
| 171 |
+
static_assert(async_copy_group_supported<TyGroup>::value,
|
| 172 |
+
"Async copy is only supported for groups that represent private shared memory");
|
| 173 |
+
|
| 174 |
+
const bool dstIsNotShared = !__isShared(dst);
|
| 175 |
+
const bool srcIsNotGlobal = !__isGlobal(src);
|
| 176 |
+
|
| 177 |
+
if (dstIsNotShared || srcIsNotGlobal) {
|
| 178 |
+
inline_copy(group, dst, src, count);
|
| 179 |
+
return;
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
unsigned int stride = group.size();
|
| 183 |
+
unsigned int rank = group.thread_rank();
|
| 184 |
+
|
| 185 |
+
for (size_t idx = rank; idx < count; idx += stride) {
|
| 186 |
+
size_t _srcIdx = idx;
|
| 187 |
+
size_t _dstIdx = idx;
|
| 188 |
+
__pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
|
| 189 |
+
}
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
// Determine best possible alignment given an input and initial conditions
|
| 193 |
+
// Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments
|
| 194 |
+
template <unsigned int MinAlignment, unsigned int MaxAlignment>
|
| 195 |
+
_CG_STATIC_QUALIFIER uint32_t find_best_alignment(void *__restrict__ dst, const void *__restrict__ src) {
|
| 196 |
+
// Narrowing conversion intentional
|
| 197 |
+
uint32_t base1 = (uint32_t) reinterpret_cast<uintptr_t>(src);
|
| 198 |
+
uint32_t base2 = (uint32_t) reinterpret_cast<uintptr_t>(dst);
|
| 199 |
+
|
| 200 |
+
uint32_t diff = ((base1) ^ (base2)) & (MaxAlignment - 1);
|
| 201 |
+
|
| 202 |
+
// range [MaxAlignment, alignof(elem)], step: x >> 1
|
| 203 |
+
// over range of possible alignments, choose best available out of range
|
| 204 |
+
uint32_t out = MaxAlignment;
|
| 205 |
+
#pragma unroll
|
| 206 |
+
for (uint32_t alignment = (MaxAlignment >> 1); alignment >= MinAlignment; alignment >>= 1) {
|
| 207 |
+
if (alignment & diff)
|
| 208 |
+
out = alignment;
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
return out;
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
// Determine best possible alignment given an input and initial conditions
|
| 215 |
+
// Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments
|
| 216 |
+
template <typename TyType, typename TyGroup>
|
| 217 |
+
_CG_STATIC_QUALIFIER void copy_like(const TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
|
| 218 |
+
size_t count) {
|
| 219 |
+
const char *src = reinterpret_cast<const char *>(_src);
|
| 220 |
+
char *dst = reinterpret_cast<char *>(_dst);
|
| 221 |
+
|
| 222 |
+
constexpr uint32_t targetAlignment = (uint32_t)alignof(TyType);
|
| 223 |
+
|
| 224 |
+
uint32_t base = (uint32_t) reinterpret_cast<uintptr_t>(src);
|
| 225 |
+
uint32_t alignOffset = ((~base) + 1) & (targetAlignment - 1);
|
| 226 |
+
|
| 227 |
+
inline_copy(group, dst, src, alignOffset);
|
| 228 |
+
count -= alignOffset;
|
| 229 |
+
src += alignOffset;
|
| 230 |
+
dst += alignOffset;
|
| 231 |
+
|
| 232 |
+
// Copy using the best available alignment, async_copy expects n-datums, not bytes
|
| 233 |
+
size_t asyncCount = count / sizeof(TyType);
|
| 234 |
+
accelerated_async_copy(group, reinterpret_cast<TyType *>(dst), reinterpret_cast<const TyType *>(src), asyncCount);
|
| 235 |
+
asyncCount *= sizeof(TyType);
|
| 236 |
+
|
| 237 |
+
count -= asyncCount;
|
| 238 |
+
src += asyncCount;
|
| 239 |
+
dst += asyncCount;
|
| 240 |
+
inline_copy(group, dst, src, count);
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
// We must determine alignment and manually align src/dst ourselves
|
| 244 |
+
template <size_t AlignHint>
|
| 245 |
+
struct _memcpy_async_align_dispatch {
|
| 246 |
+
template <typename TyGroup>
|
| 247 |
+
_CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ dst, const void *__restrict__ src, size_t count) {
|
| 248 |
+
uint32_t alignment = find_best_alignment<AlignHint, 16>(dst, src);
|
| 249 |
+
|
| 250 |
+
// Avoid copying the extra bytes if desired copy count is smaller
|
| 251 |
+
alignment = count < alignment ? AlignHint : alignment;
|
| 252 |
+
|
| 253 |
+
switch (alignment) {
|
| 254 |
+
default:
|
| 255 |
+
case 1:
|
| 256 |
+
inline_copy(group, reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), count);
|
| 257 |
+
break;
|
| 258 |
+
case 2:
|
| 259 |
+
inline_copy(group, reinterpret_cast<short *>(dst), reinterpret_cast<const short *>(src), count >> 1);
|
| 260 |
+
break;
|
| 261 |
+
case 4:
|
| 262 |
+
copy_like<Segment<1>>(group, dst, src, count);
|
| 263 |
+
break;
|
| 264 |
+
case 8:
|
| 265 |
+
copy_like<Segment<2>>(group, dst, src, count);
|
| 266 |
+
break;
|
| 267 |
+
case 16:
|
| 268 |
+
copy_like<Segment<4>>(group, dst, src, count);
|
| 269 |
+
break;
|
| 270 |
+
}
|
| 271 |
+
}
|
| 272 |
+
};
|
| 273 |
+
|
| 274 |
+
// Specialization for 4 byte alignments
|
| 275 |
+
template <>
|
| 276 |
+
struct _memcpy_async_align_dispatch<4> {
|
| 277 |
+
template <typename TyGroup>
|
| 278 |
+
_CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
|
| 279 |
+
size_t count) {
|
| 280 |
+
const Segment<1> *src = reinterpret_cast<const Segment<1> *>(_src);
|
| 281 |
+
Segment<1> *dst = reinterpret_cast<Segment<1> *>(_dst);
|
| 282 |
+
|
| 283 |
+
// Dispatch straight to aligned LDGSTS calls
|
| 284 |
+
accelerated_async_copy(group, dst, src, count / sizeof(*dst));
|
| 285 |
+
}
|
| 286 |
+
};
|
| 287 |
+
|
| 288 |
+
// Specialization for 8 byte alignments
|
| 289 |
+
template <>
|
| 290 |
+
struct _memcpy_async_align_dispatch<8> {
|
| 291 |
+
template <typename TyGroup>
|
| 292 |
+
_CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
|
| 293 |
+
size_t count) {
|
| 294 |
+
const Segment<2> *src = reinterpret_cast<const Segment<2> *>(_src);
|
| 295 |
+
Segment<2> *dst = reinterpret_cast<Segment<2> *>(_dst);
|
| 296 |
+
|
| 297 |
+
// Dispatch straight to aligned LDGSTS calls
|
| 298 |
+
accelerated_async_copy(group, dst, src, count / sizeof(*dst));
|
| 299 |
+
}
|
| 300 |
+
};
|
| 301 |
+
|
| 302 |
+
// Alignments over 16 are truncated to 16 and bypass alignment
|
| 303 |
+
// This is the highest performing memcpy available
|
| 304 |
+
template <>
|
| 305 |
+
struct _memcpy_async_align_dispatch<16> {
|
| 306 |
+
template <typename TyGroup>
|
| 307 |
+
_CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
|
| 308 |
+
size_t count) {
|
| 309 |
+
const Segment<4> *src = reinterpret_cast<const Segment<4> *>(_src);
|
| 310 |
+
Segment<4> *dst = reinterpret_cast<Segment<4> *>(_dst);
|
| 311 |
+
|
| 312 |
+
// Dispatch straight to aligned LDGSTS calls
|
| 313 |
+
accelerated_async_copy(group, dst, src, count / sizeof(*dst));
|
| 314 |
+
}
|
| 315 |
+
};
|
| 316 |
+
|
| 317 |
+
// byte-wide API
|
| 318 |
+
template <size_t Alignment, class TyGroup>
|
| 319 |
+
_CG_STATIC_QUALIFIER void _memcpy_async_dispatch_to_aligned_copy(const TyGroup &group, void *__restrict__ _dst,
|
| 320 |
+
const void *__restrict__ _src, size_t count) {
|
| 321 |
+
static_assert(!(Alignment & (Alignment - 1)), "Known static alignment dispatch must be a power of 2");
|
| 322 |
+
details::_memcpy_async_align_dispatch<Alignment>::copy(group, _dst, _src, count);
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
// Internal dispatch APIs
|
| 326 |
+
// These deduce the alignments and sizes necessary to invoke the underlying copy engine
|
| 327 |
+
template <typename Ty>
|
| 328 |
+
using is_void = _CG_STL_NAMESPACE::is_same<Ty, void>;
|
| 329 |
+
|
| 330 |
+
template <typename Ty>
|
| 331 |
+
using enable_if_not_void = typename _CG_STL_NAMESPACE::enable_if<!is_void<Ty>::value, void *>::type;
|
| 332 |
+
|
| 333 |
+
template <typename Ty>
|
| 334 |
+
using enable_if_void = typename _CG_STL_NAMESPACE::enable_if<is_void<Ty>::value, void *>::type;
|
| 335 |
+
|
| 336 |
+
template <typename Ty>
|
| 337 |
+
using enable_if_integral =
|
| 338 |
+
typename _CG_STL_NAMESPACE::enable_if<_CG_STL_NAMESPACE::is_integral<Ty>::value, void *>::type;
|
| 339 |
+
|
| 340 |
+
// byte-wide API using aligned_sized_t
|
| 341 |
+
template <class TyGroup, template <size_t> typename Alignment, size_t Hint>
|
| 342 |
+
_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, void *__restrict__ _dst,
|
| 343 |
+
const void *__restrict__ _src, const Alignment<Hint> &count) {
|
| 344 |
+
constexpr size_t _align = (Hint > 16) ? 16 : Hint;
|
| 345 |
+
|
| 346 |
+
details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, (size_t)count);
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
// byte-wide API using type for aligment
|
| 350 |
+
template <class TyGroup, typename TyElem, typename TySize, size_t Hint = alignof(TyElem),
|
| 351 |
+
enable_if_not_void<TyElem> = nullptr, enable_if_integral<TySize> = nullptr>
|
| 352 |
+
_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
|
| 353 |
+
const TyElem *__restrict__ _src, const TySize& count) {
|
| 354 |
+
constexpr size_t _align = (Hint > 16) ? 16 : Hint;
|
| 355 |
+
|
| 356 |
+
details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, count);
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
+
// byte-wide API with full alignment deduction required
|
| 360 |
+
template <class TyGroup, typename TyElem, typename TySize, enable_if_void<TyElem> = nullptr,
|
| 361 |
+
enable_if_integral<TySize> = nullptr>
|
| 362 |
+
_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
|
| 363 |
+
const TyElem *__restrict__ _src, const TySize& count) {
|
| 364 |
+
details::_memcpy_async_dispatch_to_aligned_copy<1>(group, _dst, _src, count);
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
// 1d-datum API
|
| 368 |
+
template <class TyGroup, typename TyElem, size_t Hint = alignof(TyElem)>
|
| 369 |
+
_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const size_t dstCount,
|
| 370 |
+
const TyElem *__restrict__ src, const size_t srcCount) {
|
| 371 |
+
constexpr unsigned int _align = Hint;
|
| 372 |
+
const size_t totalCount = min(dstCount, srcCount) * sizeof(TyElem);
|
| 373 |
+
|
| 374 |
+
details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
// 1d-datum API using aligned_size_t
|
| 378 |
+
template <class TyGroup, typename TyElem, template <size_t> typename Alignment, size_t Hint>
|
| 379 |
+
_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const Alignment<Hint> &dstCount,
|
| 380 |
+
const TyElem *__restrict__ src, const Alignment<Hint> &srcCount) {
|
| 381 |
+
constexpr unsigned int _align = Hint;
|
| 382 |
+
const size_t totalCount = min((size_t)dstCount, (size_t)srcCount) * sizeof(TyElem);
|
| 383 |
+
|
| 384 |
+
details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
} // namespace details
|
| 388 |
+
|
| 389 |
+
/*
|
| 390 |
+
* Group submit batch of async-copy to cover contiguous 1D array
|
| 391 |
+
* and commit that batch to eventually wait for completion.
|
| 392 |
+
*/
|
| 393 |
+
template <class TyGroup, typename TyElem, typename TySizeT>
|
| 394 |
+
_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ _dst, const TyElem *__restrict__ _src,
|
| 395 |
+
const TySizeT &count) {
|
| 396 |
+
details::_memcpy_async_bytes(group, _dst, _src, count);
|
| 397 |
+
__pipeline_commit();
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
/*
|
| 401 |
+
* Group submit batch of async-copy to cover contiguous 1D array
|
| 402 |
+
* and commit that batch to eventually wait for completion.
|
| 403 |
+
* Object counts are in datum sized chunks, not bytes.
|
| 404 |
+
*/
|
| 405 |
+
template <class TyGroup, class TyElem, typename DstLayout, typename SrcLayout>
|
| 406 |
+
_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ dst, const DstLayout &dstLayout,
|
| 407 |
+
const TyElem *__restrict__ src, const SrcLayout &srcLayout) {
|
| 408 |
+
details::_memcpy_async_datum(group, dst, dstLayout, src, srcLayout);
|
| 409 |
+
__pipeline_commit();
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
/* Group wait for prior Nth stage of memcpy_async to complete. */
|
| 413 |
+
template <unsigned int Stage, class TyGroup>
|
| 414 |
+
_CG_STATIC_QUALIFIER void wait_prior(const TyGroup &group) {
|
| 415 |
+
__pipeline_wait_prior(Stage);
|
| 416 |
+
group.sync();
|
| 417 |
+
}
|
| 418 |
+
|
| 419 |
+
/* Group wait all previously submitted memcpy_async to complete. */
|
| 420 |
+
template <class TyGroup>
|
| 421 |
+
_CG_STATIC_QUALIFIER void wait(const TyGroup &group) {
|
| 422 |
+
__pipeline_wait_prior(0);
|
| 423 |
+
group.sync();
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
/***************** CG APIs including pipeline are deprecated *****************/
|
| 427 |
+
|
| 428 |
+
/* Group submit batch of async-copy to cover of contiguous 1D array
|
| 429 |
+
to a pipeline and commit the batch*/
|
| 430 |
+
template <class TyGroup, class TyElem>
|
| 431 |
+
_CG_DEPRECATED _CG_STATIC_QUALIFIER void memcpy_async(TyGroup &group, TyElem *dst, size_t dstCount, const TyElem *src, size_t srcCount,
|
| 432 |
+
nvcuda::experimental::pipeline &pipe) {
|
| 433 |
+
details::_memcpy_async_datum(group, dst, dstCount, src, srcCount);
|
| 434 |
+
pipe.commit();
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
/* Group wait for prior Nth stage of memcpy_async to complete. */
|
| 438 |
+
template <unsigned int Stage, class TyGroup>
|
| 439 |
+
_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait_prior(TyGroup &group, nvcuda::experimental::pipeline &pipe) {
|
| 440 |
+
pipe.wait_prior<Stage>();
|
| 441 |
+
group.sync();
|
| 442 |
+
}
|
| 443 |
+
|
| 444 |
+
/* Group wait for stage-S of memcpy_async to complete. */
|
| 445 |
+
template <class TyGroup>
|
| 446 |
+
_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait(TyGroup &group, nvcuda::experimental::pipeline &pipe, size_t stage) {
|
| 447 |
+
pipe.wait(stage);
|
| 448 |
+
group.sync();
|
| 449 |
+
}
|
| 450 |
+
_CG_END_NAMESPACE
|
| 451 |
+
|
| 452 |
+
#endif // _CG_ASYNC_H
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_COALESCED_REDUCE_H_
|
| 50 |
+
#define _CG_COALESCED_REDUCE_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
#include "cooperative_groups.h"
|
| 55 |
+
#include "partitioning.h"
|
| 56 |
+
#include "coalesced_scan.h"
|
| 57 |
+
|
| 58 |
+
_CG_BEGIN_NAMESPACE
|
| 59 |
+
|
| 60 |
+
namespace details {
|
| 61 |
+
|
| 62 |
+
template <typename TyVal, typename TyOp, unsigned int TySize, typename ParentT>
|
| 63 |
+
_CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group,
|
| 64 |
+
TyVal&& val,
|
| 65 |
+
TyOp&& op) -> decltype(op(val, val)) {
|
| 66 |
+
auto out = val;
|
| 67 |
+
for (int mask = TySize >> 1; mask > 0; mask >>= 1) {
|
| 68 |
+
out = op(out, group.shfl_xor(out, mask));
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
return out;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
template <typename TyVal, typename TyOp>
|
| 75 |
+
_CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
|
| 76 |
+
if (group.size() == 32) {
|
| 77 |
+
// Full coalesced group can go through faster path by being treated as a tile of size 32
|
| 78 |
+
auto tile = details::tiled_partition_internal<32, void>();
|
| 79 |
+
return coalesced_reduce(tile, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 80 |
+
}
|
| 81 |
+
else {
|
| 82 |
+
auto scan_result =
|
| 83 |
+
inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 84 |
+
unsigned int group_mask = _coalesced_group_data_access::get_mask(group);
|
| 85 |
+
unsigned int last_thread_id = 31 - __clz(group_mask);
|
| 86 |
+
return details::tile::shuffle_dispatch<TyVal>::shfl(
|
| 87 |
+
_CG_STL_NAMESPACE::forward<TyVal>(scan_result), group_mask, last_thread_id, 32);
|
| 88 |
+
}
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
} // details
|
| 92 |
+
|
| 93 |
+
_CG_END_NAMESPACE
|
| 94 |
+
|
| 95 |
+
#endif // _CG_COALESCED_REDUCE_H_
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_scan.h
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_COALESCED_SCAN_H_
|
| 50 |
+
#define _CG_COALESCED_SCAN_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
#include "cooperative_groups.h"
|
| 55 |
+
#include "partitioning.h"
|
| 56 |
+
#include "functional.h"
|
| 57 |
+
|
| 58 |
+
_CG_BEGIN_NAMESPACE
|
| 59 |
+
|
| 60 |
+
namespace details {
|
| 61 |
+
|
| 62 |
+
template <typename TyGroup, typename TyVal, typename TyOp>
|
| 63 |
+
_CG_QUALIFIER auto inclusive_scan_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
|
| 64 |
+
auto out = val;
|
| 65 |
+
for (int mask = 1; mask < group.size(); mask <<= 1) {
|
| 66 |
+
auto tmp = group.shfl_up(out, mask);
|
| 67 |
+
if (mask <= group.thread_rank()) {
|
| 68 |
+
out = op(out, tmp);
|
| 69 |
+
}
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
return out;
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
template <typename TyGroup, typename TyVal, typename TyOp>
|
| 76 |
+
_CG_QUALIFIER auto inclusive_scan_non_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
|
| 77 |
+
const unsigned int groupSize = group.size();
|
| 78 |
+
auto out = val;
|
| 79 |
+
|
| 80 |
+
const unsigned int mask = details::_coalesced_group_data_access::get_mask(group);
|
| 81 |
+
unsigned int lanemask = details::lanemask32_lt() & mask;
|
| 82 |
+
unsigned int srcLane = details::laneid();
|
| 83 |
+
|
| 84 |
+
const unsigned int base = __ffs(mask)-1; /* lane with rank == 0 */
|
| 85 |
+
const unsigned int rank = __popc(lanemask);
|
| 86 |
+
|
| 87 |
+
for (unsigned int i = 1, j = 1; i < groupSize; i <<= 1) {
|
| 88 |
+
if (i <= rank) {
|
| 89 |
+
srcLane -= j;
|
| 90 |
+
j = i; /* maximum possible lane */
|
| 91 |
+
|
| 92 |
+
unsigned int begLane = base + rank - i; /* minimum possible lane */
|
| 93 |
+
|
| 94 |
+
/* Next source lane is in the range [ begLane .. srcLane ]
|
| 95 |
+
* If begLane < srcLane then do a binary search.
|
| 96 |
+
*/
|
| 97 |
+
while (begLane < srcLane) {
|
| 98 |
+
const unsigned int halfLane = (begLane + srcLane) >> 1;
|
| 99 |
+
const unsigned int halfMask = lanemask >> halfLane;
|
| 100 |
+
const unsigned int d = __popc(halfMask);
|
| 101 |
+
if (d < i) {
|
| 102 |
+
srcLane = halfLane - 1; /* halfLane too large */
|
| 103 |
+
}
|
| 104 |
+
else if ((i < d) || !(halfMask & 0x01)) {
|
| 105 |
+
begLane = halfLane + 1; /* halfLane too small */
|
| 106 |
+
}
|
| 107 |
+
else {
|
| 108 |
+
begLane = srcLane = halfLane; /* happen to hit */
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
auto tmp = details::tile::shuffle_dispatch<TyVal>::shfl(out, mask, srcLane, 32);
|
| 114 |
+
if (i <= rank) {
|
| 115 |
+
out = op(out, tmp);
|
| 116 |
+
}
|
| 117 |
+
}
|
| 118 |
+
return out;
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
template <unsigned int TySize, typename ParentT, typename TyVal, typename TyOp>
|
| 122 |
+
_CG_QUALIFIER auto coalesced_inclusive_scan(const __single_warp_thread_block_tile<TySize, ParentT>& group,
|
| 123 |
+
TyVal&& val,
|
| 124 |
+
TyOp&& op) -> decltype(op(val, val)) {
|
| 125 |
+
return inclusive_scan_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
template <typename TyVal, typename TyOp>
|
| 129 |
+
_CG_QUALIFIER auto coalesced_inclusive_scan(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
|
| 130 |
+
if (group.size() == 32) {
|
| 131 |
+
return inclusive_scan_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 132 |
+
}
|
| 133 |
+
else {
|
| 134 |
+
return inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 135 |
+
}
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
template <bool IntegralOptimized>
|
| 139 |
+
struct scan_choose_convertion;
|
| 140 |
+
|
| 141 |
+
template<>
|
| 142 |
+
struct scan_choose_convertion<true> {
|
| 143 |
+
template <typename TyGroup, typename TyRes, typename TyVal>
|
| 144 |
+
_CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) {
|
| 145 |
+
return result - val;
|
| 146 |
+
}
|
| 147 |
+
};
|
| 148 |
+
|
| 149 |
+
template<>
|
| 150 |
+
struct scan_choose_convertion<false> {
|
| 151 |
+
template <typename TyGroup, typename TyRes, typename TyVal>
|
| 152 |
+
_CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) {
|
| 153 |
+
auto ret = group.shfl_up(result, 1);
|
| 154 |
+
if (group.thread_rank() == 0) {
|
| 155 |
+
return {};
|
| 156 |
+
}
|
| 157 |
+
else {
|
| 158 |
+
return ret;
|
| 159 |
+
}
|
| 160 |
+
}
|
| 161 |
+
};
|
| 162 |
+
|
| 163 |
+
template <typename TyGroup, typename TyRes, typename TyVal, typename TyFn>
|
| 164 |
+
_CG_QUALIFIER auto convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 165 |
+
using conversion = scan_choose_convertion<_CG_STL_NAMESPACE::is_same<remove_qual<TyFn>, cooperative_groups::plus<remove_qual<TyVal>>>::value
|
| 166 |
+
&& _CG_STL_NAMESPACE::is_integral<remove_qual<TyVal>>::value>;
|
| 167 |
+
return conversion::convert_inclusive_to_exclusive(group, result, _CG_STL_NAMESPACE::forward<TyVal>(val));
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
} // details
|
| 171 |
+
|
| 172 |
+
_CG_END_NAMESPACE
|
| 173 |
+
|
| 174 |
+
#endif // _CG_COALESCED_SCAN_H_
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/driver_abi.h
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_DRIVER_API_H
|
| 50 |
+
#define _CG_DRIVER_API_H
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
|
| 54 |
+
_CG_BEGIN_NAMESPACE
|
| 55 |
+
|
| 56 |
+
namespace details {
|
| 57 |
+
template <unsigned int RegId>
|
| 58 |
+
_CG_QUALIFIER unsigned int load_env_reg() {
|
| 59 |
+
// Abort by default
|
| 60 |
+
_CG_ABORT();
|
| 61 |
+
return 0;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
template <unsigned int HiReg, unsigned int LoReg>
|
| 65 |
+
_CG_QUALIFIER unsigned long long load_env_reg64() {
|
| 66 |
+
unsigned long long registerLo = load_env_reg<LoReg>();
|
| 67 |
+
unsigned long long registerHi = load_env_reg<HiReg>();
|
| 68 |
+
|
| 69 |
+
return (registerHi << 32) | registerLo;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
// inline PTX for accessing registers requires an immediate for the special reg
|
| 73 |
+
# define LOAD_ENVREG(NUMBER) \
|
| 74 |
+
template <> _CG_QUALIFIER unsigned int load_env_reg<NUMBER>() { \
|
| 75 |
+
unsigned int r; \
|
| 76 |
+
asm ("mov.u32 %0, %%envreg" #NUMBER ";" : "=r"(r)); \
|
| 77 |
+
return r; \
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
// Instantiate loaders for registers used
|
| 81 |
+
LOAD_ENVREG(0);
|
| 82 |
+
LOAD_ENVREG(1);
|
| 83 |
+
LOAD_ENVREG(2);
|
| 84 |
+
# undef LOAD_ENVREG
|
| 85 |
+
|
| 86 |
+
// Layout of the workspace reached through the pointer returned by
// get_grid_workspace(). The address comes from outside this code, so the
// field order and types must not be changed.
struct grid_workspace {
    unsigned int wsSize;   // presumably the workspace size -- confirm units with the driver ABI
    unsigned int barrier;  // presumably the grid barrier state word -- confirm
};
|
| 90 |
+
|
| 91 |
+
// Returns a pointer to the grid workspace.
// The 64-bit address is assembled from environment register 1 (high half)
// and environment register 2 (low half).
_CG_QUALIFIER grid_workspace* get_grid_workspace() {
    const unsigned long long workspaceAddress = load_env_reg64<1, 2>();

    // Interpret the address from envreg 1 and 2 as the driver's grid workspace
    grid_workspace* workspace = reinterpret_cast<grid_workspace*>(workspaceAddress);
    return workspace;
}
|
| 96 |
+
}
|
| 97 |
+
_CG_END_NAMESPACE
|
| 98 |
+
|
| 99 |
+
#endif // _CG_DRIVER_API_H
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_FUNCTIONAL_H
|
| 50 |
+
#define _CG_FUNCTIONAL_H
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
#ifdef _CG_USE_CUDA_STL
|
| 57 |
+
# include <cuda/std/functional>
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
_CG_BEGIN_NAMESPACE
|
| 61 |
+
|
| 62 |
+
namespace details {
|
| 63 |
+
#ifdef _CG_USE_CUDA_STL
|
| 64 |
+
using cuda::std::plus;
|
| 65 |
+
using cuda::std::bit_and;
|
| 66 |
+
using cuda::std::bit_xor;
|
| 67 |
+
using cuda::std::bit_or;
|
| 68 |
+
#else
|
| 69 |
+
template <typename Ty> struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}};
|
| 70 |
+
template <typename Ty> struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}};
|
| 71 |
+
template <typename Ty> struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}};
|
| 72 |
+
template <typename Ty> struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}};
|
| 73 |
+
#endif // _CG_USE_PLATFORM_STL
|
| 74 |
+
} // details
|
| 75 |
+
|
| 76 |
+
template <typename Ty>
|
| 77 |
+
struct plus : public details::plus<Ty> {};
|
| 78 |
+
|
| 79 |
+
template <typename Ty>
|
| 80 |
+
struct less {
|
| 81 |
+
__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
|
| 82 |
+
return (arg2 < arg1) ? arg2 : arg1;
|
| 83 |
+
}
|
| 84 |
+
};
|
| 85 |
+
|
| 86 |
+
template <typename Ty>
|
| 87 |
+
struct greater {
|
| 88 |
+
__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
|
| 89 |
+
return (arg1 < arg2) ? arg2 : arg1;
|
| 90 |
+
}
|
| 91 |
+
};
|
| 92 |
+
|
| 93 |
+
template <typename Ty>
|
| 94 |
+
struct bit_and : public details::bit_and<Ty> {};
|
| 95 |
+
|
| 96 |
+
template <typename Ty>
|
| 97 |
+
struct bit_xor : public details::bit_xor<Ty> {};
|
| 98 |
+
|
| 99 |
+
template <typename Ty>
|
| 100 |
+
struct bit_or : public details::bit_or<Ty> {};
|
| 101 |
+
|
| 102 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 103 |
+
namespace details {
|
| 104 |
+
template <class Ty>
|
| 105 |
+
using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant<bool,
|
| 106 |
+
_CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>;
|
| 107 |
+
|
| 108 |
+
template <typename TyOp> struct _atomic_op_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 109 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::plus<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 110 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::less<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 111 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::greater<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 112 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_and<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 113 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_or<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 114 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_xor<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 115 |
+
|
| 116 |
+
template<typename TyAtomic, typename TyVal, typename TyOp>
|
| 117 |
+
_CG_QUALIFIER remove_qual<TyVal> atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) {
|
| 118 |
+
auto old = atomic.load(cuda::std::memory_order_relaxed);
|
| 119 |
+
while(!atomic.compare_exchange_weak(old, op(old, val), cuda::std::memory_order_relaxed));
|
| 120 |
+
return old;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
template<typename TyOp>
|
| 124 |
+
struct op_picker;
|
| 125 |
+
|
| 126 |
+
template<typename TyVal>
|
| 127 |
+
struct op_picker<cooperative_groups::plus<TyVal>> {
|
| 128 |
+
template<typename TyAtomic>
|
| 129 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 130 |
+
return atomic.fetch_add(val, cuda::std::memory_order_relaxed);
|
| 131 |
+
}
|
| 132 |
+
};
|
| 133 |
+
|
| 134 |
+
template<typename TyVal>
|
| 135 |
+
struct op_picker<cooperative_groups::less<TyVal>> {
|
| 136 |
+
template<typename TyAtomic>
|
| 137 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 138 |
+
return atomic.fetch_min(val, cuda::std::memory_order_relaxed);
|
| 139 |
+
}
|
| 140 |
+
};
|
| 141 |
+
|
| 142 |
+
template<typename TyVal>
|
| 143 |
+
struct op_picker<cooperative_groups::greater<TyVal>> {
|
| 144 |
+
template<typename TyAtomic>
|
| 145 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 146 |
+
return atomic.fetch_max(val, cuda::std::memory_order_relaxed);
|
| 147 |
+
}
|
| 148 |
+
};
|
| 149 |
+
|
| 150 |
+
template<typename TyVal>
|
| 151 |
+
struct op_picker<cooperative_groups::bit_and<TyVal>> {
|
| 152 |
+
template<typename TyAtomic>
|
| 153 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 154 |
+
return atomic.fetch_and(val, cuda::std::memory_order_relaxed);
|
| 155 |
+
}
|
| 156 |
+
};
|
| 157 |
+
|
| 158 |
+
template<typename TyVal>
|
| 159 |
+
struct op_picker<cooperative_groups::bit_xor<TyVal>> {
|
| 160 |
+
template<typename TyAtomic>
|
| 161 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 162 |
+
return atomic.fetch_xor(val, cuda::std::memory_order_relaxed);
|
| 163 |
+
}
|
| 164 |
+
};
|
| 165 |
+
|
| 166 |
+
template<typename TyVal>
|
| 167 |
+
struct op_picker<cooperative_groups::bit_or<TyVal>> {
|
| 168 |
+
template<typename TyAtomic>
|
| 169 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 170 |
+
return atomic.fetch_or(val, cuda::std::memory_order_relaxed);
|
| 171 |
+
}
|
| 172 |
+
};
|
| 173 |
+
|
| 174 |
+
template<bool atomic_supported>
|
| 175 |
+
struct atomic_update_dispatch {};
|
| 176 |
+
|
| 177 |
+
template<>
|
| 178 |
+
struct atomic_update_dispatch<false> {
|
| 179 |
+
template<typename TyAtomic, typename TyVal, typename TyOp>
|
| 180 |
+
_CG_STATIC_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
|
| 181 |
+
return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 182 |
+
}
|
| 183 |
+
};
|
| 184 |
+
|
| 185 |
+
template<>
|
| 186 |
+
struct atomic_update_dispatch<true> {
|
| 187 |
+
template<typename TyAtomic, typename TyVal, typename TyOp>
|
| 188 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) {
|
| 189 |
+
using dispatch = op_picker<details::remove_qual<TyOp>>;
|
| 190 |
+
|
| 191 |
+
return dispatch::atomic_update(atomic, val);
|
| 192 |
+
}
|
| 193 |
+
};
|
| 194 |
+
|
| 195 |
+
template<typename TyAtomic, typename TyVal, typename TyOp>
|
| 196 |
+
_CG_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
|
| 197 |
+
using dispatch = atomic_update_dispatch<_atomic_op_supported<details::remove_qual<TyOp>>::value>;
|
| 198 |
+
|
| 199 |
+
return dispatch::atomic_update(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
template<typename TyAtomic, typename TyVal>
|
| 203 |
+
_CG_QUALIFIER void atomic_store(TyAtomic& atomic, TyVal&& val) {
|
| 204 |
+
atomic.store(val, cuda::std::memory_order_relaxed);
|
| 205 |
+
}
|
| 206 |
+
}
|
| 207 |
+
#endif
|
| 208 |
+
|
| 209 |
+
_CG_END_NAMESPACE
|
| 210 |
+
|
| 211 |
+
#endif
|
| 212 |
+
#endif //_CG_FUNCTIONAL_H
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h
ADDED
|
@@ -0,0 +1,693 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_HELPERS_H_
|
| 50 |
+
# define _COOPERATIVE_GROUPS_HELPERS_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "sync.h"
|
| 54 |
+
|
| 55 |
+
_CG_BEGIN_NAMESPACE
|
| 56 |
+
|
| 57 |
+
namespace details {
|
| 58 |
+
#ifdef _CG_CPP11_FEATURES
// Primary trait: mirrors is_floating_point for the given (unqualified) type.
template <typename T>
struct _is_float_or_half : _CG_STL_NAMESPACE::is_floating_point<T> {};
# ifdef _CG_HAS_FP16_COLLECTIVE
// When FP16 collectives are available, __half and __half2 are also accepted.
template <>
struct _is_float_or_half<__half> : _CG_STL_NAMESPACE::true_type {};
template <>
struct _is_float_or_half<__half2> : _CG_STL_NAMESPACE::true_type {};
# endif
// Public form of the trait: const/volatile are removed before the lookup.
template <typename T>
using is_float_or_half =
    _is_float_or_half<typename _CG_STL_NAMESPACE::remove_cv<T>::type>;

// Non-STL utility templates

// Removes the reference first, then const/volatile, from T.
template <typename T>
using remove_qual =
    typename _CG_STL_NAMESPACE::remove_cv<
        typename _CG_STL_NAMESPACE::remove_reference<T>::type
    >::type;

// True when both operand types are identical once references and
// cv-qualifiers have been removed.
template <typename TyLhs, typename TyRhs>
using is_op_type_same =
    _CG_STL_NAMESPACE::is_same<remove_qual<TyLhs>, remove_qual<TyRhs>>;
#endif
|
| 75 |
+
|
| 76 |
+
template <typename TyTrunc>
|
| 77 |
+
_CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) {
|
| 78 |
+
return ((TyTrunc)index.z * nIndex.y * nIndex.x) +
|
| 79 |
+
((TyTrunc)index.y * nIndex.x) +
|
| 80 |
+
(TyTrunc)index.x;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
namespace cta {
|
| 84 |
+
|
| 85 |
+
_CG_STATIC_QUALIFIER void sync()
|
| 86 |
+
{
|
| 87 |
+
__barrier_sync(0);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads()
|
| 91 |
+
{
|
| 92 |
+
return static_cast<unsigned int>(blockDim.x * blockDim.y * blockDim.z);
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank()
|
| 96 |
+
{
|
| 97 |
+
return vec3_to_linear<unsigned int>(threadIdx, blockDim);
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
_CG_STATIC_QUALIFIER dim3 group_index()
|
| 101 |
+
{
|
| 102 |
+
return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
_CG_STATIC_QUALIFIER dim3 thread_index()
|
| 106 |
+
{
|
| 107 |
+
return dim3(threadIdx.x, threadIdx.y, threadIdx.z);
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads()
|
| 111 |
+
{
|
| 112 |
+
return dim3(blockDim.x, blockDim.y, blockDim.z);
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
// Legacy aliases
|
| 116 |
+
_CG_STATIC_QUALIFIER unsigned int size()
|
| 117 |
+
{
|
| 118 |
+
return num_threads();
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
_CG_STATIC_QUALIFIER dim3 block_dim()
|
| 122 |
+
{
|
| 123 |
+
return dim_threads();
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
};
|
| 127 |
+
|
| 128 |
+
class _coalesced_group_data_access {
|
| 129 |
+
public:
|
| 130 |
+
// Retrieve mask of coalesced groups and tiles
|
| 131 |
+
template <typename TyGroup>
|
| 132 |
+
_CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) {
|
| 133 |
+
return group.get_mask();
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
template <typename TyGroup>
|
| 137 |
+
_CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) {
|
| 138 |
+
return TyGroup(mask);
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
template <typename TyGroup>
|
| 142 |
+
_CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) {
|
| 143 |
+
group._data.coalesced.metaGroupRank = mgRank;
|
| 144 |
+
group._data.coalesced.metaGroupSize = mgSize;
|
| 145 |
+
}
|
| 146 |
+
};
|
| 147 |
+
|
| 148 |
+
namespace tile {
|
| 149 |
+
template <unsigned int TileCount, unsigned int TileMask, unsigned int LaneMask, unsigned int ShiftCount>
|
| 150 |
+
struct _tile_helpers{
|
| 151 |
+
_CG_STATIC_CONST_DECL unsigned int tileCount = TileCount;
|
| 152 |
+
_CG_STATIC_CONST_DECL unsigned int tileMask = TileMask;
|
| 153 |
+
_CG_STATIC_CONST_DECL unsigned int laneMask = LaneMask;
|
| 154 |
+
_CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount;
|
| 155 |
+
};
|
| 156 |
+
|
| 157 |
+
template <unsigned int> struct tile_helpers;
|
| 158 |
+
template <> struct tile_helpers<32> : public _tile_helpers<1, 0xFFFFFFFF, 0x1F, 5> {};
|
| 159 |
+
template <> struct tile_helpers<16> : public _tile_helpers<2, 0x0000FFFF, 0x0F, 4> {};
|
| 160 |
+
template <> struct tile_helpers<8> : public _tile_helpers<4, 0x000000FF, 0x07, 3> {};
|
| 161 |
+
template <> struct tile_helpers<4> : public _tile_helpers<8, 0x0000000F, 0x03, 2> {};
|
| 162 |
+
template <> struct tile_helpers<2> : public _tile_helpers<16, 0x00000003, 0x01, 1> {};
|
| 163 |
+
template <> struct tile_helpers<1> : public _tile_helpers<32, 0x00000001, 0x00, 0> {};
|
| 164 |
+
|
| 165 |
+
#ifdef _CG_CPP11_FEATURES
|
| 166 |
+
namespace shfl {
|
| 167 |
+
/***********************************************************************************
|
| 168 |
+
* Recursively Sliced Shuffle
|
| 169 |
+
* Purpose:
|
| 170 |
+
* Slices an input type a number of times into integral types so that shuffles
|
| 171 |
+
* are well defined
|
| 172 |
+
* Expectations:
|
| 173 |
+
* This object *should not* be used from a reinterpret_cast pointer unless
|
| 174 |
+
* some alignment guarantees can be met. Use a memcpy to guarantee that loads
|
| 175 |
+
* from the integral types stored within are aligned and correct.
|
| 176 |
+
**********************************************************************************/
|
| 177 |
+
template <unsigned int count, bool intSized = (count <= sizeof(int))>
|
| 178 |
+
struct recursive_sliced_shuffle_helper;
|
| 179 |
+
|
| 180 |
+
template <unsigned int count>
|
| 181 |
+
struct recursive_sliced_shuffle_helper<count, true> {
|
| 182 |
+
int val;
|
| 183 |
+
|
| 184 |
+
template <typename TyFn>
|
| 185 |
+
_CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
|
| 186 |
+
val = shfl(val);
|
| 187 |
+
}
|
| 188 |
+
};
|
| 189 |
+
|
| 190 |
+
template <unsigned int count>
|
| 191 |
+
struct recursive_sliced_shuffle_helper<count, false> {
|
| 192 |
+
int val;
|
| 193 |
+
recursive_sliced_shuffle_helper<count - sizeof(int)> next;
|
| 194 |
+
|
| 195 |
+
template <typename TyFn>
|
| 196 |
+
_CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
|
| 197 |
+
val = shfl(val);
|
| 198 |
+
next.invoke_shuffle(shfl);
|
| 199 |
+
}
|
| 200 |
+
};
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
struct _memory_shuffle {
|
| 204 |
+
template <typename TyElem, typename TyShflFn>
|
| 205 |
+
_CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
|
| 206 |
+
static_assert(sizeof(TyElem) <= 32, "Cooperative groups collectives are limited to types smaller than 32B");
|
| 207 |
+
return TyElem{};
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 211 |
+
_CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
|
| 212 |
+
auto shfl = [=](int val) -> int {
|
| 213 |
+
return 0;
|
| 214 |
+
};
|
| 215 |
+
|
| 216 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 220 |
+
_CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 221 |
+
auto shfl = [=](int val) -> int {
|
| 222 |
+
return 0;
|
| 223 |
+
};
|
| 224 |
+
|
| 225 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 229 |
+
_CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 230 |
+
auto shfl = [=](int val) -> int {
|
| 231 |
+
return 0;
|
| 232 |
+
};
|
| 233 |
+
|
| 234 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 238 |
+
_CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
|
| 239 |
+
auto shfl = [=](int val) -> int {
|
| 240 |
+
return 0;
|
| 241 |
+
};
|
| 242 |
+
|
| 243 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 244 |
+
}
|
| 245 |
+
};
|
| 246 |
+
|
| 247 |
+
/***********************************************************************************
|
| 248 |
+
* Intrinsic Device Function Shuffle
|
| 249 |
+
* Purpose:
|
| 250 |
+
* Uses a shuffle helper that has characteristics best suited for moving
|
| 251 |
+
* elements between threads
|
| 252 |
+
* Expectations:
|
| 253 |
+
* Object given will be forced into an l-value type so that it can be used
|
| 254 |
+
* with a helper structure that reinterprets the data into intrinsic compatible
|
| 255 |
+
* types
|
| 256 |
+
* Notes:
|
| 257 |
+
* !! TyRet is required so that objects are returned by value and not as
|
| 258 |
+
* dangling references depending on the value category of the passed object
|
| 259 |
+
**********************************************************************************/
|
| 260 |
+
struct _intrinsic_compat_shuffle {
|
| 261 |
+
template <unsigned int count>
|
| 262 |
+
using shfl_helper = shfl::recursive_sliced_shuffle_helper<count>;
|
| 263 |
+
|
| 264 |
+
template <typename TyElem, typename TyShflFn>
|
| 265 |
+
_CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
|
| 266 |
+
static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle");
|
| 267 |
+
shfl_helper<sizeof(TyElem)> helper;
|
| 268 |
+
memcpy(&helper, &elem, sizeof(TyElem));
|
| 269 |
+
helper.invoke_shuffle(fn);
|
| 270 |
+
memcpy(&elem, &helper, sizeof(TyElem));
|
| 271 |
+
return elem;
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 275 |
+
_CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
|
| 276 |
+
auto shfl = [=](int val) -> int {
|
| 277 |
+
return __shfl_sync(gMask, val, srcRank, threads);
|
| 278 |
+
};
|
| 279 |
+
|
| 280 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 284 |
+
_CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 285 |
+
auto shfl = [=](int val) -> int {
|
| 286 |
+
return __shfl_down_sync(gMask, val, delta, threads);
|
| 287 |
+
};
|
| 288 |
+
|
| 289 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 293 |
+
_CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 294 |
+
auto shfl = [=](int val) -> int {
|
| 295 |
+
return __shfl_up_sync(gMask, val, delta, threads);
|
| 296 |
+
};
|
| 297 |
+
|
| 298 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 302 |
+
_CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
|
| 303 |
+
auto shfl = [=](int val) -> int {
|
| 304 |
+
return __shfl_xor_sync(gMask, val, lMask, threads);
|
| 305 |
+
};
|
| 306 |
+
|
| 307 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 308 |
+
}
|
| 309 |
+
};
|
| 310 |
+
|
| 311 |
+
struct _native_shuffle {
|
| 312 |
+
template <typename TyElem>
|
| 313 |
+
_CG_STATIC_QUALIFIER TyElem shfl(
|
| 314 |
+
TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
|
| 315 |
+
return static_cast<TyElem>(__shfl_sync(gMask, elem, srcRank, threads));
|
| 316 |
+
}
|
| 317 |
+
|
| 318 |
+
template <typename TyElem>
|
| 319 |
+
_CG_STATIC_QUALIFIER TyElem shfl_down(
|
| 320 |
+
TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 321 |
+
return static_cast<TyElem>(__shfl_down_sync(gMask, elem, delta, threads));
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
template <typename TyElem>
|
| 325 |
+
_CG_STATIC_QUALIFIER TyElem shfl_up(
|
| 326 |
+
TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 327 |
+
return static_cast<TyElem>(__shfl_up_sync(gMask, elem, delta, threads));
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
template <typename TyElem>
|
| 331 |
+
_CG_STATIC_QUALIFIER TyElem shfl_xor(
|
| 332 |
+
TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
|
| 333 |
+
return static_cast<TyElem>(__shfl_xor_sync(gMask, elem, lMask, threads));
|
| 334 |
+
}
|
| 335 |
+
};
|
| 336 |
+
|
| 337 |
+
// Almost all arithmetic types are supported by native shuffle
|
| 338 |
+
// Vector types are the exception
|
| 339 |
+
template <typename TyElem>
|
| 340 |
+
using use_native_shuffle = _CG_STL_NAMESPACE::integral_constant<
|
| 341 |
+
bool,
|
| 342 |
+
_CG_STL_NAMESPACE::is_integral<
|
| 343 |
+
remove_qual<TyElem>>::value ||
|
| 344 |
+
details::is_float_or_half<
|
| 345 |
+
remove_qual<TyElem>>::value
|
| 346 |
+
>;
|
| 347 |
+
|
| 348 |
+
constexpr unsigned long long _MemoryShuffleCutoff = 32;
|
| 349 |
+
|
| 350 |
+
template <typename TyElem,
|
| 351 |
+
bool IsNative = use_native_shuffle<TyElem>::value,
|
| 352 |
+
bool InMem = (sizeof(TyElem) > _MemoryShuffleCutoff)>
|
| 353 |
+
struct shuffle_dispatch;
|
| 354 |
+
|
| 355 |
+
template <typename TyElem>
|
| 356 |
+
struct shuffle_dispatch<TyElem, true, false> : public _native_shuffle {};
|
| 357 |
+
|
| 358 |
+
template <typename TyElem>
|
| 359 |
+
struct shuffle_dispatch<TyElem, false, false> : public _intrinsic_compat_shuffle {};
|
| 360 |
+
|
| 361 |
+
template <typename TyElem>
|
| 362 |
+
struct shuffle_dispatch<TyElem, false, true> : public _memory_shuffle {};
|
| 363 |
+
|
| 364 |
+
#endif //_CG_CPP11_FEATURES
|
| 365 |
+
};
|
| 366 |
+
|
| 367 |
+
// Forward declaration only; the struct is defined later in this header when
// _CG_HAS_MULTI_GRID_GROUP and _CG_CPP11_FEATURES are both defined.
namespace multi_grid {
    struct multi_grid_functions;
};
|
| 370 |
+
|
| 371 |
+
namespace grid {
|
| 372 |
+
_CG_STATIC_QUALIFIER unsigned int barrier_arrive(unsigned int *bar) {
|
| 373 |
+
return details::sync_grids_arrive(bar);
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
_CG_STATIC_QUALIFIER void barrier_wait(unsigned int token, unsigned int *bar) {
|
| 377 |
+
details::sync_grids_wait(token, bar);
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
_CG_STATIC_QUALIFIER void sync(unsigned int *bar) {
|
| 381 |
+
unsigned int token = details::sync_grids_arrive(bar);
|
| 382 |
+
details::sync_grids_wait(token, bar);
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
_CG_STATIC_QUALIFIER unsigned long long num_blocks()
|
| 386 |
+
{
|
| 387 |
+
// grid.y * grid.z -> [max(65535) * max(65535)] fits within 4b, promote after multiplication
|
| 388 |
+
// grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)] exceeds 4b, promote before multiplication
|
| 389 |
+
return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z);
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
_CG_STATIC_QUALIFIER unsigned long long num_threads()
|
| 393 |
+
{
|
| 394 |
+
return num_blocks() * cta::num_threads();
|
| 395 |
+
}
|
| 396 |
+
|
| 397 |
+
_CG_STATIC_QUALIFIER unsigned long long block_rank()
|
| 398 |
+
{
|
| 399 |
+
return vec3_to_linear<unsigned long long>(blockIdx, gridDim);
|
| 400 |
+
}
|
| 401 |
+
|
| 402 |
+
_CG_STATIC_QUALIFIER unsigned long long thread_rank()
|
| 403 |
+
{
|
| 404 |
+
return block_rank() * cta::num_threads() + cta::thread_rank();
|
| 405 |
+
}
|
| 406 |
+
|
| 407 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks()
|
| 408 |
+
{
|
| 409 |
+
return dim3(gridDim.x, gridDim.y, gridDim.z);
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
_CG_STATIC_QUALIFIER dim3 block_index()
|
| 413 |
+
{
|
| 414 |
+
return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads()
|
| 418 |
+
{
|
| 419 |
+
return dim3(gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z);
|
| 420 |
+
}
|
| 421 |
+
|
| 422 |
+
_CG_STATIC_QUALIFIER dim3 thread_index()
|
| 423 |
+
{
|
| 424 |
+
return dim3(blockIdx.x * blockDim.x + threadIdx.x,
|
| 425 |
+
blockIdx.y * blockDim.y + threadIdx.y,
|
| 426 |
+
blockIdx.z * blockDim.z + threadIdx.z);
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 430 |
+
_CG_STATIC_QUALIFIER dim3 dim_clusters() {
|
| 431 |
+
return __clusterGridDimInClusters();
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
_CG_STATIC_QUALIFIER unsigned long long num_clusters() {
|
| 435 |
+
const dim3 dimClusters = dim_clusters();
|
| 436 |
+
return dimClusters.x * dimClusters.y * dimClusters.z;
|
| 437 |
+
}
|
| 438 |
+
|
| 439 |
+
_CG_STATIC_QUALIFIER dim3 cluster_index() {
|
| 440 |
+
return __clusterIdx();
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
_CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
|
| 444 |
+
return vec3_to_linear<unsigned long long>(cluster_index(), dim_clusters());
|
| 445 |
+
}
|
| 446 |
+
#endif
|
| 447 |
+
|
| 448 |
+
// Legacy aliases
|
| 449 |
+
_CG_STATIC_QUALIFIER unsigned long long size()
|
| 450 |
+
{
|
| 451 |
+
return num_threads();
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
_CG_STATIC_QUALIFIER dim3 grid_dim()
|
| 455 |
+
{
|
| 456 |
+
return dim_blocks();
|
| 457 |
+
}
|
| 458 |
+
};
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP)

// Thin wrappers over the cudaCG* device runtime entry points.
// Those entry points live in the device runtime library, which requires
// separate compilation mode (__CUDACC_RDC__) or extended whole program mode
// (__CUDACC_EWP__). Without either mode the wrappers do nothing and return 0.
namespace multi_grid {
    // Returns the intrinsic handle for the multi-grid scope, or 0 when the
    // device runtime is unavailable.
    _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
    {
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
        return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
#else /* !(__CUDACC_RDC__ || __CUDACC_EWP__) */
        return 0;
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
    }

    // Calls cudaCGSynchronize on the handle. The status code is deliberately
    // discarded: this function returns void and has no way to report it.
    _CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
    {
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
        (void)cudaCGSynchronize(handle, 0);
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
    }

    // Thread count reported by cudaCGGetSize for the handle (0 by default).
    _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
    {
        unsigned int numThreads = 0;
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
        cudaCGGetSize(&numThreads, NULL, handle);
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
        return numThreads;
    }

    // Thread rank reported by cudaCGGetRank for the handle (0 by default).
    _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
    {
        unsigned int threadRank = 0;
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
        cudaCGGetRank(&threadRank, NULL, handle);
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
        return threadRank;
    }

    // Grid rank reported by cudaCGGetRank for the handle (0 by default).
    _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle)
    {
        unsigned int gridRank = 0;
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
        cudaCGGetRank(NULL, &gridRank, handle);
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
        return gridRank;
    }

    // Grid count reported by cudaCGGetSize for the handle (0 by default).
    _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle)
    {
        unsigned int numGrids = 0;
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
        cudaCGGetSize(NULL, &numGrids, handle);
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
        return numGrids;
    }

# ifdef _CG_CPP11_FEATURES
    // Table of pointers to the wrappers above, one member per function.
    struct multi_grid_functions {
        decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle;
        decltype(multi_grid::sync) *sync;
        decltype(multi_grid::size) *size;
        decltype(multi_grid::thread_rank) *thread_rank;
        decltype(multi_grid::grid_rank) *grid_rank;
        decltype(multi_grid::num_grids) *num_grids;
    };

    // Returns the address of a function-local __constant__ table populated
    // with the wrappers, in the member order of multi_grid_functions.
    template <typename = void>
    _CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() {
        __constant__ static const multi_grid_functions mgf {
            &multi_grid::get_intrinsic_handle,
            &multi_grid::sync,
            &multi_grid::size,
            &multi_grid::thread_rank,
            &multi_grid::grid_rank,
            &multi_grid::num_grids
        };

        return &mgf;
    }
# endif
};
#endif
|
| 560 |
+
|
| 561 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
// Queries and synchronization for the calling thread's cluster, built on the
// __cluster* intrinsics.
namespace cluster {

    // Result of __clusterDimIsSpecified().
    _CG_STATIC_QUALIFIER bool isReal()
    {
        return __clusterDimIsSpecified();
    }

    // Arrive half of the cluster barrier.
    _CG_STATIC_QUALIFIER void barrier_arrive()
    {
        __cluster_barrier_arrive();
    }

    // Wait half of the cluster barrier.
    _CG_STATIC_QUALIFIER void barrier_wait()
    {
        __cluster_barrier_wait();
    }

    // Arrive immediately followed by wait.
    _CG_STATIC_QUALIFIER void sync()
    {
        barrier_arrive();
        barrier_wait();
    }

    // Forwards addr to __cluster_query_shared_rank.
    _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
    {
        return __cluster_query_shared_rank(addr);
    }

    // Forwards to __cluster_map_shared_rank and casts the result back to T*.
    template <typename T>
    _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
    {
        return static_cast<T*>(__cluster_map_shared_rank(addr, rank));
    }

    // Block index relative to the cluster.
    _CG_STATIC_QUALIFIER dim3 block_index()
    {
        return __clusterRelativeBlockIdx();
    }

    // Block rank relative to the cluster.
    _CG_STATIC_QUALIFIER unsigned int block_rank()
    {
        return __clusterRelativeBlockRank();
    }

    // Per-dimension index of the calling thread within the cluster.
    _CG_STATIC_QUALIFIER dim3 thread_index()
    {
        const dim3 blk = block_index();
        const unsigned int tx = blk.x * blockDim.x + threadIdx.x;
        const unsigned int ty = blk.y * blockDim.y + threadIdx.y;
        const unsigned int tz = blk.z * blockDim.z + threadIdx.z;
        return dim3(tx, ty, tz);
    }

    // Linear rank of the calling thread within the cluster.
    _CG_STATIC_QUALIFIER unsigned int thread_rank()
    {
        return block_rank() * cta::num_threads() + cta::thread_rank();
    }

    // Cluster extent in blocks, as reported by __clusterDim().
    _CG_STATIC_QUALIFIER dim3 dim_blocks()
    {
        return __clusterDim();
    }

    // Number of blocks in the cluster.
    _CG_STATIC_QUALIFIER unsigned int num_blocks()
    {
        return __clusterSizeInBlocks();
    }

    // Per-dimension thread extent of the cluster (dim_blocks * blockDim).
    _CG_STATIC_QUALIFIER dim3 dim_threads()
    {
        const dim3 extent = dim_blocks();
        return dim3(extent.x * blockDim.x,
                    extent.y * blockDim.y,
                    extent.z * blockDim.z);
    }

    // Number of threads in the cluster: blocks times threads per block.
    _CG_STATIC_QUALIFIER unsigned int num_threads()
    {
        return num_blocks() * cta::num_threads();
    }

};
#endif
|
| 645 |
+
|
| 646 |
+
// Reads the %laneid special register through inline PTX and returns it.
_CG_STATIC_QUALIFIER unsigned int laneid()
{
    unsigned int lane;
    asm ("mov.u32 %0, %%laneid;" : "=r"(lane));
    return lane;
}
|
| 652 |
+
|
| 653 |
+
// Reads the %lanemask_eq special register through inline PTX and returns it.
_CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
{
    unsigned int mask;
    asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(mask));
    return mask;
}
|
| 659 |
+
|
| 660 |
+
// Reads the %lanemask_lt special register through inline PTX and returns it.
_CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
{
    unsigned int mask;
    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
    return mask;
}
|
| 666 |
+
|
| 667 |
+
// Function wrapper around the _CG_ABORT() macro.
_CG_STATIC_QUALIFIER void abort() {
    _CG_ABORT();
}
|
| 671 |
+
|
| 672 |
+
template <typename Ty>
|
| 673 |
+
_CG_QUALIFIER void assert_if_not_arithmetic() {
|
| 674 |
+
#ifdef _CG_CPP11_FEATURES
|
| 675 |
+
static_assert(
|
| 676 |
+
_CG_STL_NAMESPACE::is_integral<Ty>::value ||
|
| 677 |
+
details::is_float_or_half<Ty>::value,
|
| 678 |
+
"Error: Ty is neither integer or float"
|
| 679 |
+
);
|
| 680 |
+
#endif //_CG_CPP11_FEATURES
|
| 681 |
+
}
|
| 682 |
+
|
| 683 |
+
#ifdef _CG_CPP11_FEATURES
// Integer base-2 logarithm, rounded down, usable in constant expressions.
// Returns 0 for x == 1 and, as a defined fallback, also for x == 0: 0 / 2 is
// 0 again, so testing only for x == 1 would never terminate on that input.
// Written as a single return statement to remain a valid C++11 constexpr function.
_CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) {
    return x <= 1 ? 0 : 1 + log2(x / 2);
}
#endif //_CG_CPP11_FEATURES
|
| 688 |
+
|
| 689 |
+
}; // !Namespace details
|
| 690 |
+
|
| 691 |
+
_CG_END_NAMESPACE
|
| 692 |
+
|
| 693 |
+
#endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
#include <nv/target>
|
| 51 |
+
|
| 52 |
+
#ifndef _CG_INFO_H_
|
| 53 |
+
#define _CG_INFO_H_
|
| 54 |
+
/*
|
| 55 |
+
** Define: _CG_VERSION
|
| 56 |
+
*/
|
| 57 |
+
#define _CG_VERSION 1000
|
| 58 |
+
|
| 59 |
+
/*
|
| 60 |
+
** Define: _CG_ABI_VERSION
|
| 61 |
+
*/
|
| 62 |
+
#ifndef _CG_ABI_VERSION
|
| 63 |
+
# define _CG_ABI_VERSION 1
|
| 64 |
+
#endif
|
| 65 |
+
|
| 66 |
+
/*
|
| 67 |
+
** Define: _CG_ABI_EXPERIMENTAL
|
| 68 |
+
** Desc: If enabled, sets all features enabled (ABI-breaking or experimental)
|
| 69 |
+
*/
|
| 70 |
+
#if defined(_CG_ABI_EXPERIMENTAL)
|
| 71 |
+
#endif
|
| 72 |
+
|
| 73 |
+
#define _CG_CONCAT_INNER(x, y) x ## y
|
| 74 |
+
#define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y)
|
| 75 |
+
#define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION)
|
| 76 |
+
|
| 77 |
+
#define _CG_BEGIN_NAMESPACE \
|
| 78 |
+
namespace cooperative_groups { namespace _CG_NAMESPACE {
|
| 79 |
+
/*
** Define: _CG_END_NAMESPACE
** Desc: Closes the versioned namespace and the cooperative_groups namespace
**       opened by _CG_BEGIN_NAMESPACE, and makes the versioned namespace's
**       names visible from cooperative_groups. No ';' follows the closing
**       braces: a ';' there is an empty declaration, which is ill-formed
**       before C++11 and triggers extra-semicolon warnings.
*/
#define _CG_END_NAMESPACE \
    } using namespace _CG_NAMESPACE; }
|
| 81 |
+
|
| 82 |
+
#if (defined(__cplusplus) && (__cplusplus >= 201103L)) || (defined(_MSC_VER) && (_MSC_VER >= 1900))
|
| 83 |
+
# define _CG_CPP11_FEATURES
|
| 84 |
+
#endif
|
| 85 |
+
|
| 86 |
+
#if !defined(_CG_QUALIFIER)
|
| 87 |
+
# define _CG_QUALIFIER __forceinline__ __device__
|
| 88 |
+
#endif
|
| 89 |
+
#if !defined(_CG_STATIC_QUALIFIER)
|
| 90 |
+
# define _CG_STATIC_QUALIFIER static __forceinline__ __device__
|
| 91 |
+
#endif
|
| 92 |
+
#if !defined(_CG_CONSTEXPR_QUALIFIER)
|
| 93 |
+
# if defined(_CG_CPP11_FEATURES)
|
| 94 |
+
# define _CG_CONSTEXPR_QUALIFIER constexpr __forceinline__ __device__
|
| 95 |
+
# else
|
| 96 |
+
# define _CG_CONSTEXPR_QUALIFIER _CG_QUALIFIER
|
| 97 |
+
# endif
|
| 98 |
+
#endif
|
| 99 |
+
#if !defined(_CG_STATIC_CONSTEXPR_QUALIFIER)
|
| 100 |
+
# if defined(_CG_CPP11_FEATURES)
|
| 101 |
+
# define _CG_STATIC_CONSTEXPR_QUALIFIER static constexpr __forceinline__ __device__
|
| 102 |
+
# else
|
| 103 |
+
# define _CG_STATIC_CONSTEXPR_QUALIFIER _CG_STATIC_QUALIFIER
|
| 104 |
+
# endif
|
| 105 |
+
#endif
|
| 106 |
+
|
| 107 |
+
#if defined(_MSC_VER)
|
| 108 |
+
# define _CG_DEPRECATED __declspec(deprecated)
|
| 109 |
+
#else
|
| 110 |
+
# define _CG_DEPRECATED __attribute__((deprecated))
|
| 111 |
+
#endif
|
| 112 |
+
|
| 113 |
+
#if defined(__CUDA_MINIMUM_ARCH__)
|
| 114 |
+
# define _CG_CUDA_ARCH __CUDA_MINIMUM_ARCH__
|
| 115 |
+
#elif defined(__CUDA_ARCH__)
|
| 116 |
+
# define _CG_CUDA_ARCH __CUDA_ARCH__
|
| 117 |
+
#endif
|
| 118 |
+
|
| 119 |
+
#if (_CG_CUDA_ARCH >= 600) || !defined(_CG_CUDA_ARCH)
|
| 120 |
+
# define _CG_HAS_GRID_GROUP
|
| 121 |
+
#endif
|
| 122 |
+
#if (_CG_CUDA_ARCH >= 600) || !defined(_CG_CUDA_ARCH)
|
| 123 |
+
# define _CG_HAS_MULTI_GRID_GROUP
|
| 124 |
+
#endif
|
| 125 |
+
#if (_CG_CUDA_ARCH >= 700) || !defined(_CG_CUDA_ARCH)
|
| 126 |
+
# define _CG_HAS_MATCH_COLLECTIVE
|
| 127 |
+
#endif
|
| 128 |
+
|
| 129 |
+
#if ((_CG_CUDA_ARCH >= 800) || !defined(_CG_CUDA_ARCH)) && !defined(_CG_USER_PROVIDED_SHARED_MEMORY)
|
| 130 |
+
# define _CG_HAS_RESERVED_SHARED
|
| 131 |
+
#endif
|
| 132 |
+
|
| 133 |
+
#if ((_CG_CUDA_ARCH >= 900) || !defined(_CG_CUDA_ARCH)) && \
|
| 134 |
+
(defined(__NVCC__) || defined(__CUDACC_RTC__) || defined(_CG_CLUSTER_INTRINSICS_AVAILABLE)) && \
|
| 135 |
+
defined(_CG_CPP11_FEATURES)
|
| 136 |
+
# define _CG_HAS_CLUSTER_GROUP
|
| 137 |
+
#endif
|
| 138 |
+
|
| 139 |
+
#if (_CG_CUDA_ARCH >= 900) || !defined(_CG_CUDA_ARCH)
|
| 140 |
+
# define _CG_HAS_INSTR_ELECT
|
| 141 |
+
#endif
|
| 142 |
+
|
| 143 |
+
// Has __half and __half2
|
| 144 |
+
// Only usable if you include the cuda_fp16.h extension, and
|
| 145 |
+
// _before_ including cooperative_groups.h
|
| 146 |
+
#ifdef __CUDA_FP16_TYPES_EXIST__
|
| 147 |
+
# define _CG_HAS_FP16_COLLECTIVE
|
| 148 |
+
#endif
|
| 149 |
+
|
| 150 |
+
// Include libcu++ where supported.
|
| 151 |
+
#if defined(_CG_CPP11_FEATURES) && !defined(__ibmxl__) && (!defined(_MSC_VER) || defined(_WIN64)) && \
|
| 152 |
+
!defined(_CG_LIMIT_INCLUDED_DEPENDENCIES)
|
| 153 |
+
# define _CG_USE_CUDA_STL
|
| 154 |
+
#else
|
| 155 |
+
# define _CG_USE_OWN_TRAITS
|
| 156 |
+
#endif
|
| 157 |
+
|
| 158 |
+
#if defined(_CG_USE_CUDA_STL) && !defined(__QNX__) && (!defined(__CUDA_ARCH__) || \
|
| 159 |
+
((!defined(_MSC_VER) && __CUDA_ARCH__ >= 600) || (defined(_MSC_VER) && __CUDA_ARCH__ >= 700)))
|
| 160 |
+
# define _CG_HAS_STL_ATOMICS
|
| 161 |
+
#endif
|
| 162 |
+
|
| 163 |
+
#ifdef _CG_CPP11_FEATURES
|
| 164 |
+
// Use cuda::std:: for type_traits
|
| 165 |
+
# if defined(_CG_USE_CUDA_STL)
|
| 166 |
+
# define _CG_STL_NAMESPACE cuda::std
|
| 167 |
+
# include <cuda/std/type_traits>
|
| 168 |
+
// Use CG's implementation of type traits
|
| 169 |
+
# else
|
| 170 |
+
# define _CG_STL_NAMESPACE cooperative_groups::details::templates
|
| 171 |
+
# endif
|
| 172 |
+
#endif
|
| 173 |
+
|
| 174 |
+
#ifdef _CG_CPP11_FEATURES
|
| 175 |
+
# define _CG_STATIC_CONST_DECL static constexpr
|
| 176 |
+
# define _CG_CONST_DECL constexpr
|
| 177 |
+
#else
|
| 178 |
+
# define _CG_STATIC_CONST_DECL static const
|
| 179 |
+
# define _CG_CONST_DECL const
|
| 180 |
+
#endif
|
| 181 |
+
|
| 182 |
+
#if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
|
| 183 |
+
# define _CG_ASM_PTR_CONSTRAINT "r"
|
| 184 |
+
#else
|
| 185 |
+
# define _CG_ASM_PTR_CONSTRAINT "l"
|
| 186 |
+
#endif
|
| 187 |
+
|
| 188 |
+
/*
|
| 189 |
+
** Define: CG_DEBUG
|
| 190 |
+
** What: Enables various runtime safety checks
|
| 191 |
+
*/
|
| 192 |
+
#if defined(__CUDACC_DEBUG__) && defined(CG_DEBUG) && !defined(NDEBUG)
|
| 193 |
+
# define _CG_DEBUG
|
| 194 |
+
#endif
|
| 195 |
+
|
| 196 |
+
#if defined(_CG_DEBUG)
|
| 197 |
+
# include <assert.h>
|
| 198 |
+
# define _CG_ASSERT(x) assert((x));
|
| 199 |
+
# define _CG_ABORT() assert(0);
|
| 200 |
+
#else
|
| 201 |
+
# define _CG_ASSERT(x)
|
| 202 |
+
# define _CG_ABORT() __trap();
|
| 203 |
+
#endif
|
| 204 |
+
|
| 205 |
+
_CG_BEGIN_NAMESPACE
|
| 206 |
+
|
| 207 |
+
namespace details {
|
| 208 |
+
_CG_STATIC_CONST_DECL unsigned int default_max_block_size = 1024;
|
| 209 |
+
|
| 210 |
+
#if defined(_CG_CPP11_FEATURES) && !defined(_CG_USE_CUDA_STL)
|
| 211 |
+
namespace templates {
|
| 212 |
+
|
| 213 |
+
/**
|
| 214 |
+
* Integral constants
|
| 215 |
+
**/
|
| 216 |
+
template <typename Ty, Ty Val>
|
| 217 |
+
struct integral_constant {
|
| 218 |
+
static constexpr Ty value = Val;
|
| 219 |
+
typedef Ty type;
|
| 220 |
+
|
| 221 |
+
_CG_QUALIFIER constexpr operator type() const noexcept { return value; }
|
| 222 |
+
_CG_QUALIFIER constexpr type operator()() const noexcept { return value; }
|
| 223 |
+
};
|
| 224 |
+
|
| 225 |
+
// Boolean constants: integral_constant instantiated for true and false.
using true_type = integral_constant<bool, true>;
using false_type = integral_constant<bool, false>;
|
| 227 |
+
|
| 228 |
+
/**
|
| 229 |
+
* Reference and CV qualifiers
|
| 230 |
+
**/
|
| 231 |
+
template <class Ty> struct is_lvalue_reference : public details::templates::false_type {};
|
| 232 |
+
template <class Ty> struct is_lvalue_reference<Ty&> : public details::templates::true_type {};
|
| 233 |
+
|
| 234 |
+
template <class Ty> struct remove_reference {typedef Ty type;};
|
| 235 |
+
template <class Ty> struct remove_reference<Ty&> {typedef Ty type;};
|
| 236 |
+
template <class Ty> struct remove_reference<Ty&&> {typedef Ty type;};
|
| 237 |
+
|
| 238 |
+
template <class Ty>
|
| 239 |
+
using remove_reference_t = typename details::templates::remove_reference<Ty>::type;
|
| 240 |
+
|
| 241 |
+
template <class Ty> struct remove_const {typedef Ty type;};
|
| 242 |
+
template <class Ty> struct remove_const<const Ty> {typedef Ty type;};
|
| 243 |
+
|
| 244 |
+
template <class Ty> struct remove_volatile {typedef Ty type;};
|
| 245 |
+
template <class Ty> struct remove_volatile<volatile Ty> {typedef Ty type;};
|
| 246 |
+
|
| 247 |
+
template <class Ty> struct remove_cv {typedef typename details::templates::remove_volatile<typename details::templates::remove_const<Ty>::type>::type type;};
|
| 248 |
+
|
| 249 |
+
template <class Ty>
|
| 250 |
+
using remove_cv_t = typename details::templates::remove_cv<Ty>::type;
|
| 251 |
+
|
| 252 |
+
template <class Ty>
|
| 253 |
+
_CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &t) noexcept {
|
| 254 |
+
return static_cast<Ty&&>(t);
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
template <class Ty>
|
| 258 |
+
_CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &&t) noexcept {
|
| 259 |
+
static_assert(!details::templates::is_lvalue_reference<Ty>::value, "Forwarding an rvalue as an lvalue is not allowed.");
|
| 260 |
+
return static_cast<Ty&&>(t);
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
/**
|
| 264 |
+
* is_integral
|
| 265 |
+
**/
|
| 266 |
+
template <class Ty> struct _is_integral : public details::templates::false_type {};
|
| 267 |
+
template <> struct _is_integral<bool> : public details::templates::true_type {};
|
| 268 |
+
template <> struct _is_integral<char> : public details::templates::true_type {};
|
| 269 |
+
template <> struct _is_integral<unsigned char> : public details::templates::true_type {};
|
| 270 |
+
template <> struct _is_integral<short> : public details::templates::true_type {};
|
| 271 |
+
template <> struct _is_integral<unsigned short> : public details::templates::true_type {};
|
| 272 |
+
template <> struct _is_integral<int> : public details::templates::true_type {};
|
| 273 |
+
template <> struct _is_integral<unsigned int> : public details::templates::true_type {};
|
| 274 |
+
template <> struct _is_integral<long> : public details::templates::true_type {};
|
| 275 |
+
template <> struct _is_integral<long long> : public details::templates::true_type {};
|
| 276 |
+
template <> struct _is_integral<unsigned long> : public details::templates::true_type {};
|
| 277 |
+
template <> struct _is_integral<unsigned long long> : public details::templates::true_type {};
|
| 278 |
+
//Vector type support?
|
| 279 |
+
|
| 280 |
+
template <typename Ty>
|
| 281 |
+
struct is_integral : public details::templates::_is_integral<typename details::templates::remove_cv<Ty>::type> {};
|
| 282 |
+
|
| 283 |
+
/**
|
| 284 |
+
* is_floating_point
|
| 285 |
+
**/
|
| 286 |
+
template <class Ty> struct _is_floating_point : public details::templates::false_type {};
|
| 287 |
+
template <> struct _is_floating_point<float> : public details::templates::true_type {};
|
| 288 |
+
template <> struct _is_floating_point<double> : public details::templates::true_type {};
|
| 289 |
+
template <> struct _is_floating_point<long double> : public details::templates::true_type {};
|
| 290 |
+
# ifdef __CUDA_FP16_TYPES_EXIST__
|
| 291 |
+
template <> struct _is_floating_point<__half> : public details::templates::true_type {};
|
| 292 |
+
template <> struct _is_floating_point<__half2> : public details::templates::true_type {};
|
| 293 |
+
# endif
|
| 294 |
+
//Vector type support?
|
| 295 |
+
|
| 296 |
+
template <typename Ty>
|
| 297 |
+
struct is_floating_point : public details::templates::_is_floating_point<typename details::templates::remove_cv<Ty>::type> {};
|
| 298 |
+
|
| 299 |
+
template <class T>
|
| 300 |
+
struct is_arithmetic : details::templates::integral_constant<
|
| 301 |
+
bool,
|
| 302 |
+
details::templates::is_integral<T>::value ||
|
| 303 |
+
details::templates::is_floating_point<T>::value> {};
|
| 304 |
+
|
| 305 |
+
template <typename Ty, bool = details::templates::is_arithmetic<Ty>::value>
|
| 306 |
+
struct _is_unsigned : details::templates::integral_constant<bool, Ty(0) < Ty(-1)> {};
|
| 307 |
+
|
| 308 |
+
template <typename Ty>
|
| 309 |
+
struct _is_unsigned<Ty,false> : details::templates::false_type {};
|
| 310 |
+
|
| 311 |
+
template <typename Ty>
|
| 312 |
+
struct is_unsigned : _is_unsigned<typename details::templates::remove_cv<Ty>::type> {};
|
| 313 |
+
|
| 314 |
+
template <typename Ty> struct _is_pointer : public details::templates::false_type {};
|
| 315 |
+
template <typename Ty> struct _is_pointer<Ty*> : public details::templates::true_type {};
|
| 316 |
+
|
| 317 |
+
template <typename Ty>
|
| 318 |
+
struct is_pointer : _is_pointer<typename details::templates::remove_cv<Ty>::type> {};
|
| 319 |
+
|
| 320 |
+
/**
|
| 321 |
+
* programmatic type traits
|
| 322 |
+
**/
|
| 323 |
+
template<bool B, class Ty = void>
|
| 324 |
+
struct enable_if {};
|
| 325 |
+
|
| 326 |
+
template<class Ty>
|
| 327 |
+
struct enable_if<true, Ty> { typedef Ty type; };
|
| 328 |
+
|
| 329 |
+
template<bool Cond, typename Ty = void>
|
| 330 |
+
using enable_if_t = typename details::templates::enable_if<Cond, Ty>::type;
|
| 331 |
+
|
| 332 |
+
template<class Ty1, class Ty2>
|
| 333 |
+
struct is_same : details::templates::false_type {};
|
| 334 |
+
|
| 335 |
+
template<class Ty>
|
| 336 |
+
struct is_same<Ty, Ty> : details::templates::true_type {};
|
| 337 |
+
|
| 338 |
+
} // templates
|
| 339 |
+
#endif // _CG_CPP11_FEATURES && !_CG_USE_CUDA_STL
|
| 340 |
+
|
| 341 |
+
} // details
|
| 342 |
+
_CG_END_NAMESPACE
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
#endif // _CG_INFO_H_
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/invoke.h
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CG_INVOKE_H
|
| 51 |
+
#define _CG_INVOKE_H
|
| 52 |
+
|
| 53 |
+
#include "info.h"
|
| 54 |
+
#include "helpers.h"
|
| 55 |
+
|
| 56 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 57 |
+
|
| 58 |
+
_CG_BEGIN_NAMESPACE
|
| 59 |
+
|
| 60 |
+
namespace details {
|
| 61 |
+
|
| 62 |
+
template <typename Group>
|
| 63 |
+
struct _elect_group_supported : _CG_STL_NAMESPACE::false_type {};
|
| 64 |
+
#ifdef _CG_HAS_INSTR_ELECT
|
| 65 |
+
template<>
|
| 66 |
+
struct _elect_group_supported<coalesced_group> : _CG_STL_NAMESPACE::true_type {};
|
| 67 |
+
template<unsigned int Size, typename Parent>
|
| 68 |
+
struct _elect_group_supported<thread_block_tile<Size, Parent>> :
|
| 69 |
+
_CG_STL_NAMESPACE::integral_constant<bool, (Size <= 32)> {};
|
| 70 |
+
#endif
|
| 71 |
+
|
| 72 |
+
template <typename Group>
|
| 73 |
+
struct elect_group_supported : public _elect_group_supported<details::remove_qual<Group>> {};
|
| 74 |
+
|
| 75 |
+
template<typename Group>
|
| 76 |
+
_CG_STATIC_QUALIFIER bool elect_one(const Group& group, unsigned int mask, unsigned int& leader_lane) {
|
| 77 |
+
int is_leader = 0;
|
| 78 |
+
#ifdef _CG_HAS_INSTR_ELECT
|
| 79 |
+
asm("{\n\t"
|
| 80 |
+
" .reg .pred p;\n\t"
|
| 81 |
+
" elect.sync %0|p, %2;\n\t"
|
| 82 |
+
" @p mov.s32 %1, 1;\n\t"
|
| 83 |
+
"}"
|
| 84 |
+
: "+r"(leader_lane), "+r"(is_leader) : "r" (mask));
|
| 85 |
+
#endif
|
| 86 |
+
return is_leader;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
template<bool UseElect>
|
| 90 |
+
struct invoke_one_impl {};
|
| 91 |
+
|
| 92 |
+
template<>
|
| 93 |
+
struct invoke_one_impl<true> {
|
| 94 |
+
template<typename Group, typename Fn, typename... Args>
|
| 95 |
+
_CG_STATIC_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) {
|
| 96 |
+
auto mask = details::_coalesced_group_data_access::get_mask(group);
|
| 97 |
+
unsigned int leader_lane = 0;
|
| 98 |
+
|
| 99 |
+
if (elect_one(group, mask, leader_lane)) {
|
| 100 |
+
_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
|
| 101 |
+
}
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
template<typename Group, typename Fn, typename... Args>
|
| 105 |
+
_CG_STATIC_QUALIFIER auto invoke_one_broadcast(const Group& group, Fn&& fn, Args&&... args)
|
| 106 |
+
-> typename _CG_STL_NAMESPACE::remove_reference<
|
| 107 |
+
decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
|
| 108 |
+
|
| 109 |
+
using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
|
| 110 |
+
details::remove_qual<ResultType> result;
|
| 111 |
+
auto mask = details::_coalesced_group_data_access::get_mask(group);
|
| 112 |
+
unsigned int leader_lane = 0;
|
| 113 |
+
|
| 114 |
+
if (elect_one(group, mask, leader_lane)) {
|
| 115 |
+
result = _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
// Need to use low level api instead of group.shfl, because elect_one returns lane id, not group rank.
|
| 119 |
+
return tile::shuffle_dispatch<ResultType>::shfl(result, mask, leader_lane, 32);
|
| 120 |
+
}
|
| 121 |
+
};
|
| 122 |
+
|
| 123 |
+
template<>
|
| 124 |
+
struct invoke_one_impl<false> {
|
| 125 |
+
template<typename Group, typename Fn, typename... Args>
|
| 126 |
+
_CG_STATIC_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) {
|
| 127 |
+
if (group.thread_rank() == 0) {
|
| 128 |
+
_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
|
| 129 |
+
}
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
template<typename Group, typename Fn, typename... Args>
|
| 133 |
+
_CG_STATIC_QUALIFIER auto invoke_one_broadcast(const Group& group, Fn&& fn, Args&&... args)
|
| 134 |
+
-> typename _CG_STL_NAMESPACE::remove_reference<
|
| 135 |
+
decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
|
| 136 |
+
|
| 137 |
+
using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
|
| 138 |
+
details::remove_qual<ResultType> result;
|
| 139 |
+
|
| 140 |
+
if (group.thread_rank() == 0) {
|
| 141 |
+
result = _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
return group.shfl(result, 0);
|
| 145 |
+
}
|
| 146 |
+
};
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
}; // namespace details
|
| 150 |
+
|
| 151 |
+
template<typename Group, typename Fn, typename... Args>
|
| 152 |
+
_CG_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) {
|
| 153 |
+
using impl = details::invoke_one_impl<details::elect_group_supported<Group>::value>;
|
| 154 |
+
impl::invoke_one(group, _CG_STL_NAMESPACE::forward<Fn>(fn), _CG_STL_NAMESPACE::forward<Args>(args)...);
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
template<typename Fn, typename... Args>
|
| 158 |
+
_CG_QUALIFIER auto invoke_one_broadcast(const coalesced_group& group, Fn&& fn, Args&&... args)
|
| 159 |
+
-> typename _CG_STL_NAMESPACE::remove_reference<
|
| 160 |
+
decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
|
| 161 |
+
|
| 162 |
+
using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
|
| 163 |
+
static_assert(!_CG_STL_NAMESPACE::is_same<ResultType, void>::value,
|
| 164 |
+
"For invocables returning void invoke_one should be used instead");
|
| 165 |
+
using impl = details::invoke_one_impl<details::elect_group_supported<coalesced_group>::value>;
|
| 166 |
+
return impl::invoke_one_broadcast(group,
|
| 167 |
+
_CG_STL_NAMESPACE::forward<Fn>(fn),
|
| 168 |
+
_CG_STL_NAMESPACE::forward<Args>(args)...);
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
template<unsigned int Size, typename Parent, typename Fn, typename... Args>
|
| 172 |
+
_CG_QUALIFIER auto invoke_one_broadcast(const thread_block_tile<Size, Parent>& group, Fn&& fn, Args&&... args)
|
| 173 |
+
-> typename _CG_STL_NAMESPACE::remove_reference<
|
| 174 |
+
decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
|
| 175 |
+
|
| 176 |
+
using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
|
| 177 |
+
static_assert(!_CG_STL_NAMESPACE::is_same<ResultType, void>::value,
|
| 178 |
+
"For invocables returning void invoke_one should be used instead");
|
| 179 |
+
using impl = details::invoke_one_impl<details::elect_group_supported<thread_block_tile<Size, Parent>>::value>;
|
| 180 |
+
return impl::invoke_one_broadcast(group,
|
| 181 |
+
_CG_STL_NAMESPACE::forward<Fn>(fn),
|
| 182 |
+
_CG_STL_NAMESPACE::forward<Args>(args)...);
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
_CG_END_NAMESPACE
|
| 186 |
+
|
| 187 |
+
#endif //_CG_CPP11_FEATURES
|
| 188 |
+
|
| 189 |
+
#endif // _CG_INVOKE_H
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/memory.h
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_MEMORY_H_
|
| 50 |
+
# define _COOPERATIVE_GROUPS_MEMORY_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
|
| 54 |
+
_CG_BEGIN_NAMESPACE
|
| 55 |
+
|
| 56 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 57 |
+
namespace details {
|
| 58 |
+
// Number of padding bytes kept free in the scratch layout; it sizes multi_warp_scratch::reserved.
_CG_STATIC_CONST_DECL int scratch_num_reserved_bytes = 12;
|
| 59 |
+
|
| 60 |
+
// Should only be called for SM80+
|
| 61 |
+
_CG_STATIC_QUALIFIER void* reserved_shared_ptr()
|
| 62 |
+
{
|
| 63 |
+
unsigned long long ptr = 0;
|
| 64 |
+
NV_IF_TARGET(NV_PROVIDES_SM_80,
|
| 65 |
+
(asm ("{\n\t"
|
| 66 |
+
" .reg .u32 start;\n\t"
|
| 67 |
+
" .reg .u64 extended;\n\t"
|
| 68 |
+
" mov.u32 start, %%reserved_smem_offset_1;\n\t"
|
| 69 |
+
" cvt.u64.u32 extended, start;\n\t"
|
| 70 |
+
" cvta.shared.u64 %0, extended;\n\t"
|
| 71 |
+
"}"
|
| 72 |
+
: "=l"(ptr));)
|
| 73 |
+
)
|
| 74 |
+
return reinterpret_cast<void*>(ptr);
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
struct multi_warp_scratch {
|
| 78 |
+
// One barrier per possible size of the group.
|
| 79 |
+
_CG_STATIC_CONST_DECL unsigned int memory_barriers_count = 5;
|
| 80 |
+
_CG_STATIC_CONST_DECL size_t sync_memory_size = memory_barriers_count * sizeof(barrier_t);
|
| 81 |
+
|
| 82 |
+
using communication_type = unsigned long long;
|
| 83 |
+
_CG_STATIC_CONST_DECL size_t communication_size = sizeof(communication_type);
|
| 84 |
+
|
| 85 |
+
// Layout of the scratch space:
|
| 86 |
+
barrier_t barriers[memory_barriers_count];
|
| 87 |
+
char reserved[scratch_num_reserved_bytes]; // Reserve 12 bytes for future use
|
| 88 |
+
communication_type communication_memory[default_max_block_size / 32];
|
| 89 |
+
|
| 90 |
+
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int max_block_size) {
|
| 91 |
+
// One slot of collectives memory per warp.
|
| 92 |
+
return scratch_num_reserved_bytes + (unsigned int)sync_memory_size + max_block_size / 32 * (unsigned int)communication_size;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
_CG_QUALIFIER void init_barriers(unsigned int thread_rank) {
|
| 96 |
+
if (thread_rank < memory_barriers_count) {
|
| 97 |
+
barriers[thread_rank] = 0;
|
| 98 |
+
}
|
| 99 |
+
}
|
| 100 |
+
};
|
| 101 |
+
|
| 102 |
+
#if defined(_CG_HAS_RESERVED_SHARED)
|
| 103 |
+
// CG can expect at least 288 bytes available in reserved shared
|
| 104 |
+
static_assert(sizeof(multi_warp_scratch) <= 288, "multi-warp scratch size is too large");
|
| 105 |
+
#endif
|
| 106 |
+
|
| 107 |
+
// Make sure the structure can fit into the user provided memory
|
| 108 |
+
static_assert(sizeof(multi_warp_scratch) <= multi_warp_scratch::scratch_size_needed(default_max_block_size),
|
| 109 |
+
"multi-warp scratch size is too large");
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
// Picks the scratch area for multi-warp collectives.
// When compiling for SM80 or newer the pointer returned by
// reserved_shared_ptr() is used and user_scratch is ignored; on every other
// target the caller-supplied user_scratch buffer is used.
_CG_QUALIFIER multi_warp_scratch* get_scratch_ptr(void* user_scratch) {
    void* scratch_base = user_scratch;
    NV_IF_TARGET(NV_PROVIDES_SM_80,
        (scratch_base = reserved_shared_ptr();)
    )
    return static_cast<multi_warp_scratch*>(scratch_base);
}
|
| 122 |
+
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
template <unsigned int MaxBlockSize = details::default_max_block_size>
|
| 126 |
+
struct __align__(details::multi_warp_scratch::communication_size) block_tile_memory {
|
| 127 |
+
private:
|
| 128 |
+
#if !defined(_CG_HAS_RESERVED_SHARED)
|
| 129 |
+
char scratch[details::multi_warp_scratch::scratch_size_needed(MaxBlockSize)];
|
| 130 |
+
#endif
|
| 131 |
+
};
|
| 132 |
+
#endif
|
| 133 |
+
|
| 134 |
+
_CG_END_NAMESPACE
|
| 135 |
+
|
| 136 |
+
#endif /* !_COOPERATIVE_GROUPS_MEMORY_H_ */
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CG_PARTITIONING_H
|
| 51 |
+
#define _CG_PARTITIONING_H
|
| 52 |
+
|
| 53 |
+
#include "info.h"
|
| 54 |
+
#include "helpers.h"
|
| 55 |
+
|
| 56 |
+
_CG_BEGIN_NAMESPACE
|
| 57 |
+
|
| 58 |
+
namespace details {
|
| 59 |
+
|
| 60 |
+
template <typename TyGroup>
|
| 61 |
+
_CG_STATIC_QUALIFIER coalesced_group _binary_partition(const TyGroup &tile, bool pred) {
|
| 62 |
+
const unsigned int fullMask = ~0u;
|
| 63 |
+
|
| 64 |
+
unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
|
| 65 |
+
unsigned int predMask = pred ? 0 : fullMask;
|
| 66 |
+
unsigned int setMask = __ballot_sync(thisMask, pred);
|
| 67 |
+
|
| 68 |
+
if (setMask == thisMask || setMask == 0) {
|
| 69 |
+
coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(thisMask);
|
| 70 |
+
_coalesced_group_data_access::modify_meta_group(subTile, 0, 1);
|
| 71 |
+
return subTile;
|
| 72 |
+
}
|
| 73 |
+
else {
|
| 74 |
+
unsigned int subMask = thisMask & (setMask ^ predMask);
|
| 75 |
+
coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
|
| 76 |
+
_coalesced_group_data_access::modify_meta_group(subTile, pred, 2);
|
| 77 |
+
return subTile;
|
| 78 |
+
}
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
#if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
|
| 82 |
+
template <typename TyPredicate>
|
| 83 |
+
struct _labeled_partition_dispatch {
|
| 84 |
+
template <typename TyGroup>
|
| 85 |
+
_CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, TyPredicate pred) {
|
| 86 |
+
unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
|
| 87 |
+
unsigned int subMask = __match_any_sync(thisMask, pred);
|
| 88 |
+
unsigned int laneId = details::laneid();
|
| 89 |
+
|
| 90 |
+
coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
|
| 91 |
+
|
| 92 |
+
int leaderLaneId = __ffs(subMask) - 1;
|
| 93 |
+
bool isLeader = leaderLaneId == laneId;
|
| 94 |
+
unsigned int leaderMask = __ballot_sync(thisMask, isLeader);
|
| 95 |
+
|
| 96 |
+
// Count leaders with lower laneid, that will be the meta rank of this tile
|
| 97 |
+
unsigned int tileRank = __popc(leaderMask & ((1 << leaderLaneId) - 1));
|
| 98 |
+
|
| 99 |
+
_coalesced_group_data_access::modify_meta_group(subTile, tileRank, __popc(leaderMask));
|
| 100 |
+
|
| 101 |
+
return subTile;
|
| 102 |
+
}
|
| 103 |
+
};
|
| 104 |
+
|
| 105 |
+
template <>
|
| 106 |
+
struct _labeled_partition_dispatch<bool> {
|
| 107 |
+
template <typename TyGroup>
|
| 108 |
+
_CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, bool pred) {
|
| 109 |
+
return _binary_partition(tile, pred);
|
| 110 |
+
}
|
| 111 |
+
};
|
| 112 |
+
|
| 113 |
+
template <typename TyPredicate>
|
| 114 |
+
struct _labeled_partition_dispatch<TyPredicate*> {
|
| 115 |
+
template <typename TyGroup>
|
| 116 |
+
_CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, TyPredicate* pred) {
|
| 117 |
+
auto impl = _labeled_partition_dispatch<unsigned long long>();
|
| 118 |
+
return impl(tile, reinterpret_cast<unsigned long long>(pred));
|
| 119 |
+
}
|
| 120 |
+
};
|
| 121 |
+
#endif
|
| 122 |
+
}; // namespace details
|
| 123 |
+
|
| 124 |
+
// Public entry point: partitions a coalesced group by a boolean predicate.
// Forwards to details::_binary_partition.
_CG_STATIC_QUALIFIER coalesced_group binary_partition(const coalesced_group &tile, bool pred) {
    return details::_binary_partition<coalesced_group>(tile, pred);
}
|
| 127 |
+
|
| 128 |
+
template <unsigned int Size, typename ParentT>
|
| 129 |
+
_CG_STATIC_QUALIFIER coalesced_group binary_partition(const thread_block_tile<Size, ParentT> &tile, bool pred) {
|
| 130 |
+
#ifdef _CG_CPP11_FEATURES
|
| 131 |
+
static_assert(Size <= 32, "Binary partition is available only for tiles of size smaller or equal to 32");
|
| 132 |
+
#endif
|
| 133 |
+
return details::_binary_partition(tile, pred);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
#if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
|
| 138 |
+
template <typename TyPredicate>
|
| 139 |
+
_CG_STATIC_QUALIFIER coalesced_group labeled_partition(const coalesced_group &tile, TyPredicate pred) {
|
| 140 |
+
static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value ||
|
| 141 |
+
_CG_STL_NAMESPACE::is_pointer<TyPredicate>::value,
|
| 142 |
+
"labeled_partition predicate must be an integral or pointer type");
|
| 143 |
+
auto dispatch = details::_labeled_partition_dispatch<details::remove_qual<TyPredicate>>();
|
| 144 |
+
return dispatch(tile, pred);
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
template <typename TyPredicate, unsigned int Size, typename ParentT>
|
| 148 |
+
_CG_STATIC_QUALIFIER coalesced_group labeled_partition(const thread_block_tile<Size, ParentT> &tile, TyPredicate pred) {
|
| 149 |
+
static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value ||
|
| 150 |
+
_CG_STL_NAMESPACE::is_pointer<TyPredicate>::value,
|
| 151 |
+
"labeled_partition predicate must be an integral or pointer type");
|
| 152 |
+
static_assert(Size <= 32, "Labeled partition is available only for tiles of size smaller or equal to 32");
|
| 153 |
+
auto dispatch = details::_labeled_partition_dispatch<details::remove_qual<TyPredicate>>();
|
| 154 |
+
return dispatch(tile, pred);
|
| 155 |
+
}
|
| 156 |
+
#endif
|
| 157 |
+
|
| 158 |
+
_CG_END_NAMESPACE
|
| 159 |
+
|
| 160 |
+
#endif // _CG_PARTITIONING_H
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h
ADDED
|
@@ -0,0 +1,424 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_REDUCE_H_
|
| 50 |
+
#define _CG_REDUCE_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
#include "coalesced_reduce.h"
|
| 55 |
+
#include "functional.h"
|
| 56 |
+
#include "cooperative_groups.h"
|
| 57 |
+
|
| 58 |
+
_CG_BEGIN_NAMESPACE
|
| 59 |
+
|
| 60 |
+
namespace details {
|
| 61 |
+
|
| 62 |
+
template <class Ty>
|
| 63 |
+
using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant<
|
| 64 |
+
bool,
|
| 65 |
+
_CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>;
|
| 66 |
+
|
| 67 |
+
template <class Ty>
|
| 68 |
+
using redux_is_add_supported = _redux_is_add_supported<Ty>;
|
| 69 |
+
|
| 70 |
+
// A specialization for 64 bit logical operations is possible
|
| 71 |
+
// but for now only accelerate 32 bit bitwise ops
|
| 72 |
+
template <class Ty>
|
| 73 |
+
using redux_is_logical_supported = redux_is_add_supported<Ty>;
|
| 74 |
+
|
| 75 |
+
// Base operator support case
|
| 76 |
+
template <class TyOp, class Ty> struct _redux_op_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 77 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>, Ty> : public redux_is_add_supported<Ty> {};
|
| 78 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>, Ty> : public redux_is_add_supported<Ty> {};
|
| 79 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {};
|
| 80 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
|
| 81 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
|
| 82 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
|
| 83 |
+
|
| 84 |
+
template <class Ty, template <class> class TyOp>
|
| 85 |
+
using redux_op_supported = _redux_op_supported<
|
| 86 |
+
typename details::remove_qual<TyOp<Ty>>,
|
| 87 |
+
Ty>;
|
| 88 |
+
|
| 89 |
+
// Groups smaller than 16 actually have worse performance characteristics when used with redux
|
| 90 |
+
// tiles of size 16 and 32 perform the same or better and have better code generation profiles
|
| 91 |
+
template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {};
|
| 92 |
+
|
| 93 |
+
template <unsigned int Sz, typename TyPar>
|
| 94 |
+
struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
|
| 95 |
+
bool,
|
| 96 |
+
(Sz >= 16)> {};
|
| 97 |
+
template <unsigned int Sz, typename TyPar>
|
| 98 |
+
struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
|
| 99 |
+
bool,
|
| 100 |
+
(Sz >= 16)> {};
|
| 101 |
+
template <>
|
| 102 |
+
struct _redux_group_optimized<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
|
| 103 |
+
|
| 104 |
+
template <typename TyGroup>
|
| 105 |
+
using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>;
|
| 106 |
+
|
| 107 |
+
template <template <class> class TyOp>
|
| 108 |
+
_CG_STATIC_QUALIFIER int pick_redux(int mask, int val);
|
| 109 |
+
template <template <class> class TyOp>
|
| 110 |
+
_CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val);
|
| 111 |
+
|
| 112 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) {
|
| 113 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_add_sync(mask, val);), return 0;)
|
| 114 |
+
}
|
| 115 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) {
|
| 116 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_min_sync(mask, val);), return 0;)
|
| 117 |
+
}
|
| 118 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) {
|
| 119 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_max_sync(mask, val);), return 0;)
|
| 120 |
+
}
|
| 121 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) {
|
| 122 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return static_cast<int>(__reduce_and_sync(mask, val));), return 0;)
|
| 123 |
+
}
|
| 124 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_xor>(int mask, int val) {
|
| 125 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return static_cast<int>(__reduce_xor_sync(mask, val));), return 0;)
|
| 126 |
+
}
|
| 127 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) {
|
| 128 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return static_cast<int>(__reduce_or_sync(mask, val));), return 0;)
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) {
|
| 132 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_add_sync(mask, val);), return 0;)
|
| 133 |
+
}
|
| 134 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) {
|
| 135 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_min_sync(mask, val);), return 0;)
|
| 136 |
+
}
|
| 137 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) {
|
| 138 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_max_sync(mask, val);), return 0;)
|
| 139 |
+
}
|
| 140 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) {
|
| 141 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_and_sync(mask, val);), return 0;)
|
| 142 |
+
}
|
| 143 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) {
|
| 144 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_xor_sync(mask, val);), return 0;)
|
| 145 |
+
}
|
| 146 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) {
|
| 147 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_or_sync(mask, val);), return 0;)
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value>
|
| 151 |
+
struct _accelerated_op;
|
| 152 |
+
|
| 153 |
+
// Signed type redux intrinsic dispatch
|
| 154 |
+
template <typename TyVal>
|
| 155 |
+
struct _accelerated_op<TyVal, false> {
|
| 156 |
+
template <template <class> class TyOp>
|
| 157 |
+
_CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
|
| 158 |
+
return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val)));
|
| 159 |
+
}
|
| 160 |
+
};
|
| 161 |
+
|
| 162 |
+
// Unsigned type redux intrinsic dispatch
|
| 163 |
+
template <typename TyVal>
|
| 164 |
+
struct _accelerated_op<TyVal, true> {
|
| 165 |
+
template <template <class> class TyOp>
|
| 166 |
+
_CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
|
| 167 |
+
return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val)));
|
| 168 |
+
}
|
| 169 |
+
};
|
| 170 |
+
|
| 171 |
+
template <typename TyVal>
|
| 172 |
+
using accelerated_op = _accelerated_op<TyVal>;
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
template <typename TyVal, typename TyFnInput, typename TyGroup>
|
| 176 |
+
class _redux_dispatch {
|
| 177 |
+
template <class Ty, template <class> class TyOp>
|
| 178 |
+
using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool,
|
| 179 |
+
redux_op_supported<Ty, TyOp>::value &&
|
| 180 |
+
redux_group_optimized<TyGroup>::value>;
|
| 181 |
+
|
| 182 |
+
template <class Ty, template <class> class TyOp>
|
| 183 |
+
using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*;
|
| 184 |
+
|
| 185 |
+
template <class Ty, template <class> class TyOp>
|
| 186 |
+
using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*;
|
| 187 |
+
|
| 188 |
+
public:
|
| 189 |
+
// Dispatch to redux if the combination of op and args are supported
|
| 190 |
+
template<
|
| 191 |
+
template <class> class TyOp,
|
| 192 |
+
redux_is_usable<TyFnInput, TyOp> = nullptr>
|
| 193 |
+
_CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
|
| 194 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
|
| 195 |
+
// Retrieve the mask for the group and dispatch to redux
|
| 196 |
+
return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
|
| 197 |
+
,
|
| 198 |
+
// Arch does not support redux, fallback to shuffles
|
| 199 |
+
return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
|
| 200 |
+
)
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
template<
|
| 204 |
+
template <class> class TyOp,
|
| 205 |
+
redux_is_usable<TyFnInput, TyOp> = nullptr>
|
| 206 |
+
_CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
|
| 207 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
|
| 208 |
+
// Retrieve the mask for the group and dispatch to redux
|
| 209 |
+
return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
|
| 210 |
+
,
|
| 211 |
+
// Arch does not support redux, fallback to shuffles
|
| 212 |
+
return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
|
| 213 |
+
)
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
// Fallback shuffle sync reduction
|
| 217 |
+
template <
|
| 218 |
+
template <class> class TyOp,
|
| 219 |
+
redux_is_not_usable<TyFnInput, TyOp> = nullptr>
|
| 220 |
+
_CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
|
| 221 |
+
//Dispatch to fallback shuffle sync accelerated reduction
|
| 222 |
+
return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
};
|
| 226 |
+
|
| 227 |
+
// Group support for reduce.
|
| 228 |
+
template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 229 |
+
|
| 230 |
+
template <unsigned int Sz, typename TyPar>
|
| 231 |
+
struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
|
| 232 |
+
template <unsigned int Sz, typename TyPar>
|
| 233 |
+
struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
|
| 234 |
+
template <>
|
| 235 |
+
struct _reduce_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
|
| 236 |
+
|
| 237 |
+
template <typename TyGroup>
|
| 238 |
+
using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>;
|
| 239 |
+
|
| 240 |
+
template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
|
| 241 |
+
_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
|
| 242 |
+
static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
|
| 243 |
+
|
| 244 |
+
using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
|
| 245 |
+
return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
|
| 249 |
+
_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
|
| 250 |
+
static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
|
| 251 |
+
|
| 252 |
+
using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
|
| 253 |
+
return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
template <typename TyVal, typename TyOp, typename TyGroup>
|
| 258 |
+
_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
|
| 259 |
+
return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
template <unsigned int GroupId>
|
| 263 |
+
struct tile_reduce_dispatch;
|
| 264 |
+
|
| 265 |
+
template <>
|
| 266 |
+
struct tile_reduce_dispatch<details::coalesced_group_id> {
|
| 267 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 268 |
+
_CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 269 |
+
return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 270 |
+
}
|
| 271 |
+
};
|
| 272 |
+
|
| 273 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 274 |
+
template <>
|
| 275 |
+
struct tile_reduce_dispatch<details::multi_tile_group_id> {
|
| 276 |
+
template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
|
| 277 |
+
_CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 278 |
+
using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
|
| 279 |
+
using TyRet = details::remove_qual<TyVal>;
|
| 280 |
+
const unsigned int num_warps = Size / 32;
|
| 281 |
+
|
| 282 |
+
auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
|
| 283 |
+
*warp_scratch_location =
|
| 284 |
+
details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 285 |
+
};
|
| 286 |
+
auto inter_warp_lambda =
|
| 287 |
+
[&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
|
| 288 |
+
*thread_scratch_location =
|
| 289 |
+
details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 290 |
+
};
|
| 291 |
+
return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
|
| 292 |
+
}
|
| 293 |
+
};
|
| 294 |
+
|
| 295 |
+
template <unsigned int GroupId>
|
| 296 |
+
struct tile_async_reduce_dispatch;
|
| 297 |
+
|
| 298 |
+
template <>
|
| 299 |
+
struct tile_async_reduce_dispatch<details::coalesced_group_id> {
|
| 300 |
+
template <typename GroupT, typename TyDst, typename TyVal, typename TyFn, typename TyResHandler>
|
| 301 |
+
_CG_STATIC_QUALIFIER void reduce(const GroupT& group, TyDst& dst, TyVal&& val, TyFn&& op, TyResHandler& res_handler) {
|
| 302 |
+
// Do regular, in group reduction
|
| 303 |
+
auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 304 |
+
|
| 305 |
+
// One thread stores/updates the destination
|
| 306 |
+
if (group.thread_rank() == 0) {
|
| 307 |
+
res_handler(result);
|
| 308 |
+
}
|
| 309 |
+
}
|
| 310 |
+
};
|
| 311 |
+
|
| 312 |
+
template <>
|
| 313 |
+
struct tile_async_reduce_dispatch<details::multi_tile_group_id> {
|
| 314 |
+
template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn, typename TyResHandler>
|
| 315 |
+
_CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op, TyResHandler& res_handler) {
|
| 316 |
+
using TyVal = remove_qual<TyInputVal>;
|
| 317 |
+
const unsigned int num_warps = TySize / 32;
|
| 318 |
+
details::barrier_t* sync_location = multi_warp_sync_location_getter(group);
|
| 319 |
+
auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32);
|
| 320 |
+
|
| 321 |
+
// Do in warp reduce
|
| 322 |
+
auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>();
|
| 323 |
+
*warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op);
|
| 324 |
+
|
| 325 |
+
// Tile of size num_warps from the last warp to arrive does final reduction step
|
| 326 |
+
if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) {
|
| 327 |
+
auto subwarp = details::tiled_partition_internal<num_warps, decltype(warp)>();
|
| 328 |
+
if (subwarp.meta_group_rank() == 0) {
|
| 329 |
+
auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank());
|
| 330 |
+
auto thread_val = *thread_scratch_location;
|
| 331 |
+
// Release other warps, we read their contribution already.
|
| 332 |
+
subwarp.sync();
|
| 333 |
+
details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps);
|
| 334 |
+
TyVal result = details::reduce(subwarp, thread_val, op);
|
| 335 |
+
// One thread stores the result or updates the atomic
|
| 336 |
+
if (subwarp.thread_rank() == 0) {
|
| 337 |
+
res_handler(result);
|
| 338 |
+
}
|
| 339 |
+
}
|
| 340 |
+
warp.sync();
|
| 341 |
+
}
|
| 342 |
+
}
|
| 343 |
+
};
|
| 344 |
+
#endif
|
| 345 |
+
|
| 346 |
+
template <typename TyGroup, typename TyInputVal, typename TyRetVal>
|
| 347 |
+
_CG_QUALIFIER void check_reduce_params() {
|
| 348 |
+
static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
|
| 349 |
+
static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
|
| 350 |
+
};
|
| 351 |
+
|
| 352 |
+
template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
|
| 353 |
+
_CG_QUALIFIER void check_async_reduce_params() {
|
| 354 |
+
check_reduce_params<TyGroup, TyInputVal, TyRetVal>();
|
| 355 |
+
static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
|
| 356 |
+
}
|
| 357 |
+
} // details
|
| 358 |
+
|
| 359 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 360 |
+
_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 361 |
+
details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>();
|
| 362 |
+
|
| 363 |
+
using dispatch = details::tile_reduce_dispatch<TyGroup::_group_id>;
|
| 364 |
+
return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 368 |
+
|
| 369 |
+
# if defined(_CG_HAS_STL_ATOMICS)
|
| 370 |
+
template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
|
| 371 |
+
void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
|
| 372 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 373 |
+
auto update_lambda = [&] (TyVal& result) {
|
| 374 |
+
details::atomic_update(dst, result, op);
|
| 375 |
+
};
|
| 376 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
|
| 377 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), update_lambda);
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
|
| 381 |
+
void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
|
| 382 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 383 |
+
auto update_lambda = [&] (TyVal& result) {
|
| 384 |
+
details::atomic_update(dst, result, op);
|
| 385 |
+
};
|
| 386 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
|
| 387 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), update_lambda);
|
| 388 |
+
}
|
| 389 |
+
|
| 390 |
+
template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
|
| 391 |
+
void _CG_QUALIFIER reduce_store_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
|
| 392 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 393 |
+
auto store_lambda = [&] (TyVal& result) {
|
| 394 |
+
details::atomic_store(dst, result);
|
| 395 |
+
};
|
| 396 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
|
| 397 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
|
| 401 |
+
void _CG_QUALIFIER reduce_store_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
|
| 402 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 403 |
+
auto store_lambda = [&] (TyVal& result) {
|
| 404 |
+
details::atomic_store(dst, result);
|
| 405 |
+
};
|
| 406 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
|
| 407 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
|
| 408 |
+
}
|
| 409 |
+
# endif
|
| 410 |
+
|
| 411 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn>
|
| 412 |
+
void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) {
|
| 413 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 414 |
+
auto store_lambda = [&] (TyVal& result) {
|
| 415 |
+
*dst = result;
|
| 416 |
+
};
|
| 417 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
|
| 418 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
|
| 419 |
+
}
|
| 420 |
+
#endif
|
| 421 |
+
|
| 422 |
+
_CG_END_NAMESPACE
|
| 423 |
+
|
| 424 |
+
#endif // _CG_REDUCE_H_
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_SCAN_H_
|
| 50 |
+
#define _CG_SCAN_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
#include "functional.h"
|
| 55 |
+
#include "coalesced_scan.h"
|
| 56 |
+
|
| 57 |
+
_CG_BEGIN_NAMESPACE
|
| 58 |
+
|
| 59 |
+
namespace details {
|
| 60 |
+
|
| 61 |
+
// Group support for scan.
|
| 62 |
+
template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 63 |
+
|
| 64 |
+
template <unsigned int Sz, typename TyPar>
|
| 65 |
+
struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
|
| 66 |
+
template <unsigned int Sz, typename TyPar>
|
| 67 |
+
struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
|
| 68 |
+
template <>
|
| 69 |
+
struct _scan_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
|
| 70 |
+
|
| 71 |
+
template <typename TyGroup>
|
| 72 |
+
using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>;
|
| 73 |
+
|
| 74 |
+
template <bool IsIntegralPlus>
|
| 75 |
+
struct integral_optimized_scan;
|
| 76 |
+
|
| 77 |
+
enum class ScanType { exclusive, inclusive };
|
| 78 |
+
|
| 79 |
+
template <unsigned int GroupId, ScanType TyScan>
|
| 80 |
+
struct scan_dispatch;
|
| 81 |
+
|
| 82 |
+
template <ScanType TyScan>
|
| 83 |
+
struct scan_dispatch<details::coalesced_group_id, TyScan> {
|
| 84 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 85 |
+
_CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 86 |
+
auto scan_result = coalesced_inclusive_scan(group, val, op);
|
| 87 |
+
if (TyScan == ScanType::exclusive) {
|
| 88 |
+
scan_result = convert_inclusive_to_exclusive(group,
|
| 89 |
+
scan_result,
|
| 90 |
+
_CG_STL_NAMESPACE::forward<TyVal>(val),
|
| 91 |
+
_CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 92 |
+
}
|
| 93 |
+
return scan_result;
|
| 94 |
+
}
|
| 95 |
+
};
|
| 96 |
+
|
| 97 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 98 |
+
template <ScanType TyScan>
|
| 99 |
+
struct scan_dispatch<details::multi_tile_group_id, TyScan> {
|
| 100 |
+
template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
|
| 101 |
+
_CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 102 |
+
using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
|
| 103 |
+
using TyRet = details::remove_qual<TyVal>;
|
| 104 |
+
const unsigned int num_warps = Size / 32;
|
| 105 |
+
// In warp scan result, calculated in warp_lambda
|
| 106 |
+
TyRet warp_scan;
|
| 107 |
+
|
| 108 |
+
// In warp scan, put sum in the warp_scratch_location
|
| 109 |
+
auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
|
| 110 |
+
warp_scan =
|
| 111 |
+
details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 112 |
+
if (warp.thread_rank() + 1 == warp.size()) {
|
| 113 |
+
*warp_scratch_location = warp_scan;
|
| 114 |
+
}
|
| 115 |
+
if (TyScan == ScanType::exclusive) {
|
| 116 |
+
warp_scan = warp.shfl_up(warp_scan, 1);
|
| 117 |
+
}
|
| 118 |
+
};
|
| 119 |
+
|
| 120 |
+
// Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
|
| 121 |
+
// to its in-warp scan result
|
| 122 |
+
auto inter_warp_lambda =
|
| 123 |
+
[&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
|
| 124 |
+
auto thread_val = *thread_scratch_location;
|
| 125 |
+
auto result = coalesced_inclusive_scan(subwarp, thread_val, op);
|
| 126 |
+
*thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op);
|
| 127 |
+
};
|
| 128 |
+
|
| 129 |
+
TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
|
| 130 |
+
if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
|
| 131 |
+
return previous_warps_sum;
|
| 132 |
+
}
|
| 133 |
+
if (warpType::meta_group_rank() == 0) {
|
| 134 |
+
return warp_scan;
|
| 135 |
+
}
|
| 136 |
+
else {
|
| 137 |
+
return op(warp_scan, previous_warps_sum);
|
| 138 |
+
}
|
| 139 |
+
}
|
| 140 |
+
};
|
| 141 |
+
|
| 142 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 143 |
+
template <unsigned int GroupId, ScanType TyScan>
|
| 144 |
+
struct scan_update_dispatch;
|
| 145 |
+
|
| 146 |
+
template <ScanType TyScan>
|
| 147 |
+
struct scan_update_dispatch<details::coalesced_group_id, TyScan> {
|
| 148 |
+
template <typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn>
|
| 149 |
+
_CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 150 |
+
details::remove_qual<TyVal> old;
|
| 151 |
+
|
| 152 |
+
// Do regular in group scan
|
| 153 |
+
auto scan_result = details::coalesced_inclusive_scan(group, val, op);
|
| 154 |
+
|
| 155 |
+
// Last thread updates the atomic and distributes its old value to other threads
|
| 156 |
+
if (group.thread_rank() == group.size() - 1) {
|
| 157 |
+
old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 158 |
+
}
|
| 159 |
+
old = group.shfl(old, group.size() - 1);
|
| 160 |
+
if (TyScan == ScanType::exclusive) {
|
| 161 |
+
scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 162 |
+
}
|
| 163 |
+
scan_result = op(old, scan_result);
|
| 164 |
+
return scan_result;
|
| 165 |
+
}
|
| 166 |
+
};
|
| 167 |
+
|
| 168 |
+
template <ScanType TyScan>
|
| 169 |
+
struct scan_update_dispatch<details::multi_tile_group_id, TyScan> {
|
| 170 |
+
template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn>
|
| 171 |
+
_CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 172 |
+
using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
|
| 173 |
+
using TyRet = details::remove_qual<TyVal>;
|
| 174 |
+
const unsigned int num_warps = Size / 32;
|
| 175 |
+
// In warp scan result, calculated in warp_lambda
|
| 176 |
+
TyRet warp_scan;
|
| 177 |
+
|
| 178 |
+
// In warp scan, put sum in the warp_scratch_location
|
| 179 |
+
auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
|
| 180 |
+
warp_scan =
|
| 181 |
+
details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 182 |
+
if (warp.thread_rank() + 1 == warp.size()) {
|
| 183 |
+
*warp_scratch_location = warp_scan;
|
| 184 |
+
}
|
| 185 |
+
if (TyScan == ScanType::exclusive) {
|
| 186 |
+
warp_scan = warp.shfl_up(warp_scan, 1);
|
| 187 |
+
}
|
| 188 |
+
};
|
| 189 |
+
|
| 190 |
+
// Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
|
| 191 |
+
// to its in-warp scan result
|
| 192 |
+
auto inter_warp_lambda =
|
| 193 |
+
[&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
|
| 194 |
+
auto thread_val = *thread_scratch_location;
|
| 195 |
+
auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op);
|
| 196 |
+
TyRet offset;
|
| 197 |
+
// Single thread does the atomic update with sum of all contributions and reads the old value.
|
| 198 |
+
if (subwarp.thread_rank() == subwarp.size() - 1) {
|
| 199 |
+
offset = details::atomic_update(dst, scan_result, op);
|
| 200 |
+
}
|
| 201 |
+
offset = subwarp.shfl(offset, subwarp.size() - 1);
|
| 202 |
+
scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op);
|
| 203 |
+
// Add offset read from the atomic to the scanned warp sum.
|
| 204 |
+
// Skipping first thread, since it got defautly constructed value from the conversion,
|
| 205 |
+
// it should just return the offset received from the thread that did the atomic update.
|
| 206 |
+
if (subwarp.thread_rank() != 0) {
|
| 207 |
+
offset = op(scan_result, offset);
|
| 208 |
+
}
|
| 209 |
+
*thread_scratch_location = offset;
|
| 210 |
+
};
|
| 211 |
+
|
| 212 |
+
TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
|
| 213 |
+
if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
|
| 214 |
+
return previous_warps_sum;
|
| 215 |
+
}
|
| 216 |
+
return op(warp_scan, previous_warps_sum);
|
| 217 |
+
}
|
| 218 |
+
};
|
| 219 |
+
#endif
|
| 220 |
+
#endif
|
| 221 |
+
|
| 222 |
+
template <typename TyGroup, typename TyInputVal, typename TyRetVal>
|
| 223 |
+
_CG_QUALIFIER void check_scan_params() {
|
| 224 |
+
static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
|
| 225 |
+
static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 229 |
+
template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
|
| 230 |
+
_CG_QUALIFIER void check_scan_update_params() {
|
| 231 |
+
check_scan_params<TyGroup, TyInputVal, TyRetVal>();
|
| 232 |
+
static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
|
| 233 |
+
}
|
| 234 |
+
#endif
|
| 235 |
+
|
| 236 |
+
} // details
|
| 237 |
+
|
| 238 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 239 |
+
_CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 240 |
+
details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
|
| 241 |
+
|
| 242 |
+
using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
|
| 243 |
+
return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
template <typename TyGroup, typename TyVal>
|
| 247 |
+
_CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) {
|
| 248 |
+
return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 252 |
+
_CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 253 |
+
details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
|
| 254 |
+
|
| 255 |
+
using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
|
| 256 |
+
return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
template <typename TyGroup, typename TyVal>
|
| 260 |
+
_CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) {
|
| 261 |
+
return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 265 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
|
| 266 |
+
_CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 267 |
+
details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 268 |
+
|
| 269 |
+
using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
|
| 270 |
+
return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
|
| 274 |
+
_CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) {
|
| 275 |
+
return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
|
| 279 |
+
_CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 280 |
+
details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 281 |
+
|
| 282 |
+
using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
|
| 283 |
+
return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
|
| 287 |
+
_CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) {
|
| 288 |
+
return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
|
| 292 |
+
_CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 293 |
+
details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 294 |
+
|
| 295 |
+
using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
|
| 296 |
+
return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
|
| 300 |
+
_CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) {
|
| 301 |
+
return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
|
| 305 |
+
_CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 306 |
+
details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 307 |
+
|
| 308 |
+
using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
|
| 309 |
+
return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 310 |
+
}
|
| 311 |
+
|
| 312 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
|
| 313 |
+
_CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) {
|
| 314 |
+
return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
|
| 315 |
+
}
|
| 316 |
+
#endif
|
| 317 |
+
|
| 318 |
+
_CG_END_NAMESPACE
|
| 319 |
+
|
| 320 |
+
#endif // _CG_SCAN_H_
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/sync.h
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_GRID_H
|
| 50 |
+
#define _CG_GRID_H
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
|
| 54 |
+
_CG_BEGIN_NAMESPACE
|
| 55 |
+
|
| 56 |
+
namespace details
|
| 57 |
+
{
|
| 58 |
+
// Unsigned word that holds barrier state; the grid-wide and multi-warp sync routines in this namespace take a pointer to it.
typedef unsigned int barrier_t;
|
| 59 |
+
|
| 60 |
+
// Returns true when bit 31 differs between the two barrier snapshots,
// i.e. the top bit of the counter changed between old_arrive and current_arrive.
_CG_STATIC_QUALIFIER bool bar_has_flipped(unsigned int old_arrive, unsigned int current_arrive) {
    const unsigned int changed_bits = old_arrive ^ current_arrive;
    return (changed_bits & 0x80000000) != 0;
}
|
| 63 |
+
|
| 64 |
+
// True only for the thread whose threadIdx components sum to zero.
_CG_STATIC_QUALIFIER bool is_cta_master() {
    const unsigned int index_sum = threadIdx.x + threadIdx.y + threadIdx.z;
    return index_sum == 0;
}
|
| 67 |
+
|
| 68 |
+
// Arrival half of the grid barrier.
//
// Every thread first joins a block-level barrier. The block's master thread then
// atomically adds its contribution to *arrived:
//   - blocks other than block (0,0,0) add 1;
//   - block (0,0,0) adds 0x80000000 - (block_count - 1), so that once every block
//     has contributed the total added is exactly 0x80000000 and bit 31 toggles.
//
// Returns the counter value observed before this block's add for the master
// thread, and 0 for every other thread.
_CG_STATIC_QUALIFIER unsigned int sync_grids_arrive(volatile barrier_t *arrived) {
    unsigned int oldArrive = 0;

    __barrier_sync(0);

    if (is_cta_master()) {
        const unsigned int expected = gridDim.x * gridDim.y * gridDim.z;
        const bool gpu_master = (blockIdx.x + blockIdx.y + blockIdx.z == 0);
        const unsigned int nb = gpu_master ? (0x80000000 - (expected - 1)) : 1;

        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
            // Barrier update with release; polling with acquire
            asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(oldArrive) : _CG_ASM_PTR_CONSTRAINT((unsigned int*)arrived), "r"(nb) : "memory");
        ,
            // Fence; barrier update; volatile polling; fence
            __threadfence();
            oldArrive = atomicAdd((unsigned int*)arrived, nb);
        );
    }

    return oldArrive;
}
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
// Wait half of the grid barrier.
//
// The block's master thread spins on *arrived until bit 31 differs from the value
// captured in oldArrive (see bar_has_flipped). All threads of the block then meet
// at a block-level barrier, so nobody proceeds before the master saw the flip.
_CG_STATIC_QUALIFIER void sync_grids_wait(unsigned int oldArrive, volatile barrier_t *arrived) {
    if (is_cta_master()) {
        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
            // Poll with acquire loads until the top bit flips.
            unsigned int observed;
            for (;;) {
                asm volatile("ld.acquire.gpu.u32 %0,[%1];" : "=r"(observed) : _CG_ASM_PTR_CONSTRAINT((unsigned int *)arrived) : "memory");
                if (bar_has_flipped(oldArrive, observed)) {
                    break;
                }
            }
        ,
            // Poll through the volatile pointer, then fence.
            while (!bar_has_flipped(oldArrive, *arrived)) {
            }
            __threadfence();
        );
    }

    __barrier_sync(0);
}
|
| 111 |
+
|
| 112 |
+
/* - Multi warp groups synchronization routines - */
|
| 113 |
+
|
| 114 |
+
#ifdef _CG_CPP11_FEATURES
|
| 115 |
+
// Need both acquire and release for the last warp, since it won't be able to acquire with red.and
|
| 116 |
+
_CG_STATIC_QUALIFIER unsigned int atom_or_acq_rel_cta(unsigned int *addr, unsigned int val) {
|
| 117 |
+
unsigned int old;
|
| 118 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
|
| 119 |
+
(asm volatile("atom.or.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(old) : _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
|
| 120 |
+
,
|
| 121 |
+
(__threadfence_block();
|
| 122 |
+
old = atomicOr(addr, val);)
|
| 123 |
+
);
|
| 124 |
+
return old;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
// Special case where barrier is arrived, but not waited on
|
| 128 |
+
_CG_STATIC_QUALIFIER void red_or_release_cta(unsigned int *addr, unsigned int val) {
|
| 129 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
|
| 130 |
+
(asm volatile("red.or.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
|
| 131 |
+
,
|
| 132 |
+
(__threadfence_block();
|
| 133 |
+
atomicOr(addr, val);)
|
| 134 |
+
);
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
// Usually called by last arriving warp to released other warps, can be relaxed, since or was already acq_rel
|
| 138 |
+
_CG_STATIC_QUALIFIER void red_and_relaxed_cta(unsigned int *addr, unsigned int val) {
|
| 139 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
|
| 140 |
+
(asm volatile("red.and.relaxed.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
|
| 141 |
+
,
|
| 142 |
+
(atomicAnd(addr, val);)
|
| 143 |
+
);
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
// Special case of release, where last warp was doing extra work before releasing others, need to be release
|
| 147 |
+
// to ensure that extra work is visible
|
| 148 |
+
_CG_STATIC_QUALIFIER void red_and_release_cta(unsigned int *addr, unsigned int val) {
|
| 149 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
|
| 150 |
+
(asm volatile("red.and.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
|
| 151 |
+
,
|
| 152 |
+
(__threadfence_block();
|
| 153 |
+
atomicAnd(addr, val);)
|
| 154 |
+
);
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
// Read the barrier, acquire to ensure all memory operations following the sync are correctly performed after it is released
|
| 158 |
+
_CG_STATIC_QUALIFIER unsigned int ld_acquire_cta(unsigned int *addr) {
|
| 159 |
+
unsigned int val;
|
| 160 |
+
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
|
| 161 |
+
(asm volatile("ld.acquire.cta.u32 %0,[%1];" : "=r"(val) : _CG_ASM_PTR_CONSTRAINT(addr) : "memory");)
|
| 162 |
+
,
|
| 163 |
+
(val = *((volatile unsigned int*) addr);
|
| 164 |
+
__threadfence_block();)
|
| 165 |
+
);
|
| 166 |
+
return val;
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
// Get synchronization bit mask of my thread_block_tile of size num_warps. Thread ranks 0..31 have the first bit assigned to them,
|
| 170 |
+
// thread ranks 32..63 second etc
|
| 171 |
+
// Bit masks are unique for each group, groups of the same size will have the same number of bits set, but on different positions
|
| 172 |
+
_CG_STATIC_QUALIFIER unsigned int get_group_mask(unsigned int thread_rank, unsigned int num_warps) {
|
| 173 |
+
return num_warps == 32 ? ~0 : ((1 << num_warps) - 1) << (num_warps * (thread_rank / (num_warps * 32)));
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
// Spins, using acquire loads of *arrived, until none of the bits in warp_bit
// remain set in the barrier word.
_CG_STATIC_QUALIFIER void barrier_wait(barrier_t *arrived, unsigned int warp_bit) {
    while ((ld_acquire_cta(arrived) & warp_bit) != 0) {
    }
}
|
| 179 |
+
|
| 180 |
+
// Default blocking sync.
|
| 181 |
+
_CG_STATIC_QUALIFIER void sync_warps(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
|
| 182 |
+
unsigned int warp_id = thread_rank / 32;
|
| 183 |
+
bool warp_master = (thread_rank % 32 == 0);
|
| 184 |
+
unsigned int warp_bit = 1 << warp_id;
|
| 185 |
+
unsigned int group_mask = get_group_mask(thread_rank, num_warps);
|
| 186 |
+
|
| 187 |
+
__syncwarp(0xFFFFFFFF);
|
| 188 |
+
|
| 189 |
+
if (warp_master) {
|
| 190 |
+
unsigned int old = atom_or_acq_rel_cta(arrived, warp_bit);
|
| 191 |
+
if (((old | warp_bit) & group_mask) == group_mask) {
|
| 192 |
+
red_and_relaxed_cta(arrived, ~group_mask);
|
| 193 |
+
}
|
| 194 |
+
else {
|
| 195 |
+
barrier_wait(arrived, warp_bit);
|
| 196 |
+
}
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
__syncwarp(0xFFFFFFFF);
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
// Blocking sync, except the last arriving warp, that releases other warps, returns to do other stuff first.
|
| 203 |
+
// Warp returning true from this function needs to call sync_warps_release.
|
| 204 |
+
_CG_STATIC_QUALIFIER bool sync_warps_last_releases(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
|
| 205 |
+
unsigned int warp_id = thread_rank / 32;
|
| 206 |
+
bool warp_master = (thread_rank % 32 == 0);
|
| 207 |
+
unsigned int warp_bit = 1 << warp_id;
|
| 208 |
+
unsigned int group_mask = get_group_mask(thread_rank, num_warps);
|
| 209 |
+
|
| 210 |
+
__syncwarp(0xFFFFFFFF);
|
| 211 |
+
|
| 212 |
+
unsigned int old = 0;
|
| 213 |
+
if (warp_master) {
|
| 214 |
+
old = atom_or_acq_rel_cta(arrived, warp_bit);
|
| 215 |
+
}
|
| 216 |
+
old = __shfl_sync(0xFFFFFFFF, old, 0);
|
| 217 |
+
if (((old | warp_bit) & group_mask) == group_mask) {
|
| 218 |
+
return true;
|
| 219 |
+
}
|
| 220 |
+
barrier_wait(arrived, warp_bit);
|
| 221 |
+
|
| 222 |
+
return false;
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
// Release my group from the barrier.
|
| 226 |
+
_CG_STATIC_QUALIFIER void sync_warps_release(barrier_t *arrived, bool is_master, unsigned int thread_rank, unsigned int num_warps) {
|
| 227 |
+
unsigned int group_mask = get_group_mask(thread_rank, num_warps);
|
| 228 |
+
if (is_master) {
|
| 229 |
+
red_and_release_cta(arrived, ~group_mask);
|
| 230 |
+
}
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
// Arrive at my group barrier, but don't block or release the barrier, even if every one arrives.
|
| 234 |
+
// sync_warps_release needs to be called by some warp after this one to reset the barrier.
|
| 235 |
+
_CG_STATIC_QUALIFIER void sync_warps_arrive(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
|
| 236 |
+
unsigned int warp_id = thread_rank / 32;
|
| 237 |
+
bool warp_master = (thread_rank % 32 == 0);
|
| 238 |
+
unsigned int warp_bit = 1 << warp_id;
|
| 239 |
+
unsigned int group_mask = get_group_mask(thread_rank, num_warps);
|
| 240 |
+
|
| 241 |
+
__syncwarp(0xFFFFFFFF);
|
| 242 |
+
|
| 243 |
+
if (warp_master) {
|
| 244 |
+
red_or_release_cta(arrived, warp_bit);
|
| 245 |
+
}
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
// Wait for my warp to be released from the barrier. Warp must have arrived first.
|
| 249 |
+
_CG_STATIC_QUALIFIER void sync_warps_wait(barrier_t *arrived, unsigned int thread_rank) {
|
| 250 |
+
unsigned int warp_id = thread_rank / 32;
|
| 251 |
+
unsigned int warp_bit = 1 << warp_id;
|
| 252 |
+
|
| 253 |
+
barrier_wait(arrived, warp_bit);
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
// Wait for specific warp to arrive at the barrier
|
| 257 |
+
_CG_QUALIFIER void sync_warps_wait_for_specific_warp(barrier_t *arrived, unsigned int wait_warp_id) {
|
| 258 |
+
unsigned int wait_mask = 1 << wait_warp_id;
|
| 259 |
+
while((ld_acquire_cta(arrived) & wait_mask) != wait_mask);
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
// Initialize the bit corresponding to my warp in the barrier
|
| 263 |
+
_CG_QUALIFIER void sync_warps_reset(barrier_t *arrived, unsigned int thread_rank) {
|
| 264 |
+
unsigned int warp_id = thread_rank / 32;
|
| 265 |
+
unsigned int warp_bit = 1 << warp_id;
|
| 266 |
+
|
| 267 |
+
__syncwarp(0xFFFFFFFF);
|
| 268 |
+
|
| 269 |
+
if (thread_rank % 32 == 0) {
|
| 270 |
+
red_and_release_cta(arrived, ~warp_bit);
|
| 271 |
+
}
|
| 272 |
+
// No need to sync after the atomic, there will be a sync of the group that is being partitioned right after this.
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
#endif
|
| 276 |
+
|
| 277 |
+
} // details
|
| 278 |
+
|
| 279 |
+
_CG_END_NAMESPACE
|
| 280 |
+
|
| 281 |
+
#endif // _CG_GRID_H
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/memcpy_async.h
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_MEMCPY_ASYNC
|
| 50 |
+
#define _COOPERATIVE_GROUPS_MEMCPY_ASYNC
|
| 51 |
+
|
| 52 |
+
#include "../cooperative_groups.h"
|
| 53 |
+
#include "details/info.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
# include "details/async.h"
|
| 57 |
+
#else
|
| 58 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 59 |
+
-std=c++11 compiler option.
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
#endif // _COOPERATIVE_GROUPS_MEMCPY_ASYNC
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/reduce.h
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_REDUCE_H
|
| 50 |
+
#define _COOPERATIVE_GROUPS_REDUCE_H
|
| 51 |
+
|
| 52 |
+
#include "../cooperative_groups.h"
|
| 53 |
+
#include "details/info.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
# include "details/reduce.h"
|
| 57 |
+
#else
|
| 58 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 59 |
+
-std=c++11 compiler option.
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
#endif //_COOPERATIVE_GROUPS_REDUCE_H
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/scan.h
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_SCAN_H
|
| 50 |
+
#define _COOPERATIVE_GROUPS_SCAN_H
|
| 51 |
+
|
| 52 |
+
#include "../cooperative_groups.h"
|
| 53 |
+
#include "details/info.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
# include "details/scan.h"
|
| 57 |
+
#else
|
| 58 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 59 |
+
-std=c++11 compiler option.
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
#endif //_COOPERATIVE_GROUPS_SCAN_H
|