BryanW commited on
Commit
43f5fc5
·
verified ·
1 Parent(s): 55565d2

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal/__init__.pyi +12 -0
  2. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/annotated_doc-0.0.4.dist-info/licenses/LICENSE +21 -0
  3. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__pycache__/__init__.cpython-312.pyc +0 -0
  4. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__pycache__/__main__.cpython-312.pyc +0 -0
  5. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__pycache__/core.cpython-312.pyc +0 -0
  6. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/__pycache__/__init__.cpython-312.pyc +0 -0
  7. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/__init__.py +0 -0
  8. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/__pycache__/__init__.cpython-312.pyc +0 -0
  9. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/__init__.py +0 -0
  10. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-312.pyc +0 -0
  11. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublas.h +891 -0
  12. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublasLt.h +0 -0
  13. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublasXt.h +693 -0
  14. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublas_api.h +0 -0
  15. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublas_v2.h +478 -0
  16. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/nvblas.h +824 -0
  17. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/lib/__init__.py +0 -0
  18. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-312.pyc +0 -0
  19. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_cupti/__init__.py +0 -0
  20. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/__init__.py +0 -0
  21. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-312.pyc +0 -0
  22. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/include/__init__.py +0 -0
  23. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-312.pyc +0 -0
  24. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h +1141 -0
  25. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/lib/__init__.py +0 -0
  26. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-312.pyc +0 -0
  27. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/__init__.py +0 -0
  28. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-312.pyc +0 -0
  29. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/__init__.py +0 -0
  30. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-312.pyc +0 -0
  31. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/builtin_types.h +64 -0
  32. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h +597 -0
  33. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/common_functions.h +65 -0
  34. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h +1743 -0
  35. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/async.h +452 -0
  36. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h +95 -0
  37. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_scan.h +174 -0
  38. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/driver_abi.h +99 -0
  39. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h +212 -0
  40. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h +693 -0
  41. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h +345 -0
  42. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/invoke.h +189 -0
  43. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/memory.h +136 -0
  44. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h +160 -0
  45. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h +424 -0
  46. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h +320 -0
  47. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/sync.h +281 -0
  48. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/memcpy_async.h +62 -0
  49. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/reduce.h +63 -0
  50. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/scan.h +63 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal/__init__.pyi ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Generic, TypeVar
2
+
3
+ from frozenlist import FrozenList
4
+
5
+ __all__ = ("Signal",)
6
+
7
+ _T = TypeVar("_T")
8
+
9
+ class Signal(FrozenList[_T], Generic[_T]):
10
+ def __init__(self, owner: Any) -> None: ...
11
+ def __repr__(self) -> str: ...
12
+ async def send(self, *args: Any, **kwargs: Any) -> None: ...
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/annotated_doc-0.0.4.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2025 Sebastián Ramírez
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (340 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__pycache__/__main__.cpython-312.pyc ADDED
Binary file (655 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__pycache__/core.cpython-312.pyc ADDED
Binary file (3.23 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (212 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (219 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (227 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublas.h ADDED
@@ -0,0 +1,891 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * This is the public header file for the CUBLAS library, defining the API
52
+ *
53
+ * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
54
+ * on top of the CUDA runtime.
55
+ */
56
+
57
+ #if !defined(CUBLAS_H_)
58
+ #define CUBLAS_H_
59
+
60
+ #if defined(CUBLAS_V2_H_)
61
+ #error "It is an error to include both cublas.h and cublas_v2.h"
62
+ #endif
63
+
64
+ #include <cuda_runtime.h>
65
+
66
+ #ifndef CUBLASWINAPI
67
+ #ifdef _WIN32
68
+ #define CUBLASWINAPI __stdcall
69
+ #else
70
+ #define CUBLASWINAPI
71
+ #endif
72
+ #endif
73
+
74
+ #undef CUBLASAPI
75
+ #ifdef __CUDACC__
76
+ #define CUBLASAPI __host__
77
+ #else
78
+ #define CUBLASAPI
79
+ #endif
80
+
81
+ #include "cublas_api.h"
82
+
83
+ #if defined(__cplusplus)
84
+ extern "C" {
85
+ #endif
86
+
87
+ /* CUBLAS data types */
88
+ #define cublasStatus cublasStatus_t
89
+
90
+ cublasStatus CUBLASWINAPI cublasInit(void);
91
+ cublasStatus CUBLASWINAPI cublasShutdown(void);
92
+ cublasStatus CUBLASWINAPI cublasGetError(void);
93
+
94
+ cublasStatus CUBLASWINAPI cublasGetVersion(int* version);
95
+ cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void** devicePtr);
96
+
97
+ cublasStatus CUBLASWINAPI cublasFree(void* devicePtr);
98
+
99
+ cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream);
100
+
101
+ /* ---------------- CUBLAS BLAS1 functions ---------------- */
102
+ /* NRM2 */
103
+ float CUBLASWINAPI cublasSnrm2(int n, const float* x, int incx);
104
+ double CUBLASWINAPI cublasDnrm2(int n, const double* x, int incx);
105
+ float CUBLASWINAPI cublasScnrm2(int n, const cuComplex* x, int incx);
106
+ double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex* x, int incx);
107
+ /*------------------------------------------------------------------------*/
108
+ /* DOT */
109
+ float CUBLASWINAPI cublasSdot(int n, const float* x, int incx, const float* y, int incy);
110
+ double CUBLASWINAPI cublasDdot(int n, const double* x, int incx, const double* y, int incy);
111
+ cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
112
+ cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
113
+ cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
114
+ cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
115
+ /*------------------------------------------------------------------------*/
116
+ /* SCAL */
117
+ void CUBLASWINAPI cublasSscal(int n, float alpha, float* x, int incx);
118
+ void CUBLASWINAPI cublasDscal(int n, double alpha, double* x, int incx);
119
+ void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex* x, int incx);
120
+ void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex* x, int incx);
121
+
122
+ void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex* x, int incx);
123
+ void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex* x, int incx);
124
+ /*------------------------------------------------------------------------*/
125
+ /* AXPY */
126
+ void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float* x, int incx, float* y, int incy);
127
+ void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double* x, int incx, double* y, int incy);
128
+ void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex* x, int incx, cuComplex* y, int incy);
129
+ void CUBLASWINAPI
130
+ cublasZaxpy(int n, cuDoubleComplex alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
131
+ /*------------------------------------------------------------------------*/
132
+ /* COPY */
133
+ void CUBLASWINAPI cublasScopy(int n, const float* x, int incx, float* y, int incy);
134
+ void CUBLASWINAPI cublasDcopy(int n, const double* x, int incx, double* y, int incy);
135
+ void CUBLASWINAPI cublasCcopy(int n, const cuComplex* x, int incx, cuComplex* y, int incy);
136
+ void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
137
+ /*------------------------------------------------------------------------*/
138
+ /* SWAP */
139
+ void CUBLASWINAPI cublasSswap(int n, float* x, int incx, float* y, int incy);
140
+ void CUBLASWINAPI cublasDswap(int n, double* x, int incx, double* y, int incy);
141
+ void CUBLASWINAPI cublasCswap(int n, cuComplex* x, int incx, cuComplex* y, int incy);
142
+ void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
143
+ /*------------------------------------------------------------------------*/
144
+ /* AMAX */
145
+ int CUBLASWINAPI cublasIsamax(int n, const float* x, int incx);
146
+ int CUBLASWINAPI cublasIdamax(int n, const double* x, int incx);
147
+ int CUBLASWINAPI cublasIcamax(int n, const cuComplex* x, int incx);
148
+ int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex* x, int incx);
149
+ /*------------------------------------------------------------------------*/
150
+ /* AMIN */
151
+ int CUBLASWINAPI cublasIsamin(int n, const float* x, int incx);
152
+ int CUBLASWINAPI cublasIdamin(int n, const double* x, int incx);
153
+
154
+ int CUBLASWINAPI cublasIcamin(int n, const cuComplex* x, int incx);
155
+ int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex* x, int incx);
156
+ /*------------------------------------------------------------------------*/
157
+ /* ASUM */
158
+ float CUBLASWINAPI cublasSasum(int n, const float* x, int incx);
159
+ double CUBLASWINAPI cublasDasum(int n, const double* x, int incx);
160
+ float CUBLASWINAPI cublasScasum(int n, const cuComplex* x, int incx);
161
+ double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex* x, int incx);
162
+ /*------------------------------------------------------------------------*/
163
+ /* ROT */
164
+ void CUBLASWINAPI cublasSrot(int n, float* x, int incx, float* y, int incy, float sc, float ss);
165
+ void CUBLASWINAPI cublasDrot(int n, double* x, int incx, double* y, int incy, double sc, double ss);
166
+ void CUBLASWINAPI cublasCrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, cuComplex s);
167
+ void CUBLASWINAPI
168
+ cublasZrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double sc, cuDoubleComplex cs);
169
+ void CUBLASWINAPI cublasCsrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, float s);
170
+ void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double c, double s);
171
+ /*------------------------------------------------------------------------*/
172
+ /* ROTG */
173
+ void CUBLASWINAPI cublasSrotg(float* sa, float* sb, float* sc, float* ss);
174
+ void CUBLASWINAPI cublasDrotg(double* sa, double* sb, double* sc, double* ss);
175
+ void CUBLASWINAPI cublasCrotg(cuComplex* ca, cuComplex cb, float* sc, cuComplex* cs);
176
+ void CUBLASWINAPI cublasZrotg(cuDoubleComplex* ca, cuDoubleComplex cb, double* sc, cuDoubleComplex* cs);
177
+ /*------------------------------------------------------------------------*/
178
+ /* ROTM */
179
+ void CUBLASWINAPI cublasSrotm(int n, float* x, int incx, float* y, int incy, const float* sparam);
180
+ void CUBLASWINAPI cublasDrotm(int n, double* x, int incx, double* y, int incy, const double* sparam);
181
+ /*------------------------------------------------------------------------*/
182
+ /* ROTMG */
183
+ void CUBLASWINAPI cublasSrotmg(float* sd1, float* sd2, float* sx1, const float* sy1, float* sparam);
184
+ void CUBLASWINAPI cublasDrotmg(double* sd1, double* sd2, double* sx1, const double* sy1, double* sparam);
185
+
186
+ /* --------------- CUBLAS BLAS2 functions ---------------- */
187
+ /* GEMV */
188
+ void CUBLASWINAPI cublasSgemv(char trans,
189
+ int m,
190
+ int n,
191
+ float alpha,
192
+ const float* A,
193
+ int lda,
194
+ const float* x,
195
+ int incx,
196
+ float beta,
197
+ float* y,
198
+ int incy);
199
+ void CUBLASWINAPI cublasDgemv(char trans,
200
+ int m,
201
+ int n,
202
+ double alpha,
203
+ const double* A,
204
+ int lda,
205
+ const double* x,
206
+ int incx,
207
+ double beta,
208
+ double* y,
209
+ int incy);
210
+ void CUBLASWINAPI cublasCgemv(char trans,
211
+ int m,
212
+ int n,
213
+ cuComplex alpha,
214
+ const cuComplex* A,
215
+ int lda,
216
+ const cuComplex* x,
217
+ int incx,
218
+ cuComplex beta,
219
+ cuComplex* y,
220
+ int incy);
221
+ void CUBLASWINAPI cublasZgemv(char trans,
222
+ int m,
223
+ int n,
224
+ cuDoubleComplex alpha,
225
+ const cuDoubleComplex* A,
226
+ int lda,
227
+ const cuDoubleComplex* x,
228
+ int incx,
229
+ cuDoubleComplex beta,
230
+ cuDoubleComplex* y,
231
+ int incy);
232
+ /*------------------------------------------------------------------------*/
233
+ /* GBMV */
234
+ void CUBLASWINAPI cublasSgbmv(char trans,
235
+ int m,
236
+ int n,
237
+ int kl,
238
+ int ku,
239
+ float alpha,
240
+ const float* A,
241
+ int lda,
242
+ const float* x,
243
+ int incx,
244
+ float beta,
245
+ float* y,
246
+ int incy);
247
+ void CUBLASWINAPI cublasDgbmv(char trans,
248
+ int m,
249
+ int n,
250
+ int kl,
251
+ int ku,
252
+ double alpha,
253
+ const double* A,
254
+ int lda,
255
+ const double* x,
256
+ int incx,
257
+ double beta,
258
+ double* y,
259
+ int incy);
260
+ void CUBLASWINAPI cublasCgbmv(char trans,
261
+ int m,
262
+ int n,
263
+ int kl,
264
+ int ku,
265
+ cuComplex alpha,
266
+ const cuComplex* A,
267
+ int lda,
268
+ const cuComplex* x,
269
+ int incx,
270
+ cuComplex beta,
271
+ cuComplex* y,
272
+ int incy);
273
+ void CUBLASWINAPI cublasZgbmv(char trans,
274
+ int m,
275
+ int n,
276
+ int kl,
277
+ int ku,
278
+ cuDoubleComplex alpha,
279
+ const cuDoubleComplex* A,
280
+ int lda,
281
+ const cuDoubleComplex* x,
282
+ int incx,
283
+ cuDoubleComplex beta,
284
+ cuDoubleComplex* y,
285
+ int incy);
286
+ /*------------------------------------------------------------------------*/
287
+ /* TRMV */
288
+ void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
289
+ void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
290
+ void CUBLASWINAPI
291
+ cublasCtrmv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
292
+ void CUBLASWINAPI
293
+ cublasZtrmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
294
+ /*------------------------------------------------------------------------*/
295
+ /* TBMV */
296
+ void CUBLASWINAPI
297
+ cublasStbmv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
298
+ void CUBLASWINAPI
299
+ cublasDtbmv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
300
+ void CUBLASWINAPI
301
+ cublasCtbmv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
302
+ void CUBLASWINAPI cublasZtbmv(
303
+ char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
304
+ /*------------------------------------------------------------------------*/
305
+ /* TPMV */
306
+ void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
307
+
308
+ void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
309
+
310
+ void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
311
+
312
+ void CUBLASWINAPI
313
+ cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
314
+ /*------------------------------------------------------------------------*/
315
+ /* TRSV */
316
+ void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
317
+
318
+ void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
319
+
320
+ void CUBLASWINAPI
321
+ cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
322
+
323
+ void CUBLASWINAPI
324
+ cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
325
+ /*------------------------------------------------------------------------*/
326
+ /* TPSV */
327
+ void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
328
+
329
+ void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
330
+
331
+ void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
332
+
333
+ void CUBLASWINAPI
334
+ cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
335
+ /*------------------------------------------------------------------------*/
336
+ /* TBSV */
337
+ void CUBLASWINAPI
338
+ cublasStbsv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
339
+
340
+ void CUBLASWINAPI
341
+ cublasDtbsv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
342
+ void CUBLASWINAPI
343
+ cublasCtbsv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
344
+
345
+ void CUBLASWINAPI cublasZtbsv(
346
+ char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
347
+ /*------------------------------------------------------------------------*/
348
+ /* SYMV/HEMV */
349
+ void CUBLASWINAPI cublasSsymv(
350
+ char uplo, int n, float alpha, const float* A, int lda, const float* x, int incx, float beta, float* y, int incy);
351
+ void CUBLASWINAPI cublasDsymv(char uplo,
352
+ int n,
353
+ double alpha,
354
+ const double* A,
355
+ int lda,
356
+ const double* x,
357
+ int incx,
358
+ double beta,
359
+ double* y,
360
+ int incy);
361
+ void CUBLASWINAPI cublasChemv(char uplo,
362
+ int n,
363
+ cuComplex alpha,
364
+ const cuComplex* A,
365
+ int lda,
366
+ const cuComplex* x,
367
+ int incx,
368
+ cuComplex beta,
369
+ cuComplex* y,
370
+ int incy);
371
+ void CUBLASWINAPI cublasZhemv(char uplo,
372
+ int n,
373
+ cuDoubleComplex alpha,
374
+ const cuDoubleComplex* A,
375
+ int lda,
376
+ const cuDoubleComplex* x,
377
+ int incx,
378
+ cuDoubleComplex beta,
379
+ cuDoubleComplex* y,
380
+ int incy);
381
+ /*------------------------------------------------------------------------*/
382
+ /* SBMV/HBMV */
383
+ void CUBLASWINAPI cublasSsbmv(char uplo,
384
+ int n,
385
+ int k,
386
+ float alpha,
387
+ const float* A,
388
+ int lda,
389
+ const float* x,
390
+ int incx,
391
+ float beta,
392
+ float* y,
393
+ int incy);
394
+ void CUBLASWINAPI cublasDsbmv(char uplo,
395
+ int n,
396
+ int k,
397
+ double alpha,
398
+ const double* A,
399
+ int lda,
400
+ const double* x,
401
+ int incx,
402
+ double beta,
403
+ double* y,
404
+ int incy);
405
+ void CUBLASWINAPI cublasChbmv(char uplo,
406
+ int n,
407
+ int k,
408
+ cuComplex alpha,
409
+ const cuComplex* A,
410
+ int lda,
411
+ const cuComplex* x,
412
+ int incx,
413
+ cuComplex beta,
414
+ cuComplex* y,
415
+ int incy);
416
+ void CUBLASWINAPI cublasZhbmv(char uplo,
417
+ int n,
418
+ int k,
419
+ cuDoubleComplex alpha,
420
+ const cuDoubleComplex* A,
421
+ int lda,
422
+ const cuDoubleComplex* x,
423
+ int incx,
424
+ cuDoubleComplex beta,
425
+ cuDoubleComplex* y,
426
+ int incy);
427
+ /*------------------------------------------------------------------------*/
428
+ /* SPMV/HPMV */
429
+ void CUBLASWINAPI
430
+ cublasSspmv(char uplo, int n, float alpha, const float* AP, const float* x, int incx, float beta, float* y, int incy);
431
+ void CUBLASWINAPI cublasDspmv(
432
+ char uplo, int n, double alpha, const double* AP, const double* x, int incx, double beta, double* y, int incy);
433
+ void CUBLASWINAPI cublasChpmv(char uplo,
434
+ int n,
435
+ cuComplex alpha,
436
+ const cuComplex* AP,
437
+ const cuComplex* x,
438
+ int incx,
439
+ cuComplex beta,
440
+ cuComplex* y,
441
+ int incy);
442
+ void CUBLASWINAPI cublasZhpmv(char uplo,
443
+ int n,
444
+ cuDoubleComplex alpha,
445
+ const cuDoubleComplex* AP,
446
+ const cuDoubleComplex* x,
447
+ int incx,
448
+ cuDoubleComplex beta,
449
+ cuDoubleComplex* y,
450
+ int incy);
451
+
452
+ /*------------------------------------------------------------------------*/
453
+ /* GER */
454
+ void CUBLASWINAPI
455
+ cublasSger(int m, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
456
+ void CUBLASWINAPI
457
+ cublasDger(int m, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
458
+
459
+ void CUBLASWINAPI cublasCgeru(
460
+ int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
461
+ void CUBLASWINAPI cublasCgerc(
462
+ int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
463
+ void CUBLASWINAPI cublasZgeru(int m,
464
+ int n,
465
+ cuDoubleComplex alpha,
466
+ const cuDoubleComplex* x,
467
+ int incx,
468
+ const cuDoubleComplex* y,
469
+ int incy,
470
+ cuDoubleComplex* A,
471
+ int lda);
472
+ void CUBLASWINAPI cublasZgerc(int m,
473
+ int n,
474
+ cuDoubleComplex alpha,
475
+ const cuDoubleComplex* x,
476
+ int incx,
477
+ const cuDoubleComplex* y,
478
+ int incy,
479
+ cuDoubleComplex* A,
480
+ int lda);
481
+ /*------------------------------------------------------------------------*/
482
+ /* SYR/HER */
483
+ void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float* x, int incx, float* A, int lda);
484
+ void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double* x, int incx, double* A, int lda);
485
+
486
+ void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* A, int lda);
487
+ void CUBLASWINAPI
488
+ cublasZher(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda);
489
+
490
+ /*------------------------------------------------------------------------*/
491
+ /* SPR/HPR */
492
+ void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float* x, int incx, float* AP);
493
+ void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double* x, int incx, double* AP);
494
+ void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* AP);
495
+ void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP);
496
+ /*------------------------------------------------------------------------*/
497
+ /* SYR2/HER2 */
498
+ void CUBLASWINAPI
499
+ cublasSsyr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
500
+ void CUBLASWINAPI
501
+ cublasDsyr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
502
+ void CUBLASWINAPI cublasCher2(char uplo,
503
+ int n,
504
+ cuComplex alpha,
505
+ const cuComplex* x,
506
+ int incx,
507
+ const cuComplex* y,
508
+ int incy,
509
+ cuComplex* A,
510
+ int lda);
511
+ void CUBLASWINAPI cublasZher2(char uplo,
512
+ int n,
513
+ cuDoubleComplex alpha,
514
+ const cuDoubleComplex* x,
515
+ int incx,
516
+ const cuDoubleComplex* y,
517
+ int incy,
518
+ cuDoubleComplex* A,
519
+ int lda);
520
+
521
+ /*------------------------------------------------------------------------*/
522
+ /* SPR2/HPR2 */
523
+ void CUBLASWINAPI
524
+ cublasSspr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* AP);
525
+ void CUBLASWINAPI
526
+ cublasDspr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* AP);
527
+ void CUBLASWINAPI cublasChpr2(
528
+ char uplo, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP);
529
+ void CUBLASWINAPI cublasZhpr2(char uplo,
530
+ int n,
531
+ cuDoubleComplex alpha,
532
+ const cuDoubleComplex* x,
533
+ int incx,
534
+ const cuDoubleComplex* y,
535
+ int incy,
536
+ cuDoubleComplex* AP);
537
+ /* ------------------------BLAS3 Functions ------------------------------- */
538
+ /* GEMM */
539
+ void CUBLASWINAPI cublasSgemm(char transa,
540
+ char transb,
541
+ int m,
542
+ int n,
543
+ int k,
544
+ float alpha,
545
+ const float* A,
546
+ int lda,
547
+ const float* B,
548
+ int ldb,
549
+ float beta,
550
+ float* C,
551
+ int ldc);
552
+ void CUBLASWINAPI cublasDgemm(char transa,
553
+ char transb,
554
+ int m,
555
+ int n,
556
+ int k,
557
+ double alpha,
558
+ const double* A,
559
+ int lda,
560
+ const double* B,
561
+ int ldb,
562
+ double beta,
563
+ double* C,
564
+ int ldc);
565
+ void CUBLASWINAPI cublasCgemm(char transa,
566
+ char transb,
567
+ int m,
568
+ int n,
569
+ int k,
570
+ cuComplex alpha,
571
+ const cuComplex* A,
572
+ int lda,
573
+ const cuComplex* B,
574
+ int ldb,
575
+ cuComplex beta,
576
+ cuComplex* C,
577
+ int ldc);
578
+ void CUBLASWINAPI cublasZgemm(char transa,
579
+ char transb,
580
+ int m,
581
+ int n,
582
+ int k,
583
+ cuDoubleComplex alpha,
584
+ const cuDoubleComplex* A,
585
+ int lda,
586
+ const cuDoubleComplex* B,
587
+ int ldb,
588
+ cuDoubleComplex beta,
589
+ cuDoubleComplex* C,
590
+ int ldc);
591
+ /* -------------------------------------------------------*/
592
+ /* SYRK */
593
+ void CUBLASWINAPI
594
+ cublasSsyrk(char uplo, char trans, int n, int k, float alpha, const float* A, int lda, float beta, float* C, int ldc);
595
+ void CUBLASWINAPI cublasDsyrk(
596
+ char uplo, char trans, int n, int k, double alpha, const double* A, int lda, double beta, double* C, int ldc);
597
+
598
+ void CUBLASWINAPI cublasCsyrk(char uplo,
599
+ char trans,
600
+ int n,
601
+ int k,
602
+ cuComplex alpha,
603
+ const cuComplex* A,
604
+ int lda,
605
+ cuComplex beta,
606
+ cuComplex* C,
607
+ int ldc);
608
+ void CUBLASWINAPI cublasZsyrk(char uplo,
609
+ char trans,
610
+ int n,
611
+ int k,
612
+ cuDoubleComplex alpha,
613
+ const cuDoubleComplex* A,
614
+ int lda,
615
+ cuDoubleComplex beta,
616
+ cuDoubleComplex* C,
617
+ int ldc);
618
+ /* ------------------------------------------------------- */
619
+ /* HERK */
620
+ void CUBLASWINAPI cublasCherk(
621
+ char uplo, char trans, int n, int k, float alpha, const cuComplex* A, int lda, float beta, cuComplex* C, int ldc);
622
+ void CUBLASWINAPI cublasZherk(char uplo,
623
+ char trans,
624
+ int n,
625
+ int k,
626
+ double alpha,
627
+ const cuDoubleComplex* A,
628
+ int lda,
629
+ double beta,
630
+ cuDoubleComplex* C,
631
+ int ldc);
632
+ /* ------------------------------------------------------- */
633
+ /* SYR2K */
634
+ void CUBLASWINAPI cublasSsyr2k(char uplo,
635
+ char trans,
636
+ int n,
637
+ int k,
638
+ float alpha,
639
+ const float* A,
640
+ int lda,
641
+ const float* B,
642
+ int ldb,
643
+ float beta,
644
+ float* C,
645
+ int ldc);
646
+
647
+ void CUBLASWINAPI cublasDsyr2k(char uplo,
648
+ char trans,
649
+ int n,
650
+ int k,
651
+ double alpha,
652
+ const double* A,
653
+ int lda,
654
+ const double* B,
655
+ int ldb,
656
+ double beta,
657
+ double* C,
658
+ int ldc);
659
+ void CUBLASWINAPI cublasCsyr2k(char uplo,
660
+ char trans,
661
+ int n,
662
+ int k,
663
+ cuComplex alpha,
664
+ const cuComplex* A,
665
+ int lda,
666
+ const cuComplex* B,
667
+ int ldb,
668
+ cuComplex beta,
669
+ cuComplex* C,
670
+ int ldc);
671
+
672
+ void CUBLASWINAPI cublasZsyr2k(char uplo,
673
+ char trans,
674
+ int n,
675
+ int k,
676
+ cuDoubleComplex alpha,
677
+ const cuDoubleComplex* A,
678
+ int lda,
679
+ const cuDoubleComplex* B,
680
+ int ldb,
681
+ cuDoubleComplex beta,
682
+ cuDoubleComplex* C,
683
+ int ldc);
684
+ /* ------------------------------------------------------- */
685
+ /* HER2K */
686
+ void CUBLASWINAPI cublasCher2k(char uplo,
687
+ char trans,
688
+ int n,
689
+ int k,
690
+ cuComplex alpha,
691
+ const cuComplex* A,
692
+ int lda,
693
+ const cuComplex* B,
694
+ int ldb,
695
+ float beta,
696
+ cuComplex* C,
697
+ int ldc);
698
+
699
+ void CUBLASWINAPI cublasZher2k(char uplo,
700
+ char trans,
701
+ int n,
702
+ int k,
703
+ cuDoubleComplex alpha,
704
+ const cuDoubleComplex* A,
705
+ int lda,
706
+ const cuDoubleComplex* B,
707
+ int ldb,
708
+ double beta,
709
+ cuDoubleComplex* C,
710
+ int ldc);
711
+
712
+ /*------------------------------------------------------------------------*/
713
+ /* SYMM*/
714
+ void CUBLASWINAPI cublasSsymm(char side,
715
+ char uplo,
716
+ int m,
717
+ int n,
718
+ float alpha,
719
+ const float* A,
720
+ int lda,
721
+ const float* B,
722
+ int ldb,
723
+ float beta,
724
+ float* C,
725
+ int ldc);
726
+ void CUBLASWINAPI cublasDsymm(char side,
727
+ char uplo,
728
+ int m,
729
+ int n,
730
+ double alpha,
731
+ const double* A,
732
+ int lda,
733
+ const double* B,
734
+ int ldb,
735
+ double beta,
736
+ double* C,
737
+ int ldc);
738
+
739
+ void CUBLASWINAPI cublasCsymm(char side,
740
+ char uplo,
741
+ int m,
742
+ int n,
743
+ cuComplex alpha,
744
+ const cuComplex* A,
745
+ int lda,
746
+ const cuComplex* B,
747
+ int ldb,
748
+ cuComplex beta,
749
+ cuComplex* C,
750
+ int ldc);
751
+
752
+ void CUBLASWINAPI cublasZsymm(char side,
753
+ char uplo,
754
+ int m,
755
+ int n,
756
+ cuDoubleComplex alpha,
757
+ const cuDoubleComplex* A,
758
+ int lda,
759
+ const cuDoubleComplex* B,
760
+ int ldb,
761
+ cuDoubleComplex beta,
762
+ cuDoubleComplex* C,
763
+ int ldc);
764
+ /*------------------------------------------------------------------------*/
765
+ /* HEMM*/
766
+ void CUBLASWINAPI cublasChemm(char side,
767
+ char uplo,
768
+ int m,
769
+ int n,
770
+ cuComplex alpha,
771
+ const cuComplex* A,
772
+ int lda,
773
+ const cuComplex* B,
774
+ int ldb,
775
+ cuComplex beta,
776
+ cuComplex* C,
777
+ int ldc);
778
+ void CUBLASWINAPI cublasZhemm(char side,
779
+ char uplo,
780
+ int m,
781
+ int n,
782
+ cuDoubleComplex alpha,
783
+ const cuDoubleComplex* A,
784
+ int lda,
785
+ const cuDoubleComplex* B,
786
+ int ldb,
787
+ cuDoubleComplex beta,
788
+ cuDoubleComplex* C,
789
+ int ldc);
790
+
791
+ /*------------------------------------------------------------------------*/
792
+ /* TRSM*/
793
+ void CUBLASWINAPI cublasStrsm(char side,
794
+ char uplo,
795
+ char transa,
796
+ char diag,
797
+ int m,
798
+ int n,
799
+ float alpha,
800
+ const float* A,
801
+ int lda,
802
+ float* B,
803
+ int ldb);
804
+
805
+ void CUBLASWINAPI cublasDtrsm(char side,
806
+ char uplo,
807
+ char transa,
808
+ char diag,
809
+ int m,
810
+ int n,
811
+ double alpha,
812
+ const double* A,
813
+ int lda,
814
+ double* B,
815
+ int ldb);
816
+
817
+ void CUBLASWINAPI cublasCtrsm(char side,
818
+ char uplo,
819
+ char transa,
820
+ char diag,
821
+ int m,
822
+ int n,
823
+ cuComplex alpha,
824
+ const cuComplex* A,
825
+ int lda,
826
+ cuComplex* B,
827
+ int ldb);
828
+
829
+ void CUBLASWINAPI cublasZtrsm(char side,
830
+ char uplo,
831
+ char transa,
832
+ char diag,
833
+ int m,
834
+ int n,
835
+ cuDoubleComplex alpha,
836
+ const cuDoubleComplex* A,
837
+ int lda,
838
+ cuDoubleComplex* B,
839
+ int ldb);
840
+ /*------------------------------------------------------------------------*/
841
+ /* TRMM*/
842
+ void CUBLASWINAPI cublasStrmm(char side,
843
+ char uplo,
844
+ char transa,
845
+ char diag,
846
+ int m,
847
+ int n,
848
+ float alpha,
849
+ const float* A,
850
+ int lda,
851
+ float* B,
852
+ int ldb);
853
+ void CUBLASWINAPI cublasDtrmm(char side,
854
+ char uplo,
855
+ char transa,
856
+ char diag,
857
+ int m,
858
+ int n,
859
+ double alpha,
860
+ const double* A,
861
+ int lda,
862
+ double* B,
863
+ int ldb);
864
+ void CUBLASWINAPI cublasCtrmm(char side,
865
+ char uplo,
866
+ char transa,
867
+ char diag,
868
+ int m,
869
+ int n,
870
+ cuComplex alpha,
871
+ const cuComplex* A,
872
+ int lda,
873
+ cuComplex* B,
874
+ int ldb);
875
+ void CUBLASWINAPI cublasZtrmm(char side,
876
+ char uplo,
877
+ char transa,
878
+ char diag,
879
+ int m,
880
+ int n,
881
+ cuDoubleComplex alpha,
882
+ const cuDoubleComplex* A,
883
+ int lda,
884
+ cuDoubleComplex* B,
885
+ int ldb);
886
+
887
+ #if defined(__cplusplus)
888
+ }
889
+ #endif /* __cplusplus */
890
+
891
+ #endif /* !defined(CUBLAS_H_) */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublasLt.h ADDED
The diff for this file is too large to render. See raw diff
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublasXt.h ADDED
@@ -0,0 +1,693 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cublasXt : Host API, Out of Core and Multi-GPU BLAS Library
51
+
52
+ */
53
+
54
+ #if !defined(CUBLAS_XT_H_)
55
+ #define CUBLAS_XT_H_
56
+
57
+ #include "driver_types.h"
58
+ #include "cuComplex.h" /* import complex data type */
59
+
60
+ #include "cublas_v2.h"
61
+
62
+ #if defined(__cplusplus)
63
+ extern "C" {
64
+ #endif /* __cplusplus */
65
+
66
+ struct cublasXtContext;
67
+ typedef struct cublasXtContext* cublasXtHandle_t;
68
+
69
+ cublasStatus_t CUBLASWINAPI cublasXtCreate(cublasXtHandle_t* handle);
70
+ cublasStatus_t CUBLASWINAPI cublasXtDestroy(cublasXtHandle_t handle);
71
+ cublasStatus_t CUBLASWINAPI cublasXtGetNumBoards(int nbDevices, int deviceId[], int* nbBoards);
72
+ cublasStatus_t CUBLASWINAPI cublasXtMaxBoards(int* nbGpuBoards);
73
+ /* This routine selects the GPUs that the user wants to use for CUBLAS-XT */
74
+ cublasStatus_t CUBLASWINAPI cublasXtDeviceSelect(cublasXtHandle_t handle, int nbDevices, int deviceId[]);
75
+
76
+ /* This routine allows changing the dimension of the tiles ( blockDim x blockDim ) */
77
+ cublasStatus_t CUBLASWINAPI cublasXtSetBlockDim(cublasXtHandle_t handle, int blockDim);
78
+ cublasStatus_t CUBLASWINAPI cublasXtGetBlockDim(cublasXtHandle_t handle, int* blockDim);
79
+
80
+ typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t;
81
+ /* This routine allows CUBLAS-XT to pin the Host memory if it finds out that some of the matrices passed
82
+ are not pinned : Pinning/Unpinning the Host memory is still a costly operation
83
+ It is better if the user controls the memory on their own (by pinning/unpinning only when necessary)
84
+ */
85
+ cublasStatus_t CUBLASWINAPI cublasXtGetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t* mode);
86
+ cublasStatus_t CUBLASWINAPI cublasXtSetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t mode);
87
+
88
+ /* This routine is used to provide CPU BLAS routines, used for sizes that are too small or for hybrid computation */
89
+ typedef enum {
90
+ CUBLASXT_FLOAT = 0,
91
+ CUBLASXT_DOUBLE = 1,
92
+ CUBLASXT_COMPLEX = 2,
93
+ CUBLASXT_DOUBLECOMPLEX = 3,
94
+ } cublasXtOpType_t;
95
+
96
+ typedef enum {
97
+ CUBLASXT_GEMM = 0,
98
+ CUBLASXT_SYRK = 1,
99
+ CUBLASXT_HERK = 2,
100
+ CUBLASXT_SYMM = 3,
101
+ CUBLASXT_HEMM = 4,
102
+ CUBLASXT_TRSM = 5,
103
+ CUBLASXT_SYR2K = 6,
104
+ CUBLASXT_HER2K = 7,
105
+
106
+ CUBLASXT_SPMM = 8,
107
+ CUBLASXT_SYRKX = 9,
108
+ CUBLASXT_HERKX = 10,
109
+ CUBLASXT_TRMM = 11,
110
+ CUBLASXT_ROUTINE_MAX = 12,
111
+ } cublasXtBlasOp_t;
112
+
113
+ /* Currently only 32-bit integer BLAS routines are supported */
114
+ cublasStatus_t CUBLASWINAPI cublasXtSetCpuRoutine(cublasXtHandle_t handle,
115
+ cublasXtBlasOp_t blasOp,
116
+ cublasXtOpType_t type,
117
+ void* blasFunctor);
118
+
119
+ /* Specifies the percentage of work that should be done by the CPU; the default is 0 (no work) */
120
+ cublasStatus_t CUBLASWINAPI cublasXtSetCpuRatio(cublasXtHandle_t handle,
121
+ cublasXtBlasOp_t blasOp,
122
+ cublasXtOpType_t type,
123
+ float ratio);
124
+
125
+ /* GEMM */
126
+ cublasStatus_t CUBLASWINAPI cublasXtSgemm(cublasXtHandle_t handle,
127
+ cublasOperation_t transa,
128
+ cublasOperation_t transb,
129
+ size_t m,
130
+ size_t n,
131
+ size_t k,
132
+ const float* alpha,
133
+ const float* A,
134
+ size_t lda,
135
+ const float* B,
136
+ size_t ldb,
137
+ const float* beta,
138
+ float* C,
139
+ size_t ldc);
140
+
141
+ cublasStatus_t CUBLASWINAPI cublasXtDgemm(cublasXtHandle_t handle,
142
+ cublasOperation_t transa,
143
+ cublasOperation_t transb,
144
+ size_t m,
145
+ size_t n,
146
+ size_t k,
147
+ const double* alpha,
148
+ const double* A,
149
+ size_t lda,
150
+ const double* B,
151
+ size_t ldb,
152
+ const double* beta,
153
+ double* C,
154
+ size_t ldc);
155
+
156
+ cublasStatus_t CUBLASWINAPI cublasXtCgemm(cublasXtHandle_t handle,
157
+ cublasOperation_t transa,
158
+ cublasOperation_t transb,
159
+ size_t m,
160
+ size_t n,
161
+ size_t k,
162
+ const cuComplex* alpha,
163
+ const cuComplex* A,
164
+ size_t lda,
165
+ const cuComplex* B,
166
+ size_t ldb,
167
+ const cuComplex* beta,
168
+ cuComplex* C,
169
+ size_t ldc);
170
+
171
+ cublasStatus_t CUBLASWINAPI cublasXtZgemm(cublasXtHandle_t handle,
172
+ cublasOperation_t transa,
173
+ cublasOperation_t transb,
174
+ size_t m,
175
+ size_t n,
176
+ size_t k,
177
+ const cuDoubleComplex* alpha,
178
+ const cuDoubleComplex* A,
179
+ size_t lda,
180
+ const cuDoubleComplex* B,
181
+ size_t ldb,
182
+ const cuDoubleComplex* beta,
183
+ cuDoubleComplex* C,
184
+ size_t ldc);
185
+ /* ------------------------------------------------------- */
186
+ /* SYRK */
187
+ cublasStatus_t CUBLASWINAPI cublasXtSsyrk(cublasXtHandle_t handle,
188
+ cublasFillMode_t uplo,
189
+ cublasOperation_t trans,
190
+ size_t n,
191
+ size_t k,
192
+ const float* alpha,
193
+ const float* A,
194
+ size_t lda,
195
+ const float* beta,
196
+ float* C,
197
+ size_t ldc);
198
+
199
+ cublasStatus_t CUBLASWINAPI cublasXtDsyrk(cublasXtHandle_t handle,
200
+ cublasFillMode_t uplo,
201
+ cublasOperation_t trans,
202
+ size_t n,
203
+ size_t k,
204
+ const double* alpha,
205
+ const double* A,
206
+ size_t lda,
207
+ const double* beta,
208
+ double* C,
209
+ size_t ldc);
210
+
211
+ cublasStatus_t CUBLASWINAPI cublasXtCsyrk(cublasXtHandle_t handle,
212
+ cublasFillMode_t uplo,
213
+ cublasOperation_t trans,
214
+ size_t n,
215
+ size_t k,
216
+ const cuComplex* alpha,
217
+ const cuComplex* A,
218
+ size_t lda,
219
+ const cuComplex* beta,
220
+ cuComplex* C,
221
+ size_t ldc);
222
+
223
+ cublasStatus_t CUBLASWINAPI cublasXtZsyrk(cublasXtHandle_t handle,
224
+ cublasFillMode_t uplo,
225
+ cublasOperation_t trans,
226
+ size_t n,
227
+ size_t k,
228
+ const cuDoubleComplex* alpha,
229
+ const cuDoubleComplex* A,
230
+ size_t lda,
231
+ const cuDoubleComplex* beta,
232
+ cuDoubleComplex* C,
233
+ size_t ldc);
234
+ /* -------------------------------------------------------------------- */
235
+ /* HERK */
236
+ cublasStatus_t CUBLASWINAPI cublasXtCherk(cublasXtHandle_t handle,
237
+ cublasFillMode_t uplo,
238
+ cublasOperation_t trans,
239
+ size_t n,
240
+ size_t k,
241
+ const float* alpha,
242
+ const cuComplex* A,
243
+ size_t lda,
244
+ const float* beta,
245
+ cuComplex* C,
246
+ size_t ldc);
247
+
248
+ cublasStatus_t CUBLASWINAPI cublasXtZherk(cublasXtHandle_t handle,
249
+ cublasFillMode_t uplo,
250
+ cublasOperation_t trans,
251
+ size_t n,
252
+ size_t k,
253
+ const double* alpha,
254
+ const cuDoubleComplex* A,
255
+ size_t lda,
256
+ const double* beta,
257
+ cuDoubleComplex* C,
258
+ size_t ldc);
259
+ /* -------------------------------------------------------------------- */
260
+ /* SYR2K */
261
+ cublasStatus_t CUBLASWINAPI cublasXtSsyr2k(cublasXtHandle_t handle,
262
+ cublasFillMode_t uplo,
263
+ cublasOperation_t trans,
264
+ size_t n,
265
+ size_t k,
266
+ const float* alpha,
267
+ const float* A,
268
+ size_t lda,
269
+ const float* B,
270
+ size_t ldb,
271
+ const float* beta,
272
+ float* C,
273
+ size_t ldc);
274
+
275
+ cublasStatus_t CUBLASWINAPI cublasXtDsyr2k(cublasXtHandle_t handle,
276
+ cublasFillMode_t uplo,
277
+ cublasOperation_t trans,
278
+ size_t n,
279
+ size_t k,
280
+ const double* alpha,
281
+ const double* A,
282
+ size_t lda,
283
+ const double* B,
284
+ size_t ldb,
285
+ const double* beta,
286
+ double* C,
287
+ size_t ldc);
288
+
289
+ cublasStatus_t CUBLASWINAPI cublasXtCsyr2k(cublasXtHandle_t handle,
290
+ cublasFillMode_t uplo,
291
+ cublasOperation_t trans,
292
+ size_t n,
293
+ size_t k,
294
+ const cuComplex* alpha,
295
+ const cuComplex* A,
296
+ size_t lda,
297
+ const cuComplex* B,
298
+ size_t ldb,
299
+ const cuComplex* beta,
300
+ cuComplex* C,
301
+ size_t ldc);
302
+
303
+ cublasStatus_t CUBLASWINAPI cublasXtZsyr2k(cublasXtHandle_t handle,
304
+ cublasFillMode_t uplo,
305
+ cublasOperation_t trans,
306
+ size_t n,
307
+ size_t k,
308
+ const cuDoubleComplex* alpha,
309
+ const cuDoubleComplex* A,
310
+ size_t lda,
311
+ const cuDoubleComplex* B,
312
+ size_t ldb,
313
+ const cuDoubleComplex* beta,
314
+ cuDoubleComplex* C,
315
+ size_t ldc);
316
+ /* -------------------------------------------------------------------- */
317
+ /* HERKX : variant extension of HERK */
318
+ cublasStatus_t CUBLASWINAPI cublasXtCherkx(cublasXtHandle_t handle,
319
+ cublasFillMode_t uplo,
320
+ cublasOperation_t trans,
321
+ size_t n,
322
+ size_t k,
323
+ const cuComplex* alpha,
324
+ const cuComplex* A,
325
+ size_t lda,
326
+ const cuComplex* B,
327
+ size_t ldb,
328
+ const float* beta,
329
+ cuComplex* C,
330
+ size_t ldc);
331
+
332
+ cublasStatus_t CUBLASWINAPI cublasXtZherkx(cublasXtHandle_t handle,
333
+ cublasFillMode_t uplo,
334
+ cublasOperation_t trans,
335
+ size_t n,
336
+ size_t k,
337
+ const cuDoubleComplex* alpha,
338
+ const cuDoubleComplex* A,
339
+ size_t lda,
340
+ const cuDoubleComplex* B,
341
+ size_t ldb,
342
+ const double* beta,
343
+ cuDoubleComplex* C,
344
+ size_t ldc);
345
+
346
+ /* -------------------------------------------------------------------- */
347
+ /* TRSM */
348
+ cublasStatus_t CUBLASWINAPI cublasXtStrsm(cublasXtHandle_t handle,
349
+ cublasSideMode_t side,
350
+ cublasFillMode_t uplo,
351
+ cublasOperation_t trans,
352
+ cublasDiagType_t diag,
353
+ size_t m,
354
+ size_t n,
355
+ const float* alpha,
356
+ const float* A,
357
+ size_t lda,
358
+ float* B,
359
+ size_t ldb);
360
+
361
+ cublasStatus_t CUBLASWINAPI cublasXtDtrsm(cublasXtHandle_t handle,
362
+ cublasSideMode_t side,
363
+ cublasFillMode_t uplo,
364
+ cublasOperation_t trans,
365
+ cublasDiagType_t diag,
366
+ size_t m,
367
+ size_t n,
368
+ const double* alpha,
369
+ const double* A,
370
+ size_t lda,
371
+ double* B,
372
+ size_t ldb);
373
+
374
+ cublasStatus_t CUBLASWINAPI cublasXtCtrsm(cublasXtHandle_t handle,
375
+ cublasSideMode_t side,
376
+ cublasFillMode_t uplo,
377
+ cublasOperation_t trans,
378
+ cublasDiagType_t diag,
379
+ size_t m,
380
+ size_t n,
381
+ const cuComplex* alpha,
382
+ const cuComplex* A,
383
+ size_t lda,
384
+ cuComplex* B,
385
+ size_t ldb);
386
+
387
+ cublasStatus_t CUBLASWINAPI cublasXtZtrsm(cublasXtHandle_t handle,
388
+ cublasSideMode_t side,
389
+ cublasFillMode_t uplo,
390
+ cublasOperation_t trans,
391
+ cublasDiagType_t diag,
392
+ size_t m,
393
+ size_t n,
394
+ const cuDoubleComplex* alpha,
395
+ const cuDoubleComplex* A,
396
+ size_t lda,
397
+ cuDoubleComplex* B,
398
+ size_t ldb);
399
+ /* -------------------------------------------------------------------- */
400
+ /* SYMM : Symmetric Multiply Matrix*/
401
+ cublasStatus_t CUBLASWINAPI cublasXtSsymm(cublasXtHandle_t handle,
402
+ cublasSideMode_t side,
403
+ cublasFillMode_t uplo,
404
+ size_t m,
405
+ size_t n,
406
+ const float* alpha,
407
+ const float* A,
408
+ size_t lda,
409
+ const float* B,
410
+ size_t ldb,
411
+ const float* beta,
412
+ float* C,
413
+ size_t ldc);
414
+
415
+ cublasStatus_t CUBLASWINAPI cublasXtDsymm(cublasXtHandle_t handle,
416
+ cublasSideMode_t side,
417
+ cublasFillMode_t uplo,
418
+ size_t m,
419
+ size_t n,
420
+ const double* alpha,
421
+ const double* A,
422
+ size_t lda,
423
+ const double* B,
424
+ size_t ldb,
425
+ const double* beta,
426
+ double* C,
427
+ size_t ldc);
428
+
429
+ cublasStatus_t CUBLASWINAPI cublasXtCsymm(cublasXtHandle_t handle,
430
+ cublasSideMode_t side,
431
+ cublasFillMode_t uplo,
432
+ size_t m,
433
+ size_t n,
434
+ const cuComplex* alpha,
435
+ const cuComplex* A,
436
+ size_t lda,
437
+ const cuComplex* B,
438
+ size_t ldb,
439
+ const cuComplex* beta,
440
+ cuComplex* C,
441
+ size_t ldc);
442
+
443
+ cublasStatus_t CUBLASWINAPI cublasXtZsymm(cublasXtHandle_t handle,
444
+ cublasSideMode_t side,
445
+ cublasFillMode_t uplo,
446
+ size_t m,
447
+ size_t n,
448
+ const cuDoubleComplex* alpha,
449
+ const cuDoubleComplex* A,
450
+ size_t lda,
451
+ const cuDoubleComplex* B,
452
+ size_t ldb,
453
+ const cuDoubleComplex* beta,
454
+ cuDoubleComplex* C,
455
+ size_t ldc);
456
+ /* -------------------------------------------------------------------- */
457
+ /* HEMM : Hermitian Matrix Multiply */
458
+ cublasStatus_t CUBLASWINAPI cublasXtChemm(cublasXtHandle_t handle,
459
+ cublasSideMode_t side,
460
+ cublasFillMode_t uplo,
461
+ size_t m,
462
+ size_t n,
463
+ const cuComplex* alpha,
464
+ const cuComplex* A,
465
+ size_t lda,
466
+ const cuComplex* B,
467
+ size_t ldb,
468
+ const cuComplex* beta,
469
+ cuComplex* C,
470
+ size_t ldc);
471
+
472
+ cublasStatus_t CUBLASWINAPI cublasXtZhemm(cublasXtHandle_t handle,
473
+ cublasSideMode_t side,
474
+ cublasFillMode_t uplo,
475
+ size_t m,
476
+ size_t n,
477
+ const cuDoubleComplex* alpha,
478
+ const cuDoubleComplex* A,
479
+ size_t lda,
480
+ const cuDoubleComplex* B,
481
+ size_t ldb,
482
+ const cuDoubleComplex* beta,
483
+ cuDoubleComplex* C,
484
+ size_t ldc);
485
+
486
+ /* -------------------------------------------------------------------- */
487
+ /* SYRKX : variant extension of SYRK */
488
+ cublasStatus_t CUBLASWINAPI cublasXtSsyrkx(cublasXtHandle_t handle,
489
+ cublasFillMode_t uplo,
490
+ cublasOperation_t trans,
491
+ size_t n,
492
+ size_t k,
493
+ const float* alpha,
494
+ const float* A,
495
+ size_t lda,
496
+ const float* B,
497
+ size_t ldb,
498
+ const float* beta,
499
+ float* C,
500
+ size_t ldc);
501
+
502
+ cublasStatus_t CUBLASWINAPI cublasXtDsyrkx(cublasXtHandle_t handle,
503
+ cublasFillMode_t uplo,
504
+ cublasOperation_t trans,
505
+ size_t n,
506
+ size_t k,
507
+ const double* alpha,
508
+ const double* A,
509
+ size_t lda,
510
+ const double* B,
511
+ size_t ldb,
512
+ const double* beta,
513
+ double* C,
514
+ size_t ldc);
515
+
516
+ cublasStatus_t CUBLASWINAPI cublasXtCsyrkx(cublasXtHandle_t handle,
517
+ cublasFillMode_t uplo,
518
+ cublasOperation_t trans,
519
+ size_t n,
520
+ size_t k,
521
+ const cuComplex* alpha,
522
+ const cuComplex* A,
523
+ size_t lda,
524
+ const cuComplex* B,
525
+ size_t ldb,
526
+ const cuComplex* beta,
527
+ cuComplex* C,
528
+ size_t ldc);
529
+
530
+ cublasStatus_t CUBLASWINAPI cublasXtZsyrkx(cublasXtHandle_t handle,
531
+ cublasFillMode_t uplo,
532
+ cublasOperation_t trans,
533
+ size_t n,
534
+ size_t k,
535
+ const cuDoubleComplex* alpha,
536
+ const cuDoubleComplex* A,
537
+ size_t lda,
538
+ const cuDoubleComplex* B,
539
+ size_t ldb,
540
+ const cuDoubleComplex* beta,
541
+ cuDoubleComplex* C,
542
+ size_t ldc);
543
+ /* -------------------------------------------------------------------- */
544
+ /* HER2K : Hermitian rank-2k update */
545
+ cublasStatus_t CUBLASWINAPI cublasXtCher2k(cublasXtHandle_t handle,
546
+ cublasFillMode_t uplo,
547
+ cublasOperation_t trans,
548
+ size_t n,
549
+ size_t k,
550
+ const cuComplex* alpha,
551
+ const cuComplex* A,
552
+ size_t lda,
553
+ const cuComplex* B,
554
+ size_t ldb,
555
+ const float* beta,
556
+ cuComplex* C,
557
+ size_t ldc);
558
+
559
+ cublasStatus_t CUBLASWINAPI cublasXtZher2k(cublasXtHandle_t handle,
560
+ cublasFillMode_t uplo,
561
+ cublasOperation_t trans,
562
+ size_t n,
563
+ size_t k,
564
+ const cuDoubleComplex* alpha,
565
+ const cuDoubleComplex* A,
566
+ size_t lda,
567
+ const cuDoubleComplex* B,
568
+ size_t ldb,
569
+ const double* beta,
570
+ cuDoubleComplex* C,
571
+ size_t ldc);
572
+
573
+ /* -------------------------------------------------------------------- */
574
+ /* SPMM : Symmetric Packed Multiply Matrix*/
575
+ cublasStatus_t CUBLASWINAPI cublasXtSspmm(cublasXtHandle_t handle,
576
+ cublasSideMode_t side,
577
+ cublasFillMode_t uplo,
578
+ size_t m,
579
+ size_t n,
580
+ const float* alpha,
581
+ const float* AP,
582
+ const float* B,
583
+ size_t ldb,
584
+ const float* beta,
585
+ float* C,
586
+ size_t ldc);
587
+
588
+ cublasStatus_t CUBLASWINAPI cublasXtDspmm(cublasXtHandle_t handle,
589
+ cublasSideMode_t side,
590
+ cublasFillMode_t uplo,
591
+ size_t m,
592
+ size_t n,
593
+ const double* alpha,
594
+ const double* AP,
595
+ const double* B,
596
+ size_t ldb,
597
+ const double* beta,
598
+ double* C,
599
+ size_t ldc);
600
+
601
+ cublasStatus_t CUBLASWINAPI cublasXtCspmm(cublasXtHandle_t handle,
602
+ cublasSideMode_t side,
603
+ cublasFillMode_t uplo,
604
+ size_t m,
605
+ size_t n,
606
+ const cuComplex* alpha,
607
+ const cuComplex* AP,
608
+ const cuComplex* B,
609
+ size_t ldb,
610
+ const cuComplex* beta,
611
+ cuComplex* C,
612
+ size_t ldc);
613
+
614
+ cublasStatus_t CUBLASWINAPI cublasXtZspmm(cublasXtHandle_t handle,
615
+ cublasSideMode_t side,
616
+ cublasFillMode_t uplo,
617
+ size_t m,
618
+ size_t n,
619
+ const cuDoubleComplex* alpha,
620
+ const cuDoubleComplex* AP,
621
+ const cuDoubleComplex* B,
622
+ size_t ldb,
623
+ const cuDoubleComplex* beta,
624
+ cuDoubleComplex* C,
625
+ size_t ldc);
626
+
627
+ /* -------------------------------------------------------------------- */
628
+ /* TRMM */
629
+ cublasStatus_t CUBLASWINAPI cublasXtStrmm(cublasXtHandle_t handle,
630
+ cublasSideMode_t side,
631
+ cublasFillMode_t uplo,
632
+ cublasOperation_t trans,
633
+ cublasDiagType_t diag,
634
+ size_t m,
635
+ size_t n,
636
+ const float* alpha,
637
+ const float* A,
638
+ size_t lda,
639
+ const float* B,
640
+ size_t ldb,
641
+ float* C,
642
+ size_t ldc);
643
+
644
+ cublasStatus_t CUBLASWINAPI cublasXtDtrmm(cublasXtHandle_t handle,
645
+ cublasSideMode_t side,
646
+ cublasFillMode_t uplo,
647
+ cublasOperation_t trans,
648
+ cublasDiagType_t diag,
649
+ size_t m,
650
+ size_t n,
651
+ const double* alpha,
652
+ const double* A,
653
+ size_t lda,
654
+ const double* B,
655
+ size_t ldb,
656
+ double* C,
657
+ size_t ldc);
658
+
659
+ cublasStatus_t CUBLASWINAPI cublasXtCtrmm(cublasXtHandle_t handle,
660
+ cublasSideMode_t side,
661
+ cublasFillMode_t uplo,
662
+ cublasOperation_t trans,
663
+ cublasDiagType_t diag,
664
+ size_t m,
665
+ size_t n,
666
+ const cuComplex* alpha,
667
+ const cuComplex* A,
668
+ size_t lda,
669
+ const cuComplex* B,
670
+ size_t ldb,
671
+ cuComplex* C,
672
+ size_t ldc);
673
+
674
+ cublasStatus_t CUBLASWINAPI cublasXtZtrmm(cublasXtHandle_t handle,
675
+ cublasSideMode_t side,
676
+ cublasFillMode_t uplo,
677
+ cublasOperation_t trans,
678
+ cublasDiagType_t diag,
679
+ size_t m,
680
+ size_t n,
681
+ const cuDoubleComplex* alpha,
682
+ const cuDoubleComplex* A,
683
+ size_t lda,
684
+ const cuDoubleComplex* B,
685
+ size_t ldb,
686
+ cuDoubleComplex* C,
687
+ size_t ldc);
688
+
689
+ #if defined(__cplusplus)
690
+ }
691
+ #endif /* __cplusplus */
692
+
693
+ #endif /* !defined(CUBLAS_XT_H_) */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublas_api.h ADDED
The diff for this file is too large to render. See raw diff
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/cublas_v2.h ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * This is the public header file for the new CUBLAS library API, it mapped the generic
52
+ * Cublas name functions to the actual _v2 implementations.
53
+ */
54
+
55
+ #if !defined(CUBLAS_V2_H_)
56
+ #define CUBLAS_V2_H_
57
+
58
+ #if defined(CUBLAS_H_)
59
+ #error "It is an error to include both cublas.h and cublas_v2.h"
60
+ #endif
61
+
62
+ #undef CUBLASAPI
63
+ #ifdef __CUDACC__
64
+ #define CUBLASAPI __host__ __device__
65
+ #else
66
+ #define CUBLASAPI
67
+ #endif
68
+
69
+ #include "cublas_api.h"
70
+
71
+ #define cublasCreate cublasCreate_v2
72
+ #define cublasDestroy cublasDestroy_v2
73
+ #define cublasGetVersion cublasGetVersion_v2
74
+ #define cublasSetWorkspace cublasSetWorkspace_v2
75
+ #define cublasSetStream cublasSetStream_v2
76
+ #define cublasGetStream cublasGetStream_v2
77
+ #define cublasGetPointerMode cublasGetPointerMode_v2
78
+ #define cublasSetPointerMode cublasSetPointerMode_v2
79
+
80
+ /* 32-bit integer */
81
+
82
+ /* Blas1 Routines */
83
+
84
+ #define cublasSnrm2 cublasSnrm2_v2
85
+ #define cublasDnrm2 cublasDnrm2_v2
86
+ #define cublasScnrm2 cublasScnrm2_v2
87
+ #define cublasDznrm2 cublasDznrm2_v2
88
+
89
+ #define cublasSdot cublasSdot_v2
90
+ #define cublasDdot cublasDdot_v2
91
+ #define cublasCdotu cublasCdotu_v2
92
+ #define cublasCdotc cublasCdotc_v2
93
+ #define cublasZdotu cublasZdotu_v2
94
+ #define cublasZdotc cublasZdotc_v2
95
+
96
+ #define cublasSscal cublasSscal_v2
97
+ #define cublasDscal cublasDscal_v2
98
+ #define cublasCscal cublasCscal_v2
99
+ #define cublasCsscal cublasCsscal_v2
100
+ #define cublasZscal cublasZscal_v2
101
+ #define cublasZdscal cublasZdscal_v2
102
+
103
+ #define cublasSaxpy cublasSaxpy_v2
104
+ #define cublasDaxpy cublasDaxpy_v2
105
+ #define cublasCaxpy cublasCaxpy_v2
106
+ #define cublasZaxpy cublasZaxpy_v2
107
+
108
+ #define cublasScopy cublasScopy_v2
109
+ #define cublasDcopy cublasDcopy_v2
110
+ #define cublasCcopy cublasCcopy_v2
111
+ #define cublasZcopy cublasZcopy_v2
112
+
113
+ #define cublasSswap cublasSswap_v2
114
+ #define cublasDswap cublasDswap_v2
115
+ #define cublasCswap cublasCswap_v2
116
+ #define cublasZswap cublasZswap_v2
117
+
118
+ #define cublasIsamax cublasIsamax_v2
119
+ #define cublasIdamax cublasIdamax_v2
120
+ #define cublasIcamax cublasIcamax_v2
121
+ #define cublasIzamax cublasIzamax_v2
122
+
123
+ #define cublasIsamin cublasIsamin_v2
124
+ #define cublasIdamin cublasIdamin_v2
125
+ #define cublasIcamin cublasIcamin_v2
126
+ #define cublasIzamin cublasIzamin_v2
127
+
128
+ #define cublasSasum cublasSasum_v2
129
+ #define cublasDasum cublasDasum_v2
130
+ #define cublasScasum cublasScasum_v2
131
+ #define cublasDzasum cublasDzasum_v2
132
+
133
+ #define cublasSrot cublasSrot_v2
134
+ #define cublasDrot cublasDrot_v2
135
+ #define cublasCrot cublasCrot_v2
136
+ #define cublasCsrot cublasCsrot_v2
137
+ #define cublasZrot cublasZrot_v2
138
+ #define cublasZdrot cublasZdrot_v2
139
+
140
+ #define cublasSrotg cublasSrotg_v2
141
+ #define cublasDrotg cublasDrotg_v2
142
+ #define cublasCrotg cublasCrotg_v2
143
+ #define cublasZrotg cublasZrotg_v2
144
+
145
+ #define cublasSrotm cublasSrotm_v2
146
+ #define cublasDrotm cublasDrotm_v2
147
+
148
+ #define cublasSrotmg cublasSrotmg_v2
149
+ #define cublasDrotmg cublasDrotmg_v2
150
+
151
+ /* Blas2 Routines */
152
+
153
+ #define cublasSgemv cublasSgemv_v2
154
+ #define cublasDgemv cublasDgemv_v2
155
+ #define cublasCgemv cublasCgemv_v2
156
+ #define cublasZgemv cublasZgemv_v2
157
+
158
+ #define cublasSgbmv cublasSgbmv_v2
159
+ #define cublasDgbmv cublasDgbmv_v2
160
+ #define cublasCgbmv cublasCgbmv_v2
161
+ #define cublasZgbmv cublasZgbmv_v2
162
+
163
+ #define cublasStrmv cublasStrmv_v2
164
+ #define cublasDtrmv cublasDtrmv_v2
165
+ #define cublasCtrmv cublasCtrmv_v2
166
+ #define cublasZtrmv cublasZtrmv_v2
167
+
168
+ #define cublasStbmv cublasStbmv_v2
169
+ #define cublasDtbmv cublasDtbmv_v2
170
+ #define cublasCtbmv cublasCtbmv_v2
171
+ #define cublasZtbmv cublasZtbmv_v2
172
+
173
+ #define cublasStpmv cublasStpmv_v2
174
+ #define cublasDtpmv cublasDtpmv_v2
175
+ #define cublasCtpmv cublasCtpmv_v2
176
+ #define cublasZtpmv cublasZtpmv_v2
177
+
178
+ #define cublasStrsv cublasStrsv_v2
179
+ #define cublasDtrsv cublasDtrsv_v2
180
+ #define cublasCtrsv cublasCtrsv_v2
181
+ #define cublasZtrsv cublasZtrsv_v2
182
+
183
+ #define cublasStpsv cublasStpsv_v2
184
+ #define cublasDtpsv cublasDtpsv_v2
185
+ #define cublasCtpsv cublasCtpsv_v2
186
+ #define cublasZtpsv cublasZtpsv_v2
187
+
188
+ #define cublasStbsv cublasStbsv_v2
189
+ #define cublasDtbsv cublasDtbsv_v2
190
+ #define cublasCtbsv cublasCtbsv_v2
191
+ #define cublasZtbsv cublasZtbsv_v2
192
+
193
+ #define cublasSsymv cublasSsymv_v2
194
+ #define cublasDsymv cublasDsymv_v2
195
+ #define cublasCsymv cublasCsymv_v2
196
+ #define cublasZsymv cublasZsymv_v2
197
+ #define cublasChemv cublasChemv_v2
198
+ #define cublasZhemv cublasZhemv_v2
199
+
200
+ #define cublasSsbmv cublasSsbmv_v2
201
+ #define cublasDsbmv cublasDsbmv_v2
202
+ #define cublasChbmv cublasChbmv_v2
203
+ #define cublasZhbmv cublasZhbmv_v2
204
+
205
+ #define cublasSspmv cublasSspmv_v2
206
+ #define cublasDspmv cublasDspmv_v2
207
+ #define cublasChpmv cublasChpmv_v2
208
+ #define cublasZhpmv cublasZhpmv_v2
209
+
210
+ #define cublasSger cublasSger_v2
211
+ #define cublasDger cublasDger_v2
212
+ #define cublasCgeru cublasCgeru_v2
213
+ #define cublasCgerc cublasCgerc_v2
214
+ #define cublasZgeru cublasZgeru_v2
215
+ #define cublasZgerc cublasZgerc_v2
216
+
217
+ #define cublasSsyr cublasSsyr_v2
218
+ #define cublasDsyr cublasDsyr_v2
219
+ #define cublasCsyr cublasCsyr_v2
220
+ #define cublasZsyr cublasZsyr_v2
221
+ #define cublasCher cublasCher_v2
222
+ #define cublasZher cublasZher_v2
223
+
224
+ #define cublasSspr cublasSspr_v2
225
+ #define cublasDspr cublasDspr_v2
226
+ #define cublasChpr cublasChpr_v2
227
+ #define cublasZhpr cublasZhpr_v2
228
+
229
+ #define cublasSsyr2 cublasSsyr2_v2
230
+ #define cublasDsyr2 cublasDsyr2_v2
231
+ #define cublasCsyr2 cublasCsyr2_v2
232
+ #define cublasZsyr2 cublasZsyr2_v2
233
+ #define cublasCher2 cublasCher2_v2
234
+ #define cublasZher2 cublasZher2_v2
235
+
236
+ #define cublasSspr2 cublasSspr2_v2
237
+ #define cublasDspr2 cublasDspr2_v2
238
+ #define cublasChpr2 cublasChpr2_v2
239
+ #define cublasZhpr2 cublasZhpr2_v2
240
+
241
+ /* Blas3 Routines */
242
+
243
+ #define cublasSgemm cublasSgemm_v2
244
+ #define cublasDgemm cublasDgemm_v2
245
+ #define cublasCgemm cublasCgemm_v2
246
+ #define cublasZgemm cublasZgemm_v2
247
+
248
+ #define cublasSsyrk cublasSsyrk_v2
249
+ #define cublasDsyrk cublasDsyrk_v2
250
+ #define cublasCsyrk cublasCsyrk_v2
251
+ #define cublasZsyrk cublasZsyrk_v2
252
+ #define cublasCherk cublasCherk_v2
253
+ #define cublasZherk cublasZherk_v2
254
+
255
+ #define cublasSsyr2k cublasSsyr2k_v2
256
+ #define cublasDsyr2k cublasDsyr2k_v2
257
+ #define cublasCsyr2k cublasCsyr2k_v2
258
+ #define cublasZsyr2k cublasZsyr2k_v2
259
+ #define cublasCher2k cublasCher2k_v2
260
+ #define cublasZher2k cublasZher2k_v2
261
+
262
+ #define cublasSsymm cublasSsymm_v2
263
+ #define cublasDsymm cublasDsymm_v2
264
+ #define cublasCsymm cublasCsymm_v2
265
+ #define cublasZsymm cublasZsymm_v2
266
+ #define cublasChemm cublasChemm_v2
267
+ #define cublasZhemm cublasZhemm_v2
268
+
269
+ #define cublasStrsm cublasStrsm_v2
270
+ #define cublasDtrsm cublasDtrsm_v2
271
+ #define cublasCtrsm cublasCtrsm_v2
272
+ #define cublasZtrsm cublasZtrsm_v2
273
+
274
+ #define cublasStrmm cublasStrmm_v2
275
+ #define cublasDtrmm cublasDtrmm_v2
276
+ #define cublasCtrmm cublasCtrmm_v2
277
+ #define cublasZtrmm cublasZtrmm_v2
278
+
279
+ /* 64-bit integer */
280
+
281
+ /* Blas1 Routines */
282
+
283
+ #define cublasSnrm2_64 cublasSnrm2_v2_64
284
+ #define cublasDnrm2_64 cublasDnrm2_v2_64
285
+ #define cublasScnrm2_64 cublasScnrm2_v2_64
286
+ #define cublasDznrm2_64 cublasDznrm2_v2_64
287
+
288
+ #define cublasSdot_64 cublasSdot_v2_64
289
+ #define cublasDdot_64 cublasDdot_v2_64
290
+ #define cublasCdotu_64 cublasCdotu_v2_64
291
+ #define cublasCdotc_64 cublasCdotc_v2_64
292
+ #define cublasZdotu_64 cublasZdotu_v2_64
293
+ #define cublasZdotc_64 cublasZdotc_v2_64
294
+
295
+ #define cublasSscal_64 cublasSscal_v2_64
296
+ #define cublasDscal_64 cublasDscal_v2_64
297
+ #define cublasCscal_64 cublasCscal_v2_64
298
+ #define cublasCsscal_64 cublasCsscal_v2_64
299
+ #define cublasZscal_64 cublasZscal_v2_64
300
+ #define cublasZdscal_64 cublasZdscal_v2_64
301
+
302
+ #define cublasSaxpy_64 cublasSaxpy_v2_64
303
+ #define cublasDaxpy_64 cublasDaxpy_v2_64
304
+ #define cublasCaxpy_64 cublasCaxpy_v2_64
305
+ #define cublasZaxpy_64 cublasZaxpy_v2_64
306
+
307
+ #define cublasScopy_64 cublasScopy_v2_64
308
+ #define cublasDcopy_64 cublasDcopy_v2_64
309
+ #define cublasCcopy_64 cublasCcopy_v2_64
310
+ #define cublasZcopy_64 cublasZcopy_v2_64
311
+
312
+ #define cublasSswap_64 cublasSswap_v2_64
313
+ #define cublasDswap_64 cublasDswap_v2_64
314
+ #define cublasCswap_64 cublasCswap_v2_64
315
+ #define cublasZswap_64 cublasZswap_v2_64
316
+
317
+ #define cublasIsamax_64 cublasIsamax_v2_64
318
+ #define cublasIdamax_64 cublasIdamax_v2_64
319
+ #define cublasIcamax_64 cublasIcamax_v2_64
320
+ #define cublasIzamax_64 cublasIzamax_v2_64
321
+
322
+ #define cublasIsamin_64 cublasIsamin_v2_64
323
+ #define cublasIdamin_64 cublasIdamin_v2_64
324
+ #define cublasIcamin_64 cublasIcamin_v2_64
325
+ #define cublasIzamin_64 cublasIzamin_v2_64
326
+
327
+ #define cublasSasum_64 cublasSasum_v2_64
328
+ #define cublasDasum_64 cublasDasum_v2_64
329
+ #define cublasScasum_64 cublasScasum_v2_64
330
+ #define cublasDzasum_64 cublasDzasum_v2_64
331
+
332
+ #define cublasSrot_64 cublasSrot_v2_64
333
+ #define cublasDrot_64 cublasDrot_v2_64
334
+ #define cublasCrot_64 cublasCrot_v2_64
335
+ #define cublasCsrot_64 cublasCsrot_v2_64
336
+ #define cublasZrot_64 cublasZrot_v2_64
337
+ #define cublasZdrot_64 cublasZdrot_v2_64
338
+
339
+ #define cublasSrotg_64 cublasSrotg_v2_64
340
+ #define cublasDrotg_64 cublasDrotg_v2_64
341
+ #define cublasCrotg_64 cublasCrotg_v2_64
342
+ #define cublasZrotg_64 cublasZrotg_v2_64
343
+
344
+ #define cublasSrotm_64 cublasSrotm_v2_64
345
+ #define cublasDrotm_64 cublasDrotm_v2_64
346
+
347
+ #define cublasSrotmg_64 cublasSrotmg_v2_64
348
+ #define cublasDrotmg_64 cublasDrotmg_v2_64
349
+
350
+ /* Blas2 Routines */
351
+
352
+ #define cublasSgemv_64 cublasSgemv_v2_64
353
+ #define cublasDgemv_64 cublasDgemv_v2_64
354
+ #define cublasCgemv_64 cublasCgemv_v2_64
355
+ #define cublasZgemv_64 cublasZgemv_v2_64
356
+
357
+ #define cublasSgbmv_64 cublasSgbmv_v2_64
358
+ #define cublasDgbmv_64 cublasDgbmv_v2_64
359
+ #define cublasCgbmv_64 cublasCgbmv_v2_64
360
+ #define cublasZgbmv_64 cublasZgbmv_v2_64
361
+
362
+ #define cublasStrmv_64 cublasStrmv_v2_64
363
+ #define cublasDtrmv_64 cublasDtrmv_v2_64
364
+ #define cublasCtrmv_64 cublasCtrmv_v2_64
365
+ #define cublasZtrmv_64 cublasZtrmv_v2_64
366
+
367
+ #define cublasStbmv_64 cublasStbmv_v2_64
368
+ #define cublasDtbmv_64 cublasDtbmv_v2_64
369
+ #define cublasCtbmv_64 cublasCtbmv_v2_64
370
+ #define cublasZtbmv_64 cublasZtbmv_v2_64
371
+
372
+ #define cublasStpmv_64 cublasStpmv_v2_64
373
+ #define cublasDtpmv_64 cublasDtpmv_v2_64
374
+ #define cublasCtpmv_64 cublasCtpmv_v2_64
375
+ #define cublasZtpmv_64 cublasZtpmv_v2_64
376
+
377
+ #define cublasStrsv_64 cublasStrsv_v2_64
378
+ #define cublasDtrsv_64 cublasDtrsv_v2_64
379
+ #define cublasCtrsv_64 cublasCtrsv_v2_64
380
+ #define cublasZtrsv_64 cublasZtrsv_v2_64
381
+
382
+ #define cublasStpsv_64 cublasStpsv_v2_64
383
+ #define cublasDtpsv_64 cublasDtpsv_v2_64
384
+ #define cublasCtpsv_64 cublasCtpsv_v2_64
385
+ #define cublasZtpsv_64 cublasZtpsv_v2_64
386
+
387
+ #define cublasStbsv_64 cublasStbsv_v2_64
388
+ #define cublasDtbsv_64 cublasDtbsv_v2_64
389
+ #define cublasCtbsv_64 cublasCtbsv_v2_64
390
+ #define cublasZtbsv_64 cublasZtbsv_v2_64
391
+
392
+ #define cublasSsymv_64 cublasSsymv_v2_64
393
+ #define cublasDsymv_64 cublasDsymv_v2_64
394
+ #define cublasCsymv_64 cublasCsymv_v2_64
395
+ #define cublasZsymv_64 cublasZsymv_v2_64
396
+ #define cublasChemv_64 cublasChemv_v2_64
397
+ #define cublasZhemv_64 cublasZhemv_v2_64
398
+
399
+ #define cublasSsbmv_64 cublasSsbmv_v2_64
400
+ #define cublasDsbmv_64 cublasDsbmv_v2_64
401
+ #define cublasChbmv_64 cublasChbmv_v2_64
402
+ #define cublasZhbmv_64 cublasZhbmv_v2_64
403
+
404
+ #define cublasSspmv_64 cublasSspmv_v2_64
405
+ #define cublasDspmv_64 cublasDspmv_v2_64
406
+ #define cublasChpmv_64 cublasChpmv_v2_64
407
+ #define cublasZhpmv_64 cublasZhpmv_v2_64
408
+
409
+ #define cublasSger_64 cublasSger_v2_64
410
+ #define cublasDger_64 cublasDger_v2_64
411
+ #define cublasCgeru_64 cublasCgeru_v2_64
412
+ #define cublasCgerc_64 cublasCgerc_v2_64
413
+ #define cublasZgeru_64 cublasZgeru_v2_64
414
+ #define cublasZgerc_64 cublasZgerc_v2_64
415
+
416
+ #define cublasSsyr_64 cublasSsyr_v2_64
417
+ #define cublasDsyr_64 cublasDsyr_v2_64
418
+ #define cublasCsyr_64 cublasCsyr_v2_64
419
+ #define cublasZsyr_64 cublasZsyr_v2_64
420
+ #define cublasCher_64 cublasCher_v2_64
421
+ #define cublasZher_64 cublasZher_v2_64
422
+
423
+ #define cublasSspr_64 cublasSspr_v2_64
424
+ #define cublasDspr_64 cublasDspr_v2_64
425
+ #define cublasChpr_64 cublasChpr_v2_64
426
+ #define cublasZhpr_64 cublasZhpr_v2_64
427
+
428
+ #define cublasSsyr2_64 cublasSsyr2_v2_64
429
+ #define cublasDsyr2_64 cublasDsyr2_v2_64
430
+ #define cublasCsyr2_64 cublasCsyr2_v2_64
431
+ #define cublasZsyr2_64 cublasZsyr2_v2_64
432
+ #define cublasCher2_64 cublasCher2_v2_64
433
+ #define cublasZher2_64 cublasZher2_v2_64
434
+
435
+ #define cublasSspr2_64 cublasSspr2_v2_64
436
+ #define cublasDspr2_64 cublasDspr2_v2_64
437
+ #define cublasChpr2_64 cublasChpr2_v2_64
438
+ #define cublasZhpr2_64 cublasZhpr2_v2_64
439
+
440
+ /* Blas3 Routines */
441
+
442
+ #define cublasSgemm_64 cublasSgemm_v2_64
443
+ #define cublasDgemm_64 cublasDgemm_v2_64
444
+ #define cublasCgemm_64 cublasCgemm_v2_64
445
+ #define cublasZgemm_64 cublasZgemm_v2_64
446
+
447
+ #define cublasSsyrk_64 cublasSsyrk_v2_64
448
+ #define cublasDsyrk_64 cublasDsyrk_v2_64
449
+ #define cublasCsyrk_64 cublasCsyrk_v2_64
450
+ #define cublasZsyrk_64 cublasZsyrk_v2_64
451
+ #define cublasCherk_64 cublasCherk_v2_64
452
+ #define cublasZherk_64 cublasZherk_v2_64
453
+
454
+ #define cublasSsyr2k_64 cublasSsyr2k_v2_64
455
+ #define cublasDsyr2k_64 cublasDsyr2k_v2_64
456
+ #define cublasCsyr2k_64 cublasCsyr2k_v2_64
457
+ #define cublasZsyr2k_64 cublasZsyr2k_v2_64
458
+ #define cublasCher2k_64 cublasCher2k_v2_64
459
+ #define cublasZher2k_64 cublasZher2k_v2_64
460
+
461
+ #define cublasSsymm_64 cublasSsymm_v2_64
462
+ #define cublasDsymm_64 cublasDsymm_v2_64
463
+ #define cublasCsymm_64 cublasCsymm_v2_64
464
+ #define cublasZsymm_64 cublasZsymm_v2_64
465
+ #define cublasChemm_64 cublasChemm_v2_64
466
+ #define cublasZhemm_64 cublasZhemm_v2_64
467
+
468
+ #define cublasStrsm_64 cublasStrsm_v2_64
469
+ #define cublasDtrsm_64 cublasDtrsm_v2_64
470
+ #define cublasCtrsm_64 cublasCtrsm_v2_64
471
+ #define cublasZtrsm_64 cublasZtrsm_v2_64
472
+
473
+ #define cublasStrmm_64 cublasStrmm_v2_64
474
+ #define cublasDtrmm_64 cublasDtrmm_v2_64
475
+ #define cublasCtrmm_64 cublasCtrmm_v2_64
476
+ #define cublasZtrmm_64 cublasZtrmm_v2_64
477
+
478
+ #endif /* !defined(CUBLAS_V2_H_) */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/include/nvblas.h ADDED
@@ -0,0 +1,824 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(NVBLAS_H_)
51
+ #define NVBLAS_H_
52
+
53
+ #include "driver_types.h"
54
+ #include "cuComplex.h" /* import complex data type */
55
+
56
+ #if defined(__cplusplus)
57
+ extern "C" {
58
+ #endif
59
+
60
+ /* GEMM */
61
+ void sgemm_(const char* transa,
62
+ const char* transb,
63
+ const int* m,
64
+ const int* n,
65
+ const int* k,
66
+ const float* alpha,
67
+ const float* a,
68
+ const int* lda,
69
+ const float* b,
70
+ const int* ldb,
71
+ const float* beta,
72
+ float* c,
73
+ const int* ldc);
74
+
75
+ void dgemm_(const char* transa,
76
+ const char* transb,
77
+ const int* m,
78
+ const int* n,
79
+ const int* k,
80
+ const double* alpha,
81
+ const double* a,
82
+ const int* lda,
83
+ const double* b,
84
+ const int* ldb,
85
+ const double* beta,
86
+ double* c,
87
+ const int* ldc);
88
+
89
+ void cgemm_(const char* transa,
90
+ const char* transb,
91
+ const int* m,
92
+ const int* n,
93
+ const int* k,
94
+ const cuComplex* alpha,
95
+ const cuComplex* a,
96
+ const int* lda,
97
+ const cuComplex* b,
98
+ const int* ldb,
99
+ const cuComplex* beta,
100
+ cuComplex* c,
101
+ const int* ldc);
102
+
103
+ void zgemm_(const char* transa,
104
+ const char* transb,
105
+ const int* m,
106
+ const int* n,
107
+ const int* k,
108
+ const cuDoubleComplex* alpha,
109
+ const cuDoubleComplex* a,
110
+ const int* lda,
111
+ const cuDoubleComplex* b,
112
+ const int* ldb,
113
+ const cuDoubleComplex* beta,
114
+ cuDoubleComplex* c,
115
+ const int* ldc);
116
+
117
+ void sgemm(const char* transa,
118
+ const char* transb,
119
+ const int* m,
120
+ const int* n,
121
+ const int* k,
122
+ const float* alpha,
123
+ const float* a,
124
+ const int* lda,
125
+ const float* b,
126
+ const int* ldb,
127
+ const float* beta,
128
+ float* c,
129
+ const int* ldc);
130
+
131
+ void dgemm(const char* transa,
132
+ const char* transb,
133
+ const int* m,
134
+ const int* n,
135
+ const int* k,
136
+ const double* alpha,
137
+ const double* a,
138
+ const int* lda,
139
+ const double* b,
140
+ const int* ldb,
141
+ const double* beta,
142
+ double* c,
143
+ const int* ldc);
144
+
145
+ void cgemm(const char* transa,
146
+ const char* transb,
147
+ const int* m,
148
+ const int* n,
149
+ const int* k,
150
+ const cuComplex* alpha,
151
+ const cuComplex* a,
152
+ const int* lda,
153
+ const cuComplex* b,
154
+ const int* ldb,
155
+ const cuComplex* beta,
156
+ cuComplex* c,
157
+ const int* ldc);
158
+
159
+ void zgemm(const char* transa,
160
+ const char* transb,
161
+ const int* m,
162
+ const int* n,
163
+ const int* k,
164
+ const cuDoubleComplex* alpha,
165
+ const cuDoubleComplex* a,
166
+ const int* lda,
167
+ const cuDoubleComplex* b,
168
+ const int* ldb,
169
+ const cuDoubleComplex* beta,
170
+ cuDoubleComplex* c,
171
+ const int* ldc);
172
+
173
+ /* SYRK */
174
+ void ssyrk_(const char* uplo,
175
+ const char* trans,
176
+ const int* n,
177
+ const int* k,
178
+ const float* alpha,
179
+ const float* a,
180
+ const int* lda,
181
+ const float* beta,
182
+ float* c,
183
+ const int* ldc);
184
+
185
+ void dsyrk_(const char* uplo,
186
+ const char* trans,
187
+ const int* n,
188
+ const int* k,
189
+ const double* alpha,
190
+ const double* a,
191
+ const int* lda,
192
+ const double* beta,
193
+ double* c,
194
+ const int* ldc);
195
+
196
+ void csyrk_(const char* uplo,
197
+ const char* trans,
198
+ const int* n,
199
+ const int* k,
200
+ const cuComplex* alpha,
201
+ const cuComplex* a,
202
+ const int* lda,
203
+ const cuComplex* beta,
204
+ cuComplex* c,
205
+ const int* ldc);
206
+
207
+ void zsyrk_(const char* uplo,
208
+ const char* trans,
209
+ const int* n,
210
+ const int* k,
211
+ const cuDoubleComplex* alpha,
212
+ const cuDoubleComplex* a,
213
+ const int* lda,
214
+ const cuDoubleComplex* beta,
215
+ cuDoubleComplex* c,
216
+ const int* ldc);
217
+
218
+ void ssyrk(const char* uplo,
219
+ const char* trans,
220
+ const int* n,
221
+ const int* k,
222
+ const float* alpha,
223
+ const float* a,
224
+ const int* lda,
225
+ const float* beta,
226
+ float* c,
227
+ const int* ldc);
228
+
229
+ void dsyrk(const char* uplo,
230
+ const char* trans,
231
+ const int* n,
232
+ const int* k,
233
+ const double* alpha,
234
+ const double* a,
235
+ const int* lda,
236
+ const double* beta,
237
+ double* c,
238
+ const int* ldc);
239
+
240
+ void csyrk(const char* uplo,
241
+ const char* trans,
242
+ const int* n,
243
+ const int* k,
244
+ const cuComplex* alpha,
245
+ const cuComplex* a,
246
+ const int* lda,
247
+ const cuComplex* beta,
248
+ cuComplex* c,
249
+ const int* ldc);
250
+
251
+ void zsyrk(const char* uplo,
252
+ const char* trans,
253
+ const int* n,
254
+ const int* k,
255
+ const cuDoubleComplex* alpha,
256
+ const cuDoubleComplex* a,
257
+ const int* lda,
258
+ const cuDoubleComplex* beta,
259
+ cuDoubleComplex* c,
260
+ const int* ldc);
261
+
262
+ /* HERK */
263
+ void cherk_(const char* uplo,
264
+ const char* trans,
265
+ const int* n,
266
+ const int* k,
267
+ const float* alpha,
268
+ const cuComplex* a,
269
+ const int* lda,
270
+ const float* beta,
271
+ cuComplex* c,
272
+ const int* ldc);
273
+
274
+ void zherk_(const char* uplo,
275
+ const char* trans,
276
+ const int* n,
277
+ const int* k,
278
+ const double* alpha,
279
+ const cuDoubleComplex* a,
280
+ const int* lda,
281
+ const double* beta,
282
+ cuDoubleComplex* c,
283
+ const int* ldc);
284
+
285
+ void cherk(const char* uplo,
286
+ const char* trans,
287
+ const int* n,
288
+ const int* k,
289
+ const float* alpha,
290
+ const cuComplex* a,
291
+ const int* lda,
292
+ const float* beta,
293
+ cuComplex* c,
294
+ const int* ldc);
295
+
296
+ void zherk(const char* uplo,
297
+ const char* trans,
298
+ const int* n,
299
+ const int* k,
300
+ const double* alpha,
301
+ const cuDoubleComplex* a,
302
+ const int* lda,
303
+ const double* beta,
304
+ cuDoubleComplex* c,
305
+ const int* ldc);
306
+
307
+ /* TRSM */
308
+ void strsm_(const char* side,
309
+ const char* uplo,
310
+ const char* transa,
311
+ const char* diag,
312
+ const int* m,
313
+ const int* n,
314
+ const float* alpha,
315
+ const float* a,
316
+ const int* lda,
317
+ float* b,
318
+ const int* ldb);
319
+
320
+ void dtrsm_(const char* side,
321
+ const char* uplo,
322
+ const char* transa,
323
+ const char* diag,
324
+ const int* m,
325
+ const int* n,
326
+ const double* alpha,
327
+ const double* a,
328
+ const int* lda,
329
+ double* b,
330
+ const int* ldb);
331
+
332
+ void ctrsm_(const char* side,
333
+ const char* uplo,
334
+ const char* transa,
335
+ const char* diag,
336
+ const int* m,
337
+ const int* n,
338
+ const cuComplex* alpha,
339
+ const cuComplex* a,
340
+ const int* lda,
341
+ cuComplex* b,
342
+ const int* ldb);
343
+
344
+ void ztrsm_(const char* side,
345
+ const char* uplo,
346
+ const char* transa,
347
+ const char* diag,
348
+ const int* m,
349
+ const int* n,
350
+ const cuDoubleComplex* alpha,
351
+ const cuDoubleComplex* a,
352
+ const int* lda,
353
+ cuDoubleComplex* b,
354
+ const int* ldb);
355
+
356
+ void strsm(const char* side,
357
+ const char* uplo,
358
+ const char* transa,
359
+ const char* diag,
360
+ const int* m,
361
+ const int* n,
362
+ const float* alpha,
363
+ const float* a,
364
+ const int* lda,
365
+ float* b,
366
+ const int* ldb);
367
+
368
+ void dtrsm(const char* side,
369
+ const char* uplo,
370
+ const char* transa,
371
+ const char* diag,
372
+ const int* m,
373
+ const int* n,
374
+ const double* alpha,
375
+ const double* a,
376
+ const int* lda,
377
+ double* b,
378
+ const int* ldb);
379
+
380
+ void ctrsm(const char* side,
381
+ const char* uplo,
382
+ const char* transa,
383
+ const char* diag,
384
+ const int* m,
385
+ const int* n,
386
+ const cuComplex* alpha,
387
+ const cuComplex* a,
388
+ const int* lda,
389
+ cuComplex* b,
390
+ const int* ldb);
391
+
392
+ void ztrsm(const char* side,
393
+ const char* uplo,
394
+ const char* transa,
395
+ const char* diag,
396
+ const int* m,
397
+ const int* n,
398
+ const cuDoubleComplex* alpha,
399
+ const cuDoubleComplex* a,
400
+ const int* lda,
401
+ cuDoubleComplex* b,
402
+ const int* ldb);
403
+
404
+ /* SYMM */
405
+ void ssymm_(const char* side,
406
+ const char* uplo,
407
+ const int* m,
408
+ const int* n,
409
+ const float* alpha,
410
+ const float* a,
411
+ const int* lda,
412
+ const float* b,
413
+ const int* ldb,
414
+ const float* beta,
415
+ float* c,
416
+ const int* ldc);
417
+
418
+ void dsymm_(const char* side,
419
+ const char* uplo,
420
+ const int* m,
421
+ const int* n,
422
+ const double* alpha,
423
+ const double* a,
424
+ const int* lda,
425
+ const double* b,
426
+ const int* ldb,
427
+ const double* beta,
428
+ double* c,
429
+ const int* ldc);
430
+
431
+ void csymm_(const char* side,
432
+ const char* uplo,
433
+ const int* m,
434
+ const int* n,
435
+ const cuComplex* alpha,
436
+ const cuComplex* a,
437
+ const int* lda,
438
+ const cuComplex* b,
439
+ const int* ldb,
440
+ const cuComplex* beta,
441
+ cuComplex* c,
442
+ const int* ldc);
443
+
444
+ void zsymm_(const char* side,
445
+ const char* uplo,
446
+ const int* m,
447
+ const int* n,
448
+ const cuDoubleComplex* alpha,
449
+ const cuDoubleComplex* a,
450
+ const int* lda,
451
+ const cuDoubleComplex* b,
452
+ const int* ldb,
453
+ const cuDoubleComplex* beta,
454
+ cuDoubleComplex* c,
455
+ const int* ldc);
456
+
457
+ void ssymm(const char* side,
458
+ const char* uplo,
459
+ const int* m,
460
+ const int* n,
461
+ const float* alpha,
462
+ const float* a,
463
+ const int* lda,
464
+ const float* b,
465
+ const int* ldb,
466
+ const float* beta,
467
+ float* c,
468
+ const int* ldc);
469
+
470
+ void dsymm(const char* side,
471
+ const char* uplo,
472
+ const int* m,
473
+ const int* n,
474
+ const double* alpha,
475
+ const double* a,
476
+ const int* lda,
477
+ const double* b,
478
+ const int* ldb,
479
+ const double* beta,
480
+ double* c,
481
+ const int* ldc);
482
+
483
+ void csymm(const char* side,
484
+ const char* uplo,
485
+ const int* m,
486
+ const int* n,
487
+ const cuComplex* alpha,
488
+ const cuComplex* a,
489
+ const int* lda,
490
+ const cuComplex* b,
491
+ const int* ldb,
492
+ const cuComplex* beta,
493
+ cuComplex* c,
494
+ const int* ldc);
495
+
496
+ void zsymm(const char* side,
497
+ const char* uplo,
498
+ const int* m,
499
+ const int* n,
500
+ const cuDoubleComplex* alpha,
501
+ const cuDoubleComplex* a,
502
+ const int* lda,
503
+ const cuDoubleComplex* b,
504
+ const int* ldb,
505
+ const cuDoubleComplex* beta,
506
+ cuDoubleComplex* c,
507
+ const int* ldc);
508
+
509
+ /* HEMM */
510
+ void chemm_(const char* side,
511
+ const char* uplo,
512
+ const int* m,
513
+ const int* n,
514
+ const cuComplex* alpha,
515
+ const cuComplex* a,
516
+ const int* lda,
517
+ const cuComplex* b,
518
+ const int* ldb,
519
+ const cuComplex* beta,
520
+ cuComplex* c,
521
+ const int* ldc);
522
+
523
+ void zhemm_(const char* side,
524
+ const char* uplo,
525
+ const int* m,
526
+ const int* n,
527
+ const cuDoubleComplex* alpha,
528
+ const cuDoubleComplex* a,
529
+ const int* lda,
530
+ const cuDoubleComplex* b,
531
+ const int* ldb,
532
+ const cuDoubleComplex* beta,
533
+ cuDoubleComplex* c,
534
+ const int* ldc);
535
+
536
+ /* HEMM with no underscore*/
537
+ void chemm(const char* side,
538
+ const char* uplo,
539
+ const int* m,
540
+ const int* n,
541
+ const cuComplex* alpha,
542
+ const cuComplex* a,
543
+ const int* lda,
544
+ const cuComplex* b,
545
+ const int* ldb,
546
+ const cuComplex* beta,
547
+ cuComplex* c,
548
+ const int* ldc);
549
+
550
+ void zhemm(const char* side,
551
+ const char* uplo,
552
+ const int* m,
553
+ const int* n,
554
+ const cuDoubleComplex* alpha,
555
+ const cuDoubleComplex* a,
556
+ const int* lda,
557
+ const cuDoubleComplex* b,
558
+ const int* ldb,
559
+ const cuDoubleComplex* beta,
560
+ cuDoubleComplex* c,
561
+ const int* ldc);
562
+
563
+ /* SYR2K */
564
+ void ssyr2k_(const char* uplo,
565
+ const char* trans,
566
+ const int* n,
567
+ const int* k,
568
+ const float* alpha,
569
+ const float* a,
570
+ const int* lda,
571
+ const float* b,
572
+ const int* ldb,
573
+ const float* beta,
574
+ float* c,
575
+ const int* ldc);
576
+
577
+ void dsyr2k_(const char* uplo,
578
+ const char* trans,
579
+ const int* n,
580
+ const int* k,
581
+ const double* alpha,
582
+ const double* a,
583
+ const int* lda,
584
+ const double* b,
585
+ const int* ldb,
586
+ const double* beta,
587
+ double* c,
588
+ const int* ldc);
589
+
590
+ void csyr2k_(const char* uplo,
591
+ const char* trans,
592
+ const int* n,
593
+ const int* k,
594
+ const cuComplex* alpha,
595
+ const cuComplex* a,
596
+ const int* lda,
597
+ const cuComplex* b,
598
+ const int* ldb,
599
+ const cuComplex* beta,
600
+ cuComplex* c,
601
+ const int* ldc);
602
+
603
+ void zsyr2k_(const char* uplo,
604
+ const char* trans,
605
+ const int* n,
606
+ const int* k,
607
+ const cuDoubleComplex* alpha,
608
+ const cuDoubleComplex* a,
609
+ const int* lda,
610
+ const cuDoubleComplex* b,
611
+ const int* ldb,
612
+ const cuDoubleComplex* beta,
613
+ cuDoubleComplex* c,
614
+ const int* ldc);
615
+
616
+ /* SYR2K no_underscore*/
617
+ void ssyr2k(const char* uplo,
618
+ const char* trans,
619
+ const int* n,
620
+ const int* k,
621
+ const float* alpha,
622
+ const float* a,
623
+ const int* lda,
624
+ const float* b,
625
+ const int* ldb,
626
+ const float* beta,
627
+ float* c,
628
+ const int* ldc);
629
+
630
+ void dsyr2k(const char* uplo,
631
+ const char* trans,
632
+ const int* n,
633
+ const int* k,
634
+ const double* alpha,
635
+ const double* a,
636
+ const int* lda,
637
+ const double* b,
638
+ const int* ldb,
639
+ const double* beta,
640
+ double* c,
641
+ const int* ldc);
642
+
643
+ void csyr2k(const char* uplo,
644
+ const char* trans,
645
+ const int* n,
646
+ const int* k,
647
+ const cuComplex* alpha,
648
+ const cuComplex* a,
649
+ const int* lda,
650
+ const cuComplex* b,
651
+ const int* ldb,
652
+ const cuComplex* beta,
653
+ cuComplex* c,
654
+ const int* ldc);
655
+
656
+ void zsyr2k(const char* uplo,
657
+ const char* trans,
658
+ const int* n,
659
+ const int* k,
660
+ const cuDoubleComplex* alpha,
661
+ const cuDoubleComplex* a,
662
+ const int* lda,
663
+ const cuDoubleComplex* b,
664
+ const int* ldb,
665
+ const cuDoubleComplex* beta,
666
+ cuDoubleComplex* c,
667
+ const int* ldc);
668
+
669
+ /* HER2K */
670
+ void cher2k_(const char* uplo,
671
+ const char* trans,
672
+ const int* n,
673
+ const int* k,
674
+ const cuComplex* alpha,
675
+ const cuComplex* a,
676
+ const int* lda,
677
+ const cuComplex* b,
678
+ const int* ldb,
679
+ const float* beta,
680
+ cuComplex* c,
681
+ const int* ldc);
682
+
683
+ void zher2k_(const char* uplo,
684
+ const char* trans,
685
+ const int* n,
686
+ const int* k,
687
+ const cuDoubleComplex* alpha,
688
+ const cuDoubleComplex* a,
689
+ const int* lda,
690
+ const cuDoubleComplex* b,
691
+ const int* ldb,
692
+ const double* beta,
693
+ cuDoubleComplex* c,
694
+ const int* ldc);
695
+
696
+ /* HER2K with no underscore */
697
+ void cher2k(const char* uplo,
698
+ const char* trans,
699
+ const int* n,
700
+ const int* k,
701
+ const cuComplex* alpha,
702
+ const cuComplex* a,
703
+ const int* lda,
704
+ const cuComplex* b,
705
+ const int* ldb,
706
+ const float* beta,
707
+ cuComplex* c,
708
+ const int* ldc);
709
+
710
+ void zher2k(const char* uplo,
711
+ const char* trans,
712
+ const int* n,
713
+ const int* k,
714
+ const cuDoubleComplex* alpha,
715
+ const cuDoubleComplex* a,
716
+ const int* lda,
717
+ const cuDoubleComplex* b,
718
+ const int* ldb,
719
+ const double* beta,
720
+ cuDoubleComplex* c,
721
+ const int* ldc);
722
+
723
+ /* TRMM */
724
+ void strmm_(const char* side,
725
+ const char* uplo,
726
+ const char* transa,
727
+ const char* diag,
728
+ const int* m,
729
+ const int* n,
730
+ const float* alpha,
731
+ const float* a,
732
+ const int* lda,
733
+ float* b,
734
+ const int* ldb);
735
+
736
+ void dtrmm_(const char* side,
737
+ const char* uplo,
738
+ const char* transa,
739
+ const char* diag,
740
+ const int* m,
741
+ const int* n,
742
+ const double* alpha,
743
+ const double* a,
744
+ const int* lda,
745
+ double* b,
746
+ const int* ldb);
747
+
748
+ void ctrmm_(const char* side,
749
+ const char* uplo,
750
+ const char* transa,
751
+ const char* diag,
752
+ const int* m,
753
+ const int* n,
754
+ const cuComplex* alpha,
755
+ const cuComplex* a,
756
+ const int* lda,
757
+ cuComplex* b,
758
+ const int* ldb);
759
+
760
+ void ztrmm_(const char* side,
761
+ const char* uplo,
762
+ const char* transa,
763
+ const char* diag,
764
+ const int* m,
765
+ const int* n,
766
+ const cuDoubleComplex* alpha,
767
+ const cuDoubleComplex* a,
768
+ const int* lda,
769
+ cuDoubleComplex* b,
770
+ const int* ldb);
771
+
772
+ void strmm(const char* side,
773
+ const char* uplo,
774
+ const char* transa,
775
+ const char* diag,
776
+ const int* m,
777
+ const int* n,
778
+ const float* alpha,
779
+ const float* a,
780
+ const int* lda,
781
+ float* b,
782
+ const int* ldb);
783
+
784
+ void dtrmm(const char* side,
785
+ const char* uplo,
786
+ const char* transa,
787
+ const char* diag,
788
+ const int* m,
789
+ const int* n,
790
+ const double* alpha,
791
+ const double* a,
792
+ const int* lda,
793
+ double* b,
794
+ const int* ldb);
795
+
796
+ void ctrmm(const char* side,
797
+ const char* uplo,
798
+ const char* transa,
799
+ const char* diag,
800
+ const int* m,
801
+ const int* n,
802
+ const cuComplex* alpha,
803
+ const cuComplex* a,
804
+ const int* lda,
805
+ cuComplex* b,
806
+ const int* ldb);
807
+
808
+ void ztrmm(const char* side,
809
+ const char* uplo,
810
+ const char* transa,
811
+ const char* diag,
812
+ const int* m,
813
+ const int* n,
814
+ const cuDoubleComplex* alpha,
815
+ const cuDoubleComplex* a,
816
+ const int* lda,
817
+ cuDoubleComplex* b,
818
+ const int* ldb);
819
+
820
+ #if defined(__cplusplus)
821
+ }
822
+ #endif /* __cplusplus */
823
+
824
+ #endif /* !defined(NVBLAS_H_) */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/lib/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (223 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_cupti/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (223 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/include/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (231 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h ADDED
@@ -0,0 +1,1141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // NVIDIA_COPYRIGHT_BEGIN
3
+ //
4
+ // Copyright (c) 2014-2024, NVIDIA CORPORATION. All rights reserved.
5
+ //
6
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
7
+ // and proprietary rights in and to this software, related documentation
8
+ // and any modifications thereto. Any use, reproduction, disclosure or
9
+ // distribution of this software and related documentation without an express
10
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
11
+ //
12
+ // NVIDIA_COPYRIGHT_END
13
+ //
14
+
15
+ #ifndef __NVRTC_H__
16
+ #define __NVRTC_H__
17
+
18
+ #ifdef __cplusplus
19
+ extern "C" {
20
+ #endif /* __cplusplus */
21
+
22
+ #include <stdlib.h>
23
+
24
+
25
+ /*************************************************************************//**
26
+ *
27
+ * \defgroup error Error Handling
28
+ *
29
+ * NVRTC defines the following enumeration type and function for API call
30
+ * error handling.
31
+ *
32
+ ****************************************************************************/
33
+
34
+
35
+ /**
36
+ * \ingroup error
37
+ * \brief The enumerated type nvrtcResult defines API call result codes.
38
+ * NVRTC API functions return nvrtcResult to indicate the call
39
+ * result.
40
+ */
41
+ typedef enum {
42
+ NVRTC_SUCCESS = 0,
43
+ NVRTC_ERROR_OUT_OF_MEMORY = 1,
44
+ NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
45
+ NVRTC_ERROR_INVALID_INPUT = 3,
46
+ NVRTC_ERROR_INVALID_PROGRAM = 4,
47
+ NVRTC_ERROR_INVALID_OPTION = 5,
48
+ NVRTC_ERROR_COMPILATION = 6,
49
+ NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
50
+ NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
51
+ NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
52
+ NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
53
+ NVRTC_ERROR_INTERNAL_ERROR = 11,
54
+ NVRTC_ERROR_TIME_FILE_WRITE_FAILED = 12,
55
+ NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED = 13,
56
+ NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED = 14,
57
+ NVRTC_ERROR_PCH_CREATE = 15,
58
+ NVRTC_ERROR_CANCELLED = 16
59
+ } nvrtcResult;
60
+
61
+
62
+ /**
63
+ * \ingroup error
64
+ * \brief nvrtcGetErrorString is a helper function that returns a string
65
+ * describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
66
+ * \c "NVRTC_SUCCESS".
67
+ * For unrecognized enumeration values, it returns
68
+ * \c "NVRTC_ERROR unknown".
69
+ *
70
+ * \param [in] result CUDA Runtime Compilation API result code.
71
+ * \return Message string for the given #nvrtcResult code.
72
+ */
73
+ const char *nvrtcGetErrorString(nvrtcResult result);
74
+
75
+
76
+ /*************************************************************************//**
77
+ *
78
+ * \defgroup query General Information Query
79
+ *
80
+ * NVRTC defines the following function for general information query.
81
+ *
82
+ ****************************************************************************/
83
+
84
+
85
+ /**
86
+ * \ingroup query
87
+ * \brief nvrtcVersion sets the output parameters \p major and \p minor
88
+ * with the CUDA Runtime Compilation version number.
89
+ *
90
+ * \param [out] major CUDA Runtime Compilation major version number.
91
+ * \param [out] minor CUDA Runtime Compilation minor version number.
92
+ * \return
93
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
94
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
95
+ *
96
+ */
97
+ nvrtcResult nvrtcVersion(int *major, int *minor);
98
+
99
+
100
+ /**
101
+ * \ingroup query
102
+ * \brief nvrtcGetNumSupportedArchs sets the output parameter \p numArchs
103
+ * with the number of architectures supported by NVRTC. This can
104
+ * then be used to pass an array to ::nvrtcGetSupportedArchs to
105
+ * get the supported architectures.
106
+ *
107
+ * \param [out] numArchs number of supported architectures.
108
+ * \return
109
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
110
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
111
+ *
112
+ * \see ::nvrtcGetSupportedArchs
113
+ */
114
+ nvrtcResult nvrtcGetNumSupportedArchs(int* numArchs);
115
+
116
+
117
+ /**
118
+ * \ingroup query
119
+ * \brief nvrtcGetSupportedArchs populates the array passed via the output parameter
120
+ * \p supportedArchs with the architectures supported by NVRTC. The array is
121
+ * sorted in ascending order. The size of the array to be passed can be
122
+ * determined using ::nvrtcGetNumSupportedArchs.
123
+ *
124
+ * \param [out] supportedArchs sorted array of supported architectures.
125
+ * \return
126
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
127
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
128
+ *
129
+ * \see ::nvrtcGetNumSupportedArchs
130
+ */
131
+ nvrtcResult nvrtcGetSupportedArchs(int* supportedArchs);
132
+
133
+
134
+ /*************************************************************************//**
135
+ *
136
+ * \defgroup compilation Compilation
137
+ *
138
+ * NVRTC defines the following type and functions for actual compilation.
139
+ *
140
+ ****************************************************************************/
141
+
142
+
143
+ /**
144
+ * \ingroup compilation
145
+ * \brief nvrtcProgram is the unit of compilation, and an opaque handle for
146
+ * a program.
147
+ *
148
+ * To compile a CUDA program string, an instance of nvrtcProgram must be
149
+ * created first with ::nvrtcCreateProgram, then compiled with
150
+ * ::nvrtcCompileProgram.
151
+ */
152
+ typedef struct _nvrtcProgram *nvrtcProgram;
153
+
154
+
155
+ /**
156
+ * \ingroup compilation
157
+ * \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the
158
+ * given input parameters, and sets the output parameter \p prog with
159
+ * it.
160
+ *
161
+ * \param [out] prog CUDA Runtime Compilation program.
162
+ * \param [in] src CUDA program source.
163
+ * \param [in] name CUDA program name.\n
164
+ * \p name can be \c NULL; \c "default_program" is
165
+ * used when \p name is \c NULL or "".
166
+ * \param [in] numHeaders Number of headers used.\n
167
+ * \p numHeaders must be greater than or equal to 0.
168
+ * \param [in] headers Sources of the headers.\n
169
+ * \p headers can be \c NULL when \p numHeaders is
170
+ * 0.
171
+ * \param [in] includeNames Name of each header by which they can be
172
+ * included in the CUDA program source.\n
173
+ * \p includeNames can be \c NULL when \p numHeaders
174
+ * is 0. These headers must be included with the exact
175
+ * names specified here.
176
+ * \return
177
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
178
+ * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
179
+ * - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
180
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
181
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
182
+ *
183
+ * \see ::nvrtcDestroyProgram
184
+ */
185
+ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
186
+ const char *src,
187
+ const char *name,
188
+ int numHeaders,
189
+ const char * const *headers,
190
+ const char * const *includeNames);
191
+
192
+
193
+ /**
194
+ * \ingroup compilation
195
+ * \brief nvrtcDestroyProgram destroys the given program.
196
+ *
197
+ * \param [in] prog CUDA Runtime Compilation program.
198
+ * \return
199
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
200
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
201
+ *
202
+ * \see ::nvrtcCreateProgram
203
+ */
204
+ nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
205
+
206
+
207
+ /**
208
+ * \ingroup compilation
209
+ * \brief nvrtcCompileProgram compiles the given program.
210
+ *
211
+ * \param [in] prog CUDA Runtime Compilation program.
212
+ * \param [in] numOptions Number of compiler options passed.
213
+ * \param [in] options Compiler options in the form of C string array.\n
214
+ * \p options can be \c NULL when \p numOptions is 0.
215
+ *
216
+ * \return
217
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
218
+ * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
219
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
220
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
221
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
222
+ * - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
223
+ * - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
224
+ * - \link #nvrtcResult NVRTC_ERROR_TIME_FILE_WRITE_FAILED \endlink
225
+ * - \link #nvrtcResult NVRTC_ERROR_CANCELLED \endlink
226
+ *
227
+ * It supports compile options listed in \ref options.
228
+ */
229
+ nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
230
+ int numOptions, const char * const *options);
231
+
232
+
233
+ /**
234
+ * \ingroup compilation
235
+ * \brief nvrtcGetPTXSize sets the value of \p ptxSizeRet with the size of the PTX
236
+ * generated by the previous compilation of \p prog (including the
237
+ * trailing \c NULL).
238
+ *
239
+ * \param [in] prog CUDA Runtime Compilation program.
240
+ * \param [out] ptxSizeRet Size of the generated PTX (including the trailing
241
+ * \c NULL).
242
+ * \return
243
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
244
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
245
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
246
+ *
247
+ * \see ::nvrtcGetPTX
248
+ */
249
+ nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
250
+
251
+
252
+ /**
253
+ * \ingroup compilation
254
+ * \brief nvrtcGetPTX stores the PTX generated by the previous compilation
255
+ * of \p prog in the memory pointed by \p ptx.
256
+ *
257
+ * \param [in] prog CUDA Runtime Compilation program.
258
+ * \param [out] ptx Compiled result.
259
+ * \return
260
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
261
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
262
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
263
+ *
264
+ * \see ::nvrtcGetPTXSize
265
+ */
266
+ nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
267
+
268
+
269
+ /**
270
+ * \ingroup compilation
271
+ * \brief nvrtcGetCUBINSize sets the value of \p cubinSizeRet with the size of the cubin
272
+ * generated by the previous compilation of \p prog. The value of
273
+ * cubinSizeRet is set to 0 if the value specified to \c -arch is a
274
+ * virtual architecture instead of an actual architecture.
275
+ *
276
+ * \param [in] prog CUDA Runtime Compilation program.
277
+ * \param [out] cubinSizeRet Size of the generated cubin.
278
+ * \return
279
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
280
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
281
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
282
+ *
283
+ * \see ::nvrtcGetCUBIN
284
+ */
285
+ nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t *cubinSizeRet);
286
+
287
+
288
+ /**
289
+ * \ingroup compilation
290
+ * \brief nvrtcGetCUBIN stores the cubin generated by the previous compilation
291
+ * of \p prog in the memory pointed by \p cubin. No cubin is available
292
+ * if the value specified to \c -arch is a virtual architecture instead
293
+ * of an actual architecture.
294
+ *
295
+ * \param [in] prog CUDA Runtime Compilation program.
296
+ * \param [out] cubin Compiled and assembled result.
297
+ * \return
298
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
299
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
300
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
301
+ *
302
+ * \see ::nvrtcGetCUBINSize
303
+ */
304
+ nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
305
+
306
+
307
+ #if defined(_WIN32)
308
+ # define __DEPRECATED__(msg) __declspec(deprecated(msg))
309
+ #elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
310
+ # define __DEPRECATED__(msg) __attribute__((deprecated))
311
+ #elif (defined(__GNUC__))
312
+ # define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
313
+ #else
314
+ # define __DEPRECATED__(msg)
315
+ #endif
316
+
317
+ /**
318
+ * \ingroup compilation
319
+ * \brief
320
+ * DEPRECATION NOTICE: This function will be removed in a future release. Please use
321
+ * nvrtcGetLTOIRSize (and nvrtcGetLTOIR) instead.
322
+ */
323
+ __DEPRECATED__("This function will be removed in a future release. Please use nvrtcGetLTOIRSize instead")
324
+ nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t *nvvmSizeRet);
325
+
326
+ /**
327
+ * \ingroup compilation
328
+ * \brief
329
+ * DEPRECATION NOTICE: This function will be removed in a future release. Please use
330
+ * nvrtcGetLTOIR (and nvrtcGetLTOIRSize) instead.
331
+ */
332
+ __DEPRECATED__("This function will be removed in a future release. Please use nvrtcGetLTOIR instead")
333
+ nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char *nvvm);
334
+
335
+ #undef __DEPRECATED__
336
+
337
+ /**
338
+ * \ingroup compilation
339
+ * \brief nvrtcGetLTOIRSize sets the value of \p LTOIRSizeRet with the size of the LTO IR
340
+ * generated by the previous compilation of \p prog. The value of
341
+ * LTOIRSizeRet is set to 0 if the program was not compiled with
342
+ * \c -dlto.
343
+ *
344
+ * \param [in] prog CUDA Runtime Compilation program.
345
+ * \param [out] LTOIRSizeRet Size of the generated LTO IR.
346
+ * \return
347
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
348
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
349
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
350
+ *
351
+ * \see ::nvrtcGetLTOIR
352
+ */
353
+ nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *LTOIRSizeRet);
354
+
355
+
356
+ /**
357
+ * \ingroup compilation
358
+ * \brief nvrtcGetLTOIR stores the LTO IR generated by the previous compilation
359
+ * of \p prog in the memory pointed by \p LTOIR. No LTO IR is available
360
+ * if the program was compiled without \c -dlto.
361
+ *
362
+ * \param [in] prog CUDA Runtime Compilation program.
363
+ * \param [out] LTOIR Compiled result.
364
+ * \return
365
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
366
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
367
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
368
+ *
369
+ * \see ::nvrtcGetLTOIRSize
370
+ */
371
+ nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *LTOIR);
372
+
373
+
374
+ /**
375
+ * \ingroup compilation
376
+ * \brief nvrtcGetOptiXIRSize sets the value of \p optixirSizeRet with the size of the OptiX IR
377
+ * generated by the previous compilation of \p prog. The value of
378
+ * optixirSizeRet is set to 0 if the program was compiled with
379
+ * options incompatible with OptiX IR generation.
380
+ *
381
+ * \param [in] prog CUDA Runtime Compilation program.
382
+ * \param [out] optixirSizeRet Size of the generated OptiX IR.
383
+ * \return
384
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
385
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
386
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
387
+ *
388
+ * \see ::nvrtcGetOptiXIR
389
+ */
390
+ nvrtcResult nvrtcGetOptiXIRSize(nvrtcProgram prog, size_t *optixirSizeRet);
391
+
392
+
393
+ /**
394
+ * \ingroup compilation
395
+ * \brief nvrtcGetOptiXIR stores the OptiX IR generated by the previous compilation
396
+ * of \p prog in the memory pointed by \p optixir. No OptiX IR is available
397
+ * if the program was compiled with options incompatible with OptiX IR generation.
398
+ *
399
+ * \param [in] prog CUDA Runtime Compilation program.
400
+ * \param [out] optixir OptiX IR compiled result.
401
+ * \return
402
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
403
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
404
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
405
+ *
406
+ * \see ::nvrtcGetOptiXIRSize
407
+ */
408
+ nvrtcResult nvrtcGetOptiXIR(nvrtcProgram prog, char *optixir);
409
+
410
+ /**
411
+ * \ingroup compilation
412
+ * \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
413
+ * log generated by the previous compilation of \p prog (including the
414
+ * trailing \c NULL).
415
+ *
416
+ * Note that compilation log may be generated with warnings and informative
417
+ * messages, even when the compilation of \p prog succeeds.
418
+ *
419
+ * \param [in] prog CUDA Runtime Compilation program.
420
+ * \param [out] logSizeRet Size of the compilation log
421
+ * (including the trailing \c NULL).
422
+ * \return
423
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
424
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
425
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
426
+ *
427
+ * \see ::nvrtcGetProgramLog
428
+ */
429
+ nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
430
+
431
+
432
+ /**
433
+ * \ingroup compilation
434
+ * \brief nvrtcGetProgramLog stores the log generated by the previous
435
+ * compilation of \p prog in the memory pointed by \p log.
436
+ *
437
+ * \param [in] prog CUDA Runtime Compilation program.
438
+ * \param [out] log Compilation log.
439
+ * \return
440
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
441
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
442
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
443
+ *
444
+ * \see ::nvrtcGetProgramLogSize
445
+ */
446
+ nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
447
+
448
+
449
+ /**
450
+ * \ingroup compilation
451
+ * \brief nvrtcAddNameExpression notes the given name expression
452
+ * denoting the address of a __global__ function
453
+ * or __device__/__constant__ variable.
454
+ *
455
+ * The identical name expression string must be provided on a subsequent
456
+ * call to nvrtcGetLoweredName to extract the lowered name.
457
+ * \param [in] prog CUDA Runtime Compilation program.
458
+ * \param [in] name_expression constant expression denoting the address of
459
+ * a __global__ function or __device__/__constant__ variable.
460
+ * \return
461
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
462
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
463
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
464
+ * - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
465
+ *
466
+ * \see ::nvrtcGetLoweredName
467
+ */
468
+ nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
469
+ const char * const name_expression);
470
+
471
+ /**
472
+ * \ingroup compilation
473
+ * \brief nvrtcGetLoweredName extracts the lowered (mangled) name
474
+ * for a __global__ function or __device__/__constant__ variable,
475
+ * and updates *lowered_name to point to it. The memory containing
476
+ * the name is released when the NVRTC program is destroyed by
477
+ * nvrtcDestroyProgram.
478
+ * The identical name expression must have been previously
479
+ * provided to nvrtcAddNameExpression.
480
+ *
481
+ * \param [in] prog CUDA Runtime Compilation program.
482
+ * \param [in] name_expression constant expression denoting the address of
483
+ * a __global__ function or __device__/__constant__ variable.
484
+ * \param [out] lowered_name initialized by the function to point to a
485
+ * C string containing the lowered (mangled)
486
+ * name corresponding to the provided name expression.
487
+ * \return
488
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
489
+ * - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
490
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
491
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
492
+ * - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
493
+ *
494
+ * \see ::nvrtcAddNameExpression
495
+ */
496
+ nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
497
+ const char *const name_expression,
498
+ const char** lowered_name);
499
+
500
+
501
+ /*************************************************************************//**
502
+ *
503
+ * \defgroup precompiled_header Precompiled header (PCH) (CUDA 12.8+)
504
+ *
505
+ * NVRTC defines the following function related to PCH. Also see PCH related
506
+ * flags passed to nvrtcCompileProgram.
507
+ ****************************************************************************/
508
+
509
+
510
+ /**
511
+ * \ingroup precompiled_header
512
+ * \brief retrieve the current size of the PCH Heap.
513
+ *
514
+ * \param [out] ret pointer to location where the size of the PCH Heap
515
+ * will be stored
516
+ * \return
517
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
518
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
519
+ *
520
+ */
521
+ nvrtcResult nvrtcGetPCHHeapSize(size_t* ret);
522
+
523
+ /**
524
+ * \ingroup precompiled_header
525
+ * \brief set the size of the PCH Heap.
526
+ *
527
+ * \param [in] size requested size of the PCH Heap, in bytes
528
+ *
529
+ * \return
530
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
531
+ *
532
+ * The requested size may be rounded up to a platform dependent
533
+ * alignment (e.g. page size). If the PCH Heap has already been allocated,
534
+ * the heap memory will be freed and a new PCH Heap will be allocated.
535
+ */
536
+ nvrtcResult nvrtcSetPCHHeapSize(size_t size);
537
+
538
+ /**
539
+ * \ingroup precompiled_header
540
+ * \brief returns the PCH creation status.
541
+ *
542
+ * \param [in] prog CUDA Runtime Compilation program.
543
+ *
544
+ * \return
545
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
546
+ * - \link #nvrtcResult NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED \endlink
547
+ * - \link #nvrtcResult NVRTC_ERROR_PCH_CREATE \endlink
548
+ * - \link #nvrtcResult NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED \endlink
549
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
550
+ *
551
+ * NVRTC_SUCCESS indicates that the PCH was successfully created.
552
+ * NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED indicates that no PCH creation
553
+ * was attempted, either because PCH functionality was not requested during
554
+ * the preceding nvrtcCompileProgram call, or automatic PCH processing was
555
+ * requested, and the compiler chose not to create a PCH file.
556
+ * NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED indicates that a PCH file could
557
+ * potentially have been created, but the compiler ran out of space in the PCH
558
+ * heap. In this scenario, the nvrtcGetPCHHeapSizeRequired() can be used to
559
+ * query the required heap size, the heap can be reallocated for this size with
560
+ * nvrtcSetPCHHeapSize() and PCH creation may be reattempted by invoking
561
+ * nvrtcCompileProgram() with a new NVRTC program instance.
562
+ * NVRTC_ERROR_PCH_CREATE indicates that an error condition prevented the
563
+ * PCH file from being created.
564
+ */
565
+ nvrtcResult nvrtcGetPCHCreateStatus(nvrtcProgram prog);
566
+
567
+ /**
568
+ * \ingroup precompiled_header
569
+ * \brief retrieve the size of the PCH heap required to compile
570
+ * the given program.
571
+ *
572
+ * \param [in] prog CUDA Runtime Compilation program.
573
+ * \param [out] size pointer to location where the required size of the PCH Heap
574
+ * will be stored
575
+ *
576
+ * \return
577
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
578
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
579
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
580
+ * The size retrieved using this function is only valid if nvrtcGetPCHCreateStatus()
581
+ * returned NVRTC_SUCCESS or NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED
582
+ */
583
+ nvrtcResult nvrtcGetPCHHeapSizeRequired(nvrtcProgram prog, size_t* size);
584
+
585
+ /**
586
+ * \ingroup compilation
587
+ * \brief nvrtcSetFlowCallback registers a callback function that the compiler
588
+ * will invoke at different points during a call to nvrtcCompileProgram,
589
+ * and the callback function can decide whether to cancel compilation by
590
+ * returning specific values.
591
+ *
592
+ * The callback function must satisfy the following constraints:
593
+ *
594
+ * (1) Its signature should be:
595
+ * @code
596
+ * int callback(void* param1, void* param2);
597
+ * @endcode
598
+ * When invoking the callback, the compiler will always pass \p payload to
599
+ * param1 so that the callback may make decisions based on \p payload . It'll
600
+ * always pass NULL to param2 for now which is reserved for future extensions.
601
+ *
602
+ * (2) It must return 1 to cancel compilation or 0 to continue.
603
+ * Other return values are reserved for future use.
604
+ *
605
+ * (3) It must return consistent values. Once it returns 1 at one point, it must
606
+ * return 1 in all following invocations during the current nvrtcCompileProgram
607
+ * call in progress.
608
+ *
609
+ * (4) It must be thread-safe.
610
+ *
611
+ * (5) It must not invoke any nvrtc/libnvvm/ptx APIs.
612
+ *
613
+ * \param [in] prog CUDA Runtime Compilation program.
614
+ * \param [in] callback the callback that issues cancellation signal.
615
+ * \param [in] payload to be passed as a parameter when invoking the callback.
616
+ * \return
617
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
618
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
619
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
620
+ */
621
+ nvrtcResult nvrtcSetFlowCallback(nvrtcProgram prog, int (*callback)(void*, void*), void *payload);
622
+
623
+ /**
624
+ * \defgroup options Supported Compile Options
625
+ *
626
+ * NVRTC supports the compile options below.
627
+ * Option names with two preceding dashes (\c --) are long option names and
628
+ * option names with one preceding dash (\c -) are short option names.
629
+ * Short option names can be used instead of long option names.
630
+ * When a compile option takes an argument, an assignment operator (\c =)
631
+ * is used to separate the compile option argument from the compile option
632
+ * name, e.g., \c "--gpu-architecture=compute_60".
633
+ * Alternatively, the compile option name and the argument can be specified in
634
+ * separate strings without an assignment operator, e.g.,
635
+ * \c "--gpu-architecture" \c "compute_60".
636
+ * Single-character short option names, such as \c -D, \c -U, and \c -I, do
637
+ * not require an assignment operator, and the compile option name and the
638
+ * argument can be present in the same string with or without spaces between
639
+ * them.
640
+ * For instance, \c "-D=<def>", \c "-D<def>", and \c "-D <def>" are all
641
+ * supported.
642
+ *
643
+ * The valid compiler options are:
644
+ *
645
+ * - Compilation targets
646
+ * - \c --gpu-architecture=\<arch\> (\c -arch)
647
+ *
648
+ * Specify the name of the class of GPU architectures for which the
649
+ * input must be compiled.\n
650
+ * - Valid <c>\<arch\></c>s:
651
+ * - \c compute_50
652
+ * - \c compute_52
653
+ * - \c compute_53
654
+ * - \c compute_60
655
+ * - \c compute_61
656
+ * - \c compute_62
657
+ * - \c compute_70
658
+ * - \c compute_72
659
+ * - \c compute_75
660
+ * - \c compute_80
661
+ * - \c compute_87
662
+ * - \c compute_89
663
+ * - \c compute_90
664
+ * - \c compute_90a
665
+ * - \c compute_100
666
+ * - \c compute_100a
667
+ * - \c sm_50
668
+ * - \c sm_52
669
+ * - \c sm_53
670
+ * - \c sm_60
671
+ * - \c sm_61
672
+ * - \c sm_62
673
+ * - \c sm_70
674
+ * - \c sm_72
675
+ * - \c sm_75
676
+ * - \c sm_80
677
+ * - \c sm_87
678
+ * - \c sm_89
679
+ * - \c sm_90
680
+ * - \c sm_90a
681
+ * - \c sm_100
682
+ * - \c sm_100a
683
+ * - Default: \c compute_52
684
+ * - Separate compilation / whole-program compilation
685
+ * - \c --device-c (\c -dc)
686
+ *
687
+ * Generate relocatable code that can be linked with other relocatable
688
+ * device code. It is equivalent to \c --relocatable-device-code=true.
689
+ * - \c --device-w (\c -dw)
690
+ *
691
+ * Generate non-relocatable code. It is equivalent to \c --relocatable-device-code=false.
692
+ * - \c --relocatable-device-code={true|false} (\c -rdc)
693
+ *
694
+ * Enable (disable) the generation of relocatable device code.
695
+ * - Default: \c false
696
+ * - \c --extensible-whole-program (\c -ewp)
697
+ *
698
+ * Do extensible whole program compilation of device code.
699
+ * - Default: \c false
700
+ * - Debugging support
701
+ * - \c --device-debug (\c -G)
702
+ *
703
+ * Generate debug information. If \c --dopt is not specified, then turns off all optimizations.
704
+ * - \c --generate-line-info (\c -lineinfo)
705
+ *
706
+ * Generate line-number information.
707
+ * - Code generation
708
+ * - \c --dopt \c on (\c -dopt)
709
+ *
710
+ * - \c --dopt=on
711
+ *
712
+ * Enable device code optimization. When specified along with \c -G, enables
713
+ * limited debug information generation for optimized device code (currently,
714
+ * only line number information). When \c -G is not specified, \c -dopt=on is implicit.
715
+ *
716
+ * - \c --ptxas-options \<options\> (\c -Xptxas)
717
+ *
718
+ * - \c --ptxas-options=\<options\>
719
+ *
720
+ * Specify options directly to ptxas, the PTX optimizing assembler.
721
+ * - \c --maxrregcount=\<N\> (\c -maxrregcount)
722
+ *
723
+ * Specify the maximum amount of registers that GPU functions can use.
724
+ * Until a function-specific limit, a higher value will generally
725
+ * increase the performance of individual GPU threads that execute this
726
+ * function. However, because thread registers are allocated from a
727
+ * global register pool on each GPU, a higher value of this option will
728
+ * also reduce the maximum thread block size, thereby reducing the amount
729
+ * of thread parallelism. Hence, a good maxrregcount value is the result
730
+ * of a trade-off. If this option is not specified, then no maximum is
731
+ * assumed. Value less than the minimum registers required by ABI will
732
+ * be bumped up by the compiler to ABI minimum limit.
733
+ *
734
+ * - \c --ftz={true|false} (\c -ftz)
735
+ *
736
+ * When performing single-precision floating-point operations, flush
737
+ * denormal values to zero or preserve denormal values.
738
+ *
739
+ * \c --use_fast_math implies \c --ftz=true.
740
+ * - Default: \c false
741
+ *
742
+ * - \c --prec-sqrt={true|false} (\c -prec-sqrt)
743
+ *
744
+ * For single-precision floating-point square root, use IEEE
745
+ * round-to-nearest mode or use a faster approximation.
746
+ * \c --use_fast_math implies \c --prec-sqrt=false.
747
+ * - Default: \c true
748
+ *
749
+ * - \c --prec-div={true|false} (\c -prec-div)
750
+ * For single-precision floating-point division and reciprocals, use IEEE
751
+ * round-to-nearest mode or use a faster approximation.
752
+ * \c --use_fast_math implies \c --prec-div=false.
753
+ * - Default: \c true
754
+ *
755
+ * - \c --fmad={true|false} (\c -fmad)
756
+ *
757
+ * Enables (disables) the contraction of floating-point multiplies and
758
+ * adds/subtracts into floating-point multiply-add operations (FMAD,
759
+ * FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true.
760
+ * - Default: \c true
761
+ *
762
+ * - \c --use_fast_math (\c -use_fast_math)
763
+ *
764
+ * Make use of fast math operations.
765
+ * \c --use_fast_math implies \c --ftz=true \c --prec-div=false
766
+ * \c --prec-sqrt=false \c --fmad=true.
767
+ *
768
+ * - \c --extra-device-vectorization (\c -extra-device-vectorization)
769
+ *
770
+ * Enables more aggressive device code vectorization in the NVVM optimizer.
771
+ *
772
+ * - \c --modify-stack-limit={true|false} (\c -modify-stack-limit)
773
+ *
774
+ * On Linux, during compilation, use \c setrlimit() to increase stack size
775
+ * to maximum allowed. The limit is reset to the previous value at the
776
+ * end of compilation.
777
+ * Note: \c setrlimit() changes the value for the entire process.
778
+ * - Default: \c true
779
+ *
780
+ * - \c --dlink-time-opt (\c -dlto)
781
+ *
782
+ * Generate intermediate code for later link-time optimization.
783
+ * It implies \c -rdc=true.
784
+ * Note: when this option is used the \c nvrtcGetLTOIR API should be used,
785
+ * as PTX or Cubin will not be generated.
786
+ *
787
+ * - \c --gen-opt-lto (\c -gen-opt-lto)
788
+ *
789
+ * Run the optimizer passes before generating the LTO IR.
790
+ *
791
+ * - \c --optix-ir (\c -optix-ir)
792
+ *
793
+ * Generate OptiX IR. The Optix IR is only intended for consumption by OptiX
794
+ * through appropriate APIs. This feature is not supported with
795
+ * link-time-optimization (\c -dlto).
796
+ *
797
+ * Note: when this option is used the \c nvrtcGetOptiXIR API should be used,
798
+ * as PTX or Cubin will not be generated.
799
+ *
800
+ * - \c --jump-table-density=[0-101] (\c -jtd)
801
+ *
802
+ * Specify the case density percentage in switch statements, and use it as
803
+ * a minimal threshold to determine whether a jump table (brx.idx instruction)
804
+ * will be used to implement a switch statement. Default value is 101. The
805
+ * percentage ranges from 0 to 101 inclusively.
806
+ *
807
+ * - \c --device-stack-protector={true|false} (\c -device-stack-protector)
808
+ *
809
+ * Enable (disable) the generation of stack canaries in device code.
810
+ *
811
+ * - Default: \c false
812
+ *
813
+ * - Preprocessing
814
+ * - \c --define-macro=\<def\> (\c -D)
815
+ *
816
+ * \c \<def\> can be either \c \<name\> or \c \<name=definitions\>.
817
+ * - \c \<name\>
818
+ *
819
+ * Predefine \c \<name\> as a macro with definition \c 1.
820
+ * - \c \<name\>=\<definition\>
821
+ *
822
+ * The contents of \c \<definition\> are tokenized and preprocessed
823
+ * as if they appeared during translation phase three in a \c \#define
824
+ * directive. In particular, the definition will be truncated by
825
+ * embedded new line characters.
826
+ *
827
+ * - \c --undefine-macro=\<def\> (\c -U)
828
+ *
829
+ * Cancel any previous definition of \c \<def\>.
830
+ *
831
+ * - \c --include-path=\<dir\> (\c -I)
832
+ *
833
+ * Add the directory \c \<dir\> to the list of directories to be
834
+ * searched for headers. These paths are searched after the list of
835
+ * headers given to ::nvrtcCreateProgram.
836
+ *
837
+ * - \c --pre-include=\<header\> (\c -include)
838
+ *
839
+ * Preinclude \c \<header\> during preprocessing.
840
+ *
841
+ * - \c --no-source-include (\c -no-source-include)
842
+ *
843
+ * The preprocessor by default adds the directory of each input source
844
+ * to the include path. This option disables this feature and only
845
+ * considers the path specified explicitly.
846
+ *
847
+ * - Language Dialect
848
+ * - \c --std={c++03|c++11|c++14|c++17|c++20} (\c -std)
849
+ *
850
+ * Set language dialect to C++03, C++11, C++14, C++17 or C++20
851
+ * - Default: \c c++17
852
+ *
853
+ * - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)
854
+ *
855
+ * Provide builtin definitions of \c std::move and \c std::forward,
856
+ * when C++11 or later language dialect is selected.
857
+ * - Default: \c true
858
+ *
859
+ * - \c --builtin-initializer-list={true|false}
860
+ * (\c -builtin-initializer-list)
861
+ *
862
+ * Provide builtin definitions of \c std::initializer_list class and
863
+ * member functions when C++11 or later language dialect is selected.
864
+ * - Default: \c true
865
+ *
866
+ * - Precompiled header support (CUDA 12.8+)
867
+ * - \c --pch (\c -pch)
868
+ *
869
+ * Enable automatic PCH processing.
870
+ *
871
+ * - \c --create-pch=<file-name> (\c -create-pch)
872
+ *
873
+ * Create a PCH file.
874
+ *
875
+ * - \c --use-pch=<file-name> (\c -use-pch)
876
+ *
877
+ * Use the specified PCH file.
878
+ *
879
+ * - \c --pch-dir=<directory-name> (\c -pch-dir)
880
+ *
881
+ * When using automatic PCH (\c -pch), look for and create PCH files in the
882
+ * specified directory. When using explicit PCH (\c -create-pch or \c -use-pch),
883
+ * the directory name is prefixed before the specified file name, unless
884
+ * the file name is an absolute path name.
885
+ *
886
+ * - \c --pch-verbose={true|false} (\c -pch-verbose)
887
+ *
888
+ * In automatic PCH mode, for each PCH file that could not be used in current
889
+ * compilation, print the reason in the compilation log.
890
+ * - Default: \c true
891
+ *
892
+ * - \c --pch-messages={true|false} (\c -pch-messages)
893
+ *
894
+ * Print a message in the compilation log, if a PCH file was created or used
895
+ * in the current compilation.
896
+ * - Default: \c true
897
+ *
898
+ * - \c --instantiate-templates-in-pch={true|false} (\c -instantiate-templates-in-pch)
899
+ *
900
+ * Enable or disable instantiation of templates before PCH creation. Instantiating
901
+ * templates may increase the size of the PCH file, while reducing the compilation
902
+ * cost when using the PCH file (since some template instantiations can be skipped).
903
+ * - Default: \c true
904
+ *
905
+ * - Misc.
906
+ * - \c --disable-warnings (\c -w)
907
+ *
908
+ * Inhibit all warning messages.
909
+ *
910
+ * - \c --restrict (\c -restrict)
911
+ *
912
+ * Programmer assertion that all kernel pointer parameters are restrict
913
+ * pointers.
914
+ *
915
+ * - \c --device-as-default-execution-space
916
+ * (\c -default-device)
917
+ *
918
+ * Treat entities with no execution space annotation as \c __device__
919
+ * entities.
920
+ *
921
+ * - \c --device-int128 (\c -device-int128)
922
+ *
923
+ * Allow the \c __int128 type in device code. Also causes the macro \c __CUDACC_RTC_INT128__
924
+ * to be defined.
925
+ *
926
+ * - \c --device-float128 (\c -device-float128)
927
+ *
928
+ * Allow the \c __float128 and \c _Float128 types in device code. Also
929
+ * causes the macro \c __CUDACC_RTC_FLOAT128__ to be defined.
930
+ *
931
+ * - \c --optimization-info=\<kind\> (\c -opt-info)
932
+ *
933
+ * Provide optimization reports for the specified kind of optimization.
934
+ * The following kind tags are supported:
935
+ * - \c inline : emit a remark when a function is inlined.
936
+ *
937
+ * - \c --display-error-number (\c -err-no)
938
+ *
939
+ * Display diagnostic number for warning messages. (Default)
940
+ *
941
+ * - \c --no-display-error-number (\c -no-err-no)
942
+ *
943
+ * Disables the display of a diagnostic number for warning messages.
944
+ *
945
+ * - \c --diag-error=<error-number>,... (\c -diag-error)
946
+ *
947
+ * Emit error for specified diagnostic message number(s). Message numbers can be separated by comma.
948
+ *
949
+ * - \c --diag-suppress=<error-number>,... (\c -diag-suppress)
950
+ *
951
+ * Suppress specified diagnostic message number(s). Message numbers can be separated by comma.
952
+ *
953
+ * - \c --diag-warn=<error-number>,... (\c -diag-warn)
954
+ *
955
+ * Emit warning for specified diagnostic message number(s). Message numbers can be separated by comma.
956
+ *
957
+ * - \c --brief-diagnostics={true|false} (\c -brief-diag)
958
+ *
959
+ * This option disables or enables showing source line and column info
960
+ * in a diagnostic.
961
+ * The \c --brief-diagnostics=true will not show the source line and column info.
962
+ * - Default: \c false
963
+ *
964
+ * - \c --time=<file-name> (\c -time)
965
+ *
966
+ * Generate a comma separated value table with the time taken by each compilation
967
+ * phase, and append it at the end of the file given as the option argument.
968
+ * If the file does not exist, the column headings are generated in the first row
969
+ * of the table. If the file name is '-', the timing data is written to the compilation log.
970
+ *
971
+ * - \c --split-compile=<number-of-threads> (\c -split-compile=<number-of-threads>)
972
+ *
973
+ * Perform compiler optimizations in parallel.
974
+ * Split compilation attempts to reduce compile time by enabling the compiler to run certain
975
+ * optimization passes concurrently. This option accepts a numerical value that specifies the
976
+ * maximum number of threads the compiler can use. One can also allow the compiler to use the maximum
977
+ * threads available on the system by setting \c --split-compile=0.
978
+ * Setting \c --split-compile=1 will cause this option to be ignored.
979
+ *
980
+ * - \c --fdevice-syntax-only (\c -fdevice-syntax-only)
981
+ *
982
+ * Ends device compilation after front-end syntax checking. This option does not generate valid
983
+ * device code.
984
+ *
985
+ * - \c --minimal (\c -minimal)
986
+ *
987
+ * Omit certain language features to reduce compile time for small programs.
988
+ * In particular, the following are omitted:
989
+ * - Texture and surface functions and associated types, e.g., \c cudaTextureObject_t.
990
+ * - CUDA Runtime Functions that are provided by the cudadevrt device code library,
991
+ * typically named with prefix "cuda", e.g., \c cudaMalloc.
992
+ * - Kernel launch from device code.
993
+ * - Types and macros associated with CUDA Runtime and Driver APIs,
994
+ * provided by \c cuda/tools/cudart/driver_types.h, typically named with prefix "cuda", e.g., \c cudaError_t.
995
+ *
996
+ * - \c --device-stack-protector (\c -device-stack-protector)
997
+ *
998
+ * Enable stack canaries in device code.
999
+ * Stack canaries make it more difficult to exploit certain types of memory safety bugs involving
1000
+ * stack-local variables. The compiler uses heuristics to assess the risk of such a bug in each function.
1001
+ * Only those functions which are deemed high-risk make use of a stack canary.
1002
+ *
1003
+ * - \c --fdevice-time-trace=<file-name> (\c -fdevice-time-trace=<file-name>)
1004
+ * Enables the time profiler, outputting a JSON file based on given <file-name>. Results can be analyzed on
1005
+ * chrome://tracing for a flamegraph visualization.
1006
+ *
1007
+ */
1008
+
1009
+ #ifdef __cplusplus
1010
+ }
1011
+ #endif /* __cplusplus */
1012
+
1013
+
1014
+ /* The utility function 'nvrtcGetTypeName' is not available by default. Define
1015
+ the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
1016
+ */
1017
+
1018
+ #if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
1019
+
1020
+ #if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
1021
+ #include <cxxabi.h>
1022
+ #include <cstdlib>
1023
+
1024
+ #elif defined(_WIN32)
1025
+ #include <Windows.h>
1026
+ #include <DbgHelp.h>
1027
+ #endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
1028
+
1029
+
1030
+ #include <string>
1031
+ #include <typeinfo>
1032
+
1033
+ template <typename T> struct __nvrtcGetTypeName_helper_t { };
1034
+
1035
+ /*************************************************************************//**
1036
+ *
1037
+ * \defgroup hosthelper Host Helper
1038
+ *
1039
+ * NVRTC defines the following functions for easier interaction with host code.
1040
+ *
1041
+ ****************************************************************************/
1042
+
1043
+ /**
1044
+ * \ingroup hosthelper
1045
+ * \brief nvrtcGetTypeName stores the source level name of a type in the given
1046
+ * std::string location.
1047
+ *
1048
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
1049
+ * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
1050
+ * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
1051
+ * respectively. If the name extraction fails, it will return NVRTC_ERROR_INTERNAL_ERROR,
1052
+ * otherwise *result is initialized with the extracted name.
1053
+ *
1054
+ * Windows-specific notes:
1055
+ * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
1056
+ * which is not multi-thread safe.
1057
+ * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
1058
+ *
1059
+ * \param [in] tinfo: reference to object of type std::type_info for a given type.
1060
+ * \param [out] result: pointer to std::string in which to store the type name.
1061
+ * \return
1062
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
1063
+ * - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
1064
+ *
1065
+ */
1066
+ inline nvrtcResult nvrtcGetTypeName(const std::type_info &tinfo, std::string *result)
1067
+ {
1068
+ #if USE_CXXABI || __clang__ || __GNUC__
1069
+ const char *name = tinfo.name();
1070
+ int status;
1071
+ char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
1072
+ if (status == 0) {
1073
+ *result = undecorated_name;
1074
+ free(undecorated_name);
1075
+ return NVRTC_SUCCESS;
1076
+ }
1077
+ #elif defined(_WIN32)
1078
+ const char *name = tinfo.raw_name();
1079
+ if (!name || *name != '.') {
1080
+ return NVRTC_ERROR_INTERNAL_ERROR;
1081
+ }
1082
+ char undecorated_name[4096];
1083
+ //name+1 skips over the '.' prefix
1084
+ if(UnDecorateSymbolName(name+1, undecorated_name,
1085
+ sizeof(undecorated_name) / sizeof(*undecorated_name),
1086
+ //note: doesn't seem to work correctly without UNDNAME_NO_ARGUMENTS.
1087
+ UNDNAME_NO_ARGUMENTS | UNDNAME_NAME_ONLY ) ) {
1088
+ *result = undecorated_name;
1089
+ return NVRTC_SUCCESS;
1090
+ }
1091
+ #endif /* USE_CXXABI || __clang__ || __GNUC__ */
1092
+
1093
+ return NVRTC_ERROR_INTERNAL_ERROR;
1094
+ }
1095
+
1096
+ /**
1097
+ * \ingroup hosthelper
1098
+ * \brief nvrtcGetTypeName stores the source level name of the template type argument
1099
+ * T in the given std::string location.
1100
+ *
1101
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
1102
+ * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
1103
+ * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
1104
+ * respectively. If the name extraction fails, it will return NVRTC_ERROR_INTERNAL_ERROR,
1105
+ * otherwise *result is initialized with the extracted name.
1106
+ *
1107
+ * Windows-specific notes:
1108
+ * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
1109
+ * which is not multi-thread safe.
1110
+ * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
1111
+ *
1112
+ * \param [out] result: pointer to std::string in which to store the type name.
1113
+ * \return
1114
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
1115
+ * - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
1116
+ *
1117
+ */
1118
+
1119
+ template <typename T>
1120
+ nvrtcResult nvrtcGetTypeName(std::string *result)
1121
+ {
1122
+ nvrtcResult res = nvrtcGetTypeName(typeid(__nvrtcGetTypeName_helper_t<T>),
1123
+ result);
1124
+ if (res != NVRTC_SUCCESS)
1125
+ return res;
1126
+
1127
+ std::string repr = *result;
1128
+ std::size_t idx = repr.find("__nvrtcGetTypeName_helper_t");
1129
+ idx = (idx != std::string::npos) ? repr.find("<", idx) : idx;
1130
+ std::size_t last_idx = repr.find_last_of('>');
1131
+ if (idx == std::string::npos || last_idx == std::string::npos) {
1132
+ return NVRTC_ERROR_INTERNAL_ERROR;
1133
+ }
1134
+ ++idx;
1135
+ *result = repr.substr(idx, last_idx - idx);
1136
+ return NVRTC_SUCCESS;
1137
+ }
1138
+
1139
+ #endif /* NVRTC_GET_TYPE_NAME */
1140
+
1141
+ #endif /* __NVRTC_H__ */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/lib/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (227 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (225 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (233 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/builtin_types.h ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*******************************************************************************
51
+ * *
52
+ * *
53
+ * *
54
+ *******************************************************************************/
55
+
56
+ #include "device_types.h"
57
+ #if !defined(__CUDACC_RTC__)
58
+ #define EXCLUDE_FROM_RTC
59
+ #include "driver_types.h"
60
+ #undef EXCLUDE_FROM_RTC
61
+ #endif /* !__CUDACC_RTC__ */
62
+ #include "surface_types.h"
63
+ #include "texture_types.h"
64
+ #include "vector_types.h"
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h ADDED
@@ -0,0 +1,597 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CHANNEL_DESCRIPTOR_H__)
51
+ #define __CHANNEL_DESCRIPTOR_H__
52
+
53
+ #if defined(__cplusplus)
54
+
55
+ /*******************************************************************************
56
+ * *
57
+ * *
58
+ * *
59
+ *******************************************************************************/
60
+
61
+ #include "cuda_runtime_api.h"
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ /**
70
+ * \addtogroup CUDART_HIGHLEVEL
71
+ *
72
+ * @{
73
+ */
74
+
75
+ /**
76
+ * \brief \hl Returns a channel descriptor using the specified format
77
+ *
78
+ * Returns a channel descriptor with format \p f and number of bits of each
79
+ * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
80
+ * defined as:
81
+ * \code
82
+ struct cudaChannelFormatDesc {
83
+ int x, y, z, w;
84
+ enum cudaChannelFormatKind f;
85
+ };
86
+ * \endcode
87
+ *
88
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
89
+ * ::cudaChannelFormatKindUnsigned, ::cudaChannelFormatKindFloat,
90
+ * ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
91
+ * ::cudaChannelFormatKindSignedNormalized8X4,
92
+ * ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
93
+ * ::cudaChannelFormatKindUnsignedNormalized8X4,
94
+ * ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
95
+ * ::cudaChannelFormatKindSignedNormalized16X4,
96
+ * ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
97
+ * ::cudaChannelFormatKindUnsignedNormalized16X4,
98
+ * ::cudaChannelFormatKindUnsignedNormalized1010102
99
+ * or ::cudaChannelFormatKindNV12.
100
+ *
101
+ * The format is specified by the template specialization.
102
+ *
103
+ * The template function specializes for the following scalar types:
104
+ * char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
105
+ * The template function specializes for the following vector types:
106
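+ * char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}. The long, unsigned long, long{1|2|4}, and ulong{1|2|4} specializations are only provided when \c __LP64__ is not defined.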
107
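+ * The template function specializes for the following cudaChannelFormatKind enum values:
108
+ * ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4},
109
+ * ::cudaChannelFormatKindUnsignedNormalized1010102,
110
+ * ::cudaChannelFormatKindNV12, and the block-compressed kinds ::cudaChannelFormatKindUnsignedBlockCompressed{1|2|3|7}[SRGB] and ::cudaChannelFormatKind{Uns|S}ignedBlockCompressed{4|5|6H}.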
111
+ *
112
+ * Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone.
113
+ *
114
+ * \return
115
+ * Channel descriptor with format \p f
116
+ *
117
+ * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
118
+ * ::cudaGetChannelDesc
119
+ */
120
+ template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
121
+ {
122
+ return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
123
+ }
124
+
125
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
126
+ {
127
+ int e = (int)sizeof(unsigned short) * 8;
128
+
129
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
130
+ }
131
+
132
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
133
+ {
134
+ int e = (int)sizeof(unsigned short) * 8;
135
+
136
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
137
+ }
138
+
139
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
140
+ {
141
+ int e = (int)sizeof(unsigned short) * 8;
142
+
143
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
144
+ }
145
+
146
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
147
+ {
148
+ int e = (int)sizeof(unsigned short) * 8;
149
+
150
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
151
+ }
152
+
153
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
154
+ {
155
+ int e = (int)sizeof(char) * 8;
156
+
157
+ #if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
158
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
159
+ #else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
160
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
161
+ #endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
162
+ }
163
+
164
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
165
+ {
166
+ int e = (int)sizeof(signed char) * 8;
167
+
168
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
169
+ }
170
+
171
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
172
+ {
173
+ int e = (int)sizeof(unsigned char) * 8;
174
+
175
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
176
+ }
177
+
178
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
179
+ {
180
+ int e = (int)sizeof(signed char) * 8;
181
+
182
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
183
+ }
184
+
185
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
186
+ {
187
+ int e = (int)sizeof(unsigned char) * 8;
188
+
189
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
190
+ }
191
+
192
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
193
+ {
194
+ int e = (int)sizeof(signed char) * 8;
195
+
196
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
197
+ }
198
+
199
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
200
+ {
201
+ int e = (int)sizeof(unsigned char) * 8;
202
+
203
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
204
+ }
205
+
206
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
207
+ {
208
+ int e = (int)sizeof(signed char) * 8;
209
+
210
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
211
+ }
212
+
213
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
214
+ {
215
+ int e = (int)sizeof(unsigned char) * 8;
216
+
217
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
218
+ }
219
+
220
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
221
+ {
222
+ int e = (int)sizeof(short) * 8;
223
+
224
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
225
+ }
226
+
227
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
228
+ {
229
+ int e = (int)sizeof(unsigned short) * 8;
230
+
231
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
232
+ }
233
+
234
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
235
+ {
236
+ int e = (int)sizeof(short) * 8;
237
+
238
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
239
+ }
240
+
241
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
242
+ {
243
+ int e = (int)sizeof(unsigned short) * 8;
244
+
245
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
246
+ }
247
+
248
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
249
+ {
250
+ int e = (int)sizeof(short) * 8;
251
+
252
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
253
+ }
254
+
255
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
256
+ {
257
+ int e = (int)sizeof(unsigned short) * 8;
258
+
259
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
260
+ }
261
+
262
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
263
+ {
264
+ int e = (int)sizeof(short) * 8;
265
+
266
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
267
+ }
268
+
269
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
270
+ {
271
+ int e = (int)sizeof(unsigned short) * 8;
272
+
273
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
274
+ }
275
+
276
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
277
+ {
278
+ int e = (int)sizeof(int) * 8;
279
+
280
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
281
+ }
282
+
283
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
284
+ {
285
+ int e = (int)sizeof(unsigned int) * 8;
286
+
287
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
288
+ }
289
+
290
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
291
+ {
292
+ int e = (int)sizeof(int) * 8;
293
+
294
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
295
+ }
296
+
297
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
298
+ {
299
+ int e = (int)sizeof(unsigned int) * 8;
300
+
301
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
302
+ }
303
+
304
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
305
+ {
306
+ int e = (int)sizeof(int) * 8;
307
+
308
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
309
+ }
310
+
311
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
312
+ {
313
+ int e = (int)sizeof(unsigned int) * 8;
314
+
315
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
316
+ }
317
+
318
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
319
+ {
320
+ int e = (int)sizeof(int) * 8;
321
+
322
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
323
+ }
324
+
325
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
326
+ {
327
+ int e = (int)sizeof(unsigned int) * 8;
328
+
329
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
330
+ }
331
+
332
+ #if !defined(__LP64__)
333
+
334
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
335
+ {
336
+ int e = (int)sizeof(long) * 8;
337
+
338
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
339
+ }
340
+
341
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
342
+ {
343
+ int e = (int)sizeof(unsigned long) * 8;
344
+
345
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
346
+ }
347
+
348
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
349
+ {
350
+ int e = (int)sizeof(long) * 8;
351
+
352
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
353
+ }
354
+
355
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
356
+ {
357
+ int e = (int)sizeof(unsigned long) * 8;
358
+
359
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
360
+ }
361
+
362
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
363
+ {
364
+ int e = (int)sizeof(long) * 8;
365
+
366
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
367
+ }
368
+
369
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
370
+ {
371
+ int e = (int)sizeof(unsigned long) * 8;
372
+
373
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
374
+ }
375
+
376
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
377
+ {
378
+ int e = (int)sizeof(long) * 8;
379
+
380
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
381
+ }
382
+
383
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
384
+ {
385
+ int e = (int)sizeof(unsigned long) * 8;
386
+
387
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
388
+ }
389
+
390
+ #endif /* !__LP64__ */
391
+
392
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
393
+ {
394
+ int e = (int)sizeof(float) * 8;
395
+
396
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
397
+ }
398
+
399
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
400
+ {
401
+ int e = (int)sizeof(float) * 8;
402
+
403
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
404
+ }
405
+
406
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
407
+ {
408
+ int e = (int)sizeof(float) * 8;
409
+
410
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
411
+ }
412
+
413
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
414
+ {
415
+ int e = (int)sizeof(float) * 8;
416
+
417
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
418
+ }
419
+
420
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
421
+ {
422
+ int e = (int)sizeof(char) * 8;
423
+
424
+ return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12);
425
+ }
426
+
427
+ template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
428
+ {
429
+ return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
430
+ }
431
+
432
+ /* Signed 8-bit normalized integer formats */
433
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
434
+ {
435
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
436
+ }
437
+
438
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
439
+ {
440
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
441
+ }
442
+
443
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
444
+ {
445
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
446
+ }
447
+
448
+ /* Unsigned 8-bit normalized integer formats */
449
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
450
+ {
451
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
452
+ }
453
+
454
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
455
+ {
456
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
457
+ }
458
+
459
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
460
+ {
461
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
462
+ }
463
+
464
+ /* Signed 16-bit normalized integer formats */
465
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
466
+ {
467
+ return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
468
+ }
469
+
470
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
471
+ {
472
+ return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
473
+ }
474
+
475
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
476
+ {
477
+ return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
478
+ }
479
+
480
+ /* Unsigned 16-bit normalized integer formats */
481
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
482
+ {
483
+ return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
484
+ }
485
+
486
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
487
+ {
488
+ return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
489
+ }
490
+
491
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
492
+ {
493
+ return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
494
+ }
495
+
496
+ /* NV12 format */
497
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
498
+ {
499
+ return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
500
+ }
501
+
502
+ /* Unsigned normalized 10:10:10:2 format */
503
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized1010102>(void)
504
+ {
505
+ return cudaCreateChannelDesc(10, 10, 10, 2, cudaChannelFormatKindUnsignedNormalized1010102);
506
+ }
507
+
508
+ /* BC1 format */
509
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
510
+ {
511
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
512
+ }
513
+
514
+ /* BC1sRGB format */
515
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
516
+ {
517
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
518
+ }
519
+
520
+ /* BC2 format */
521
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
522
+ {
523
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
524
+ }
525
+
526
+ /* BC2sRGB format */
527
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
528
+ {
529
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
530
+ }
531
+
532
+ /* BC3 format */
533
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
534
+ {
535
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
536
+ }
537
+
538
+ /* BC3sRGB format */
539
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
540
+ {
541
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
542
+ }
543
+
544
+ /* BC4 unsigned format */
545
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
546
+ {
547
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
548
+ }
549
+
550
+ /* BC4 signed format */
551
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
552
+ {
553
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
554
+ }
555
+
556
+ /* BC5 unsigned format */
557
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
558
+ {
559
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
560
+ }
561
+
562
+ /* BC5 signed format */
563
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
564
+ {
565
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
566
+ }
567
+
568
+ /* BC6H unsigned format */
569
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
570
+ {
571
+ return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
572
+ }
573
+
574
+ /* BC6H signed format */
575
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
576
+ {
577
+ return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
578
+ }
579
+
580
+ /* BC7 format */
581
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
582
+ {
583
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
584
+ }
585
+
586
+ /* BC7sRGB format */
587
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
588
+ {
589
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
590
+ }
591
+
592
+ #endif /* __cplusplus */
593
+
594
+ /** @} */
595
+ /** @} */ /* END CUDART_TEXTURE_HL */
596
+
597
+ #endif /* !__CHANNEL_DESCRIPTOR_H__ */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/common_functions.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/common_functions.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
65
+ #endif
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h ADDED
@@ -0,0 +1,1743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _COOPERATIVE_GROUPS_H_
51
+ #define _COOPERATIVE_GROUPS_H_
52
+
53
+ #if defined(__cplusplus) && defined(__CUDACC__)
54
+
55
+ #include "cooperative_groups/details/info.h"
56
+ #include "cooperative_groups/details/driver_abi.h"
57
+ #include "cooperative_groups/details/helpers.h"
58
+ #include "cooperative_groups/details/memory.h"
59
+
60
+ #if defined(_CG_HAS_STL_ATOMICS)
61
+ #include <cuda/atomic>
62
+ #define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
63
+ #else
64
+ #define _CG_THREAD_SCOPE(scope)
65
+ #endif
66
+
67
+ _CG_BEGIN_NAMESPACE
68
+
69
+ namespace details {
70
+ _CG_CONST_DECL unsigned int coalesced_group_id = 1;
71
+ _CG_CONST_DECL unsigned int multi_grid_group_id = 2;
72
+ _CG_CONST_DECL unsigned int grid_group_id = 3;
73
+ _CG_CONST_DECL unsigned int thread_block_id = 4;
74
+ _CG_CONST_DECL unsigned int multi_tile_group_id = 5;
75
+ _CG_CONST_DECL unsigned int cluster_group_id = 6;
76
+ }
77
+
78
+ /**
79
+ * class thread_group;
80
+ *
81
+ * Generic thread group type, into which all groups are convertible.
82
+ * It acts as a container for all storage necessary for the derived groups,
83
+ * and will dispatch the API calls to the correct derived group. This means
84
+ * that all derived groups must implement the same interface as thread_group.
85
+ */
86
+ class thread_group
87
+ {
88
+ protected:
89
+ struct group_data {
90
+ unsigned int _unused : 1;
91
+ unsigned int type : 7, : 0;
92
+ };
93
+
94
+ struct gg_data {
95
+ details::grid_workspace *gridWs;
96
+ };
97
+
98
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
99
+ struct mg_data {
100
+ unsigned long long _unused : 1;
101
+ unsigned long long type : 7;
102
+ unsigned long long handle : 56;
103
+ const details::multi_grid::multi_grid_functions *functions;
104
+ };
105
+ #endif
106
+
107
+ struct tg_data {
108
+ unsigned int is_tiled : 1;
109
+ unsigned int type : 7;
110
+ unsigned int size : 24;
111
+ // packed to 4b
112
+ unsigned int metaGroupSize : 16;
113
+ unsigned int metaGroupRank : 16;
114
+ // packed to 8b
115
+ unsigned int mask;
116
+ // packed to 12b
117
+ unsigned int _res;
118
+ };
119
+
120
+ friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
121
+ friend class thread_block;
122
+
123
+ union __align__(8) {
124
+ group_data group;
125
+ tg_data coalesced;
126
+ gg_data grid;
127
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
128
+ mg_data multi_grid;
129
+ #endif
130
+ } _data;
131
+
132
+ _CG_QUALIFIER thread_group operator=(const thread_group& src);
133
+
134
+ _CG_QUALIFIER thread_group(unsigned int type) {
135
+ _data.group.type = type;
136
+ _data.group._unused = false;
137
+ }
138
+
139
+ #ifdef _CG_CPP11_FEATURES
140
+ static_assert(sizeof(tg_data) <= 16, "Failed size check");
141
+ static_assert(sizeof(gg_data) <= 16, "Failed size check");
142
+ # ifdef _CG_ABI_EXPERIMENTAL
143
+ static_assert(sizeof(mg_data) <= 16, "Failed size check");
144
+ # endif
145
+ #endif
146
+
147
+ public:
148
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
149
+
150
+ _CG_QUALIFIER unsigned long long size() const;
151
+ _CG_QUALIFIER unsigned long long num_threads() const;
152
+ _CG_QUALIFIER unsigned long long thread_rank() const;
153
+ _CG_QUALIFIER void sync() const;
154
+ _CG_QUALIFIER unsigned int get_type() const {
155
+ return _data.group.type;
156
+ }
157
+
158
+ };
159
+
160
+ template <unsigned int TyId>
161
+ struct thread_group_base : public thread_group {
162
+ _CG_QUALIFIER thread_group_base() : thread_group(TyId) {}
163
+ _CG_STATIC_CONST_DECL unsigned int id = TyId;
164
+ };
165
+
166
+ #if defined(_CG_HAS_MULTI_GRID_GROUP)
167
+
168
+ /**
169
+ * class multi_grid_group;
170
+ *
171
+ * Threads within this group are guaranteed to be co-resident on the
172
+ * same system, on multiple devices within the same launched kernels.
173
+ * To use this group, the kernel must have been launched with
174
+ * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
175
+ * and the device must support it (queryable device attribute).
176
+ *
177
+ * Constructed via this_multi_grid();
178
+ */
179
+
180
+
181
+ # if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
182
+ class multi_grid_group;
183
+
184
+ // Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
185
+ template <typename = void>
186
+ __device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
187
+
188
+ class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
189
+ {
190
+ private:
191
+ template <typename = void>
192
+ _CG_QUALIFIER multi_grid_group() {
193
+ _data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
194
+ _data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
195
+ }
196
+
197
+ friend multi_grid_group this_multi_grid<void>();
198
+
199
+ public:
200
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
201
+
202
+ _CG_QUALIFIER bool is_valid() const {
203
+ return (_data.multi_grid.handle != 0);
204
+ }
205
+
206
+ _CG_QUALIFIER void sync() const {
207
+ if (!is_valid()) {
208
+ _CG_ABORT();
209
+ }
210
+ _data.multi_grid.functions->sync(_data.multi_grid.handle);
211
+ }
212
+
213
+ _CG_QUALIFIER unsigned long long num_threads() const {
214
+ _CG_ASSERT(is_valid());
215
+ return _data.multi_grid.functions->size(_data.multi_grid.handle);
216
+ }
217
+
218
+ _CG_QUALIFIER unsigned long long size() const {
219
+ return num_threads();
220
+ }
221
+
222
+ _CG_QUALIFIER unsigned long long thread_rank() const {
223
+ _CG_ASSERT(is_valid());
224
+ return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
225
+ }
226
+
227
+ _CG_QUALIFIER unsigned int grid_rank() const {
228
+ _CG_ASSERT(is_valid());
229
+ return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
230
+ }
231
+
232
+ _CG_QUALIFIER unsigned int num_grids() const {
233
+ _CG_ASSERT(is_valid());
234
+ return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
235
+ }
236
+ };
237
+ # else
238
+ class multi_grid_group
239
+ {
240
+ private:
241
+ unsigned long long _handle;
242
+ unsigned int _size;
243
+ unsigned int _rank;
244
+
245
+ friend _CG_QUALIFIER multi_grid_group this_multi_grid();
246
+
247
+ _CG_QUALIFIER multi_grid_group() {
248
+ _handle = details::multi_grid::get_intrinsic_handle();
249
+ _size = details::multi_grid::size(_handle);
250
+ _rank = details::multi_grid::thread_rank(_handle);
251
+ }
252
+
253
+ public:
254
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
255
+
256
+ _CG_QUALIFIER _CG_DEPRECATED bool is_valid() const {
257
+ return (_handle != 0);
258
+ }
259
+
260
+ _CG_QUALIFIER _CG_DEPRECATED void sync() const {
261
+ if (!is_valid()) {
262
+ _CG_ABORT();
263
+ }
264
+ details::multi_grid::sync(_handle);
265
+ }
266
+
267
+ _CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const {
268
+ _CG_ASSERT(is_valid());
269
+ return _size;
270
+ }
271
+
272
+ _CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const {
273
+ return num_threads();
274
+ }
275
+
276
+ _CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const {
277
+ _CG_ASSERT(is_valid());
278
+ return _rank;
279
+ }
280
+
281
+ _CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const {
282
+ _CG_ASSERT(is_valid());
283
+ return (details::multi_grid::grid_rank(_handle));
284
+ }
285
+
286
+ _CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const {
287
+ _CG_ASSERT(is_valid());
288
+ return (details::multi_grid::num_grids(_handle));
289
+ }
290
+ };
291
+ # endif
292
+
293
+ /**
294
+ * multi_grid_group this_multi_grid()
295
+ *
296
+ * Constructs a multi_grid_group
297
+ */
298
+ # if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
299
+ template <typename>
300
+ __device__
301
+ #else
302
+ _CG_QUALIFIER
303
+ # endif
304
+ _CG_DEPRECATED
305
+ multi_grid_group this_multi_grid()
306
+ {
307
+ return multi_grid_group();
308
+ }
309
+ #endif
310
+
311
+ /**
312
+ * class grid_group;
313
+ *
314
+ * Threads within this group are guaranteed to be co-resident on the
315
+ * same device within the same launched kernel. To use this group, the kernel
316
+ * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
317
+ * and the device must support it (queryable device attribute).
318
+ *
319
+ * Constructed via this_grid();
320
+ */
321
+ class grid_group : public thread_group_base<details::grid_group_id>
322
+ {
323
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
324
+ friend _CG_QUALIFIER grid_group this_grid();
325
+
326
+ private:
327
+ _CG_QUALIFIER grid_group(details::grid_workspace *gridWs) {
328
+ _data.grid.gridWs = gridWs;
329
+ }
330
+
331
+ public:
332
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
333
+
334
+ _CG_QUALIFIER bool is_valid() const {
335
+ return (_data.grid.gridWs != NULL);
336
+ }
337
+
338
+ _CG_QUALIFIER void sync() const {
339
+ if (!is_valid()) {
340
+ _CG_ABORT();
341
+ }
342
+ details::grid::sync(&_data.grid.gridWs->barrier);
343
+ }
344
+
345
+ #if defined(_CG_CPP11_FEATURES)
346
+ using arrival_token = unsigned int;
347
+
348
+ _CG_QUALIFIER arrival_token barrier_arrive() const {
349
+ if (!is_valid()) {
350
+ _CG_ABORT();
351
+ }
352
+ return details::grid::barrier_arrive(&_data.grid.gridWs->barrier);
353
+ }
354
+
355
+ _CG_QUALIFIER void barrier_wait(arrival_token&& token) const {
356
+ details::grid::barrier_wait(token, &_data.grid.gridWs->barrier);
357
+ }
358
+ #endif
359
+
360
+ _CG_STATIC_QUALIFIER unsigned long long size() {
361
+ return details::grid::size();
362
+ }
363
+
364
+ _CG_STATIC_QUALIFIER dim3 group_dim() {
365
+ return details::grid::grid_dim();
366
+ }
367
+
368
+ _CG_STATIC_QUALIFIER dim3 dim_threads() {
369
+ return details::grid::dim_threads();
370
+ }
371
+
372
+ _CG_STATIC_QUALIFIER unsigned long long num_threads() {
373
+ return details::grid::num_threads();
374
+ }
375
+
376
+ _CG_STATIC_QUALIFIER dim3 thread_index() {
377
+ return details::grid::thread_index();
378
+ }
379
+
380
+ _CG_STATIC_QUALIFIER unsigned long long thread_rank() {
381
+ return details::grid::thread_rank();
382
+ }
383
+
384
+ _CG_STATIC_QUALIFIER dim3 dim_blocks() {
385
+ return details::grid::dim_blocks();
386
+ }
387
+
388
+ _CG_STATIC_QUALIFIER unsigned long long num_blocks() {
389
+ return details::grid::num_blocks();
390
+ }
391
+
392
+ _CG_STATIC_QUALIFIER dim3 block_index() {
393
+ return details::grid::block_index();
394
+ }
395
+
396
+ _CG_STATIC_QUALIFIER unsigned long long block_rank() {
397
+ return details::grid::block_rank();
398
+ }
399
+
400
+ # if defined(_CG_HAS_CLUSTER_GROUP)
401
+ _CG_STATIC_QUALIFIER dim3 dim_clusters() {
402
+ return details::grid::dim_clusters();
403
+ }
404
+
405
+ _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
406
+ return details::grid::num_clusters();
407
+ }
408
+
409
+ _CG_STATIC_QUALIFIER dim3 cluster_index() {
410
+ return details::grid::cluster_index();
411
+ }
412
+
413
+ _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
414
+ return details::grid::cluster_rank();
415
+ }
416
+ # endif
417
+ };
418
+
419
+ _CG_QUALIFIER grid_group this_grid() {
420
+ // Load a workspace from the driver
421
+ grid_group gg(details::get_grid_workspace());
422
+ #ifdef _CG_DEBUG
423
+ // *all* threads must be available to synchronize
424
+ gg.sync();
425
+ #endif // _CG_DEBUG
426
+ return gg;
427
+ }
428
+
429
+ #if defined(_CG_HAS_CLUSTER_GROUP)
430
+ /**
431
+ * class cluster_group
432
+ *
433
+ * Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
434
+ * divided along all dimensions to form groups of blocks, each group of which is
435
+ * a block cluster. Clustered grids are subject to various restrictions and
436
+ * limitations. Primarily, a cluster consists of at most 8 blocks by default
437
+ * (although the user is allowed to opt in to non-standard sizes), and clustered
438
+ * grids are subject to additional occupancy limitations due to per-cluster
439
+ * hardware resource consumption. In exchange, a block cluster is guaranteed to
440
+ * be a cooperative group, with access to all cooperative group capabilities, as
441
+ * well as cluster specific capabilities and accelerations. A cluster_group
442
+ * represents a block cluster.
443
+ *
444
+ * Constructed via this_cluster();
445
+ */
446
+ class cluster_group : public thread_group_base<details::cluster_group_id>
447
+ {
448
+ // Friends
449
+ friend _CG_QUALIFIER cluster_group this_cluster();
450
+
451
+ // Disable constructor
452
+ _CG_QUALIFIER cluster_group()
453
+ {
454
+ }
455
+
456
+ public:
457
+ //_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
458
+
459
+ using arrival_token = struct {};
460
+
461
+ // Functionality exposed by the group
462
+ _CG_STATIC_QUALIFIER void sync()
463
+ {
464
+ return details::cluster::sync();
465
+ }
466
+
467
+ _CG_STATIC_QUALIFIER arrival_token barrier_arrive()
468
+ {
469
+ details::cluster::barrier_arrive();
470
+ return arrival_token();
471
+ }
472
+
473
+ _CG_STATIC_QUALIFIER void barrier_wait()
474
+ {
475
+ return details::cluster::barrier_wait();
476
+ }
477
+
478
+ _CG_STATIC_QUALIFIER void barrier_wait(arrival_token&&)
479
+ {
480
+ return details::cluster::barrier_wait();
481
+ }
482
+
483
+ _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
484
+ {
485
+ return details::cluster::query_shared_rank(addr);
486
+ }
487
+
488
+ template <typename T>
489
+ _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
490
+ {
491
+ return details::cluster::map_shared_rank(addr, rank);
492
+ }
493
+
494
+ _CG_STATIC_QUALIFIER dim3 block_index()
495
+ {
496
+ return details::cluster::block_index();
497
+ }
498
+
499
+ _CG_STATIC_QUALIFIER unsigned int block_rank()
500
+ {
501
+ return details::cluster::block_rank();
502
+ }
503
+
504
+ _CG_STATIC_QUALIFIER dim3 thread_index()
505
+ {
506
+ return details::cluster::thread_index();
507
+ }
508
+
509
+ _CG_STATIC_QUALIFIER unsigned int thread_rank()
510
+ {
511
+ return details::cluster::thread_rank();
512
+ }
513
+
514
+ _CG_STATIC_QUALIFIER dim3 dim_blocks()
515
+ {
516
+ return details::cluster::dim_blocks();
517
+ }
518
+
519
+ _CG_STATIC_QUALIFIER unsigned int num_blocks()
520
+ {
521
+ return details::cluster::num_blocks();
522
+ }
523
+
524
+ _CG_STATIC_QUALIFIER dim3 dim_threads()
525
+ {
526
+ return details::cluster::dim_threads();
527
+ }
528
+
529
+ _CG_STATIC_QUALIFIER unsigned int num_threads()
530
+ {
531
+ return details::cluster::num_threads();
532
+ }
533
+
534
+ // Legacy aliases
535
+ _CG_STATIC_QUALIFIER unsigned int size()
536
+ {
537
+ return num_threads();
538
+ }
539
+ };
540
+
541
+ /*
542
+ * cluster_group this_cluster()
543
+ *
544
+ * Constructs a cluster_group
545
+ */
546
+ _CG_QUALIFIER cluster_group this_cluster()
547
+ {
548
+ cluster_group cg;
549
+ #ifdef _CG_DEBUG
550
+ cg.sync();
551
+ #endif
552
+ return cg;
553
+ }
554
+ #endif
555
+
556
+ #if defined(_CG_CPP11_FEATURES)
557
+ class thread_block;
558
+ template <unsigned int MaxBlockSize>
559
+ _CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
560
+ #endif
561
+
562
+ /**
563
+ * class thread_block
564
+ *
565
+ * Every GPU kernel is executed by a grid of thread blocks, and threads within
566
+ * each block are guaranteed to reside on the same streaming multiprocessor.
567
+ * A thread_block represents a thread block whose dimensions are not known until runtime.
568
+ *
569
+ * Constructed via this_thread_block();
570
+ */
571
+ class thread_block : public thread_group_base<details::thread_block_id>
572
+ {
573
+ // Friends
574
+ friend _CG_QUALIFIER thread_block this_thread_block();
575
+ friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
576
+ friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
577
+
578
+ #if defined(_CG_CPP11_FEATURES)
579
+ template <unsigned int MaxBlockSize>
580
+ friend _CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
581
+ template <unsigned int Size>
582
+ friend class __static_size_multi_warp_tile_base;
583
+
584
+ details::multi_warp_scratch* const tile_memory;
585
+
586
+ template <unsigned int MaxBlockSize>
587
+ _CG_QUALIFIER thread_block(block_tile_memory<MaxBlockSize>& scratch) :
588
+ tile_memory(details::get_scratch_ptr(&scratch)) {
589
+ #ifdef _CG_DEBUG
590
+ if (num_threads() > MaxBlockSize) {
591
+ details::abort();
592
+ }
593
+ #endif
594
+
595
+
596
+ #if defined(_CG_USER_PROVIDED_SHARED_MEMORY)
597
+ #define _CG_SKIP_BARRIER_INIT_TARGET NV_NO_TARGET
598
+ #else
599
+ #define _CG_SKIP_BARRIER_INIT_TARGET NV_PROVIDES_SM_80
600
+ #endif
601
+ NV_IF_ELSE_TARGET(
602
+ _CG_SKIP_BARRIER_INIT_TARGET,
603
+ // skip if clause
604
+ ,
605
+ (tile_memory->init_barriers(thread_rank());
606
+ sync();)
607
+ )
608
+ }
609
+ #endif
610
+ #undef _CG_SKIP_BARRIER_INIT_TARGET
611
+
612
+ // Disable constructor
613
+ _CG_QUALIFIER thread_block()
614
+ #if defined(_CG_CPP11_FEATURES)
615
+ : tile_memory(details::get_scratch_ptr(NULL))
616
+ #endif
617
+ { }
618
+
619
+ // Internal Use
620
+ _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
621
+ const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
622
+
623
+ // Invalid, immediately fail
624
+ if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
625
+ details::abort();
626
+ return (thread_block());
627
+ }
628
+
629
+ unsigned int mask;
630
+ unsigned int base_offset = thread_rank() & (~(tilesz - 1));
631
+ unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
632
+
633
+ mask = (unsigned int)(-1) >> (32 - masklength);
634
+ mask <<= (details::laneid() & ~(tilesz - 1));
635
+ thread_group tile = thread_group(details::coalesced_group_id);
636
+ tile._data.coalesced.mask = mask;
637
+ tile._data.coalesced.size = __popc(mask);
638
+ tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz;
639
+ tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
640
+ tile._data.coalesced.is_tiled = true;
641
+ return (tile);
642
+ }
643
+
644
+ public:
645
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
646
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
647
+
648
+ _CG_STATIC_QUALIFIER void sync() {
649
+ details::cta::sync();
650
+ }
651
+
652
+ #if defined(_CG_CPP11_FEATURES)
653
+ struct arrival_token {};
654
+
655
+ _CG_QUALIFIER arrival_token barrier_arrive() const {
656
+ return arrival_token();
657
+ }
658
+
659
+ _CG_QUALIFIER void barrier_wait(arrival_token&&) const {
660
+ details::cta::sync();
661
+ }
662
+ #endif
663
+
664
+ _CG_STATIC_QUALIFIER unsigned int size() {
665
+ return details::cta::size();
666
+ }
667
+
668
+ _CG_STATIC_QUALIFIER unsigned int thread_rank() {
669
+ return details::cta::thread_rank();
670
+ }
671
+
672
+ // Additional functionality exposed by the group
673
+ _CG_STATIC_QUALIFIER dim3 group_index() {
674
+ return details::cta::group_index();
675
+ }
676
+
677
+ _CG_STATIC_QUALIFIER dim3 thread_index() {
678
+ return details::cta::thread_index();
679
+ }
680
+
681
+ _CG_STATIC_QUALIFIER dim3 group_dim() {
682
+ return details::cta::block_dim();
683
+ }
684
+
685
+ _CG_STATIC_QUALIFIER dim3 dim_threads() {
686
+ return details::cta::dim_threads();
687
+ }
688
+
689
+ _CG_STATIC_QUALIFIER unsigned int num_threads() {
690
+ return details::cta::num_threads();
691
+ }
692
+
693
+ };
694
+
695
+ /**
696
+ * thread_block this_thread_block()
697
+ *
698
+ * Constructs a thread_block group
699
+ */
700
+ _CG_QUALIFIER thread_block this_thread_block()
701
+ {
702
+ return (thread_block());
703
+ }
704
+
705
+ #if defined(_CG_CPP11_FEATURES)
706
+ template <unsigned int MaxBlockSize>
707
+ _CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch) {
708
+ return (thread_block(scratch));
709
+ }
710
+ #endif
711
+
712
+ /**
713
+ * class coalesced_group
714
+ *
715
+ * A group representing the current set of converged threads in a warp.
716
+ * The size of the group is not guaranteed and it may return a group of
717
+ * only one thread (itself).
718
+ *
719
+ * This group exposes warp-synchronous builtins.
720
+ * Constructed via coalesced_threads();
721
+ */
722
+ class coalesced_group : public thread_group_base<details::coalesced_group_id>
723
+ {
724
+ private:
725
+ friend _CG_QUALIFIER coalesced_group coalesced_threads();
726
+ friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
727
+ friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
728
+ friend class details::_coalesced_group_data_access;
729
+
730
+ _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
731
+ unsigned int member_pack = 0;
732
+ unsigned int member_rank = 0;
733
+ for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
734
+ unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
735
+ if (lane_bit) {
736
+ if (laneMask & lane_bit)
737
+ member_pack |= 1 << member_rank;
738
+ member_rank++;
739
+ }
740
+ }
741
+ return (member_pack);
742
+ }
743
+
744
+ // Internal Use
745
+ _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
746
+ const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
747
+
748
+ // Invalid, immediately fail
749
+ if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
750
+ details::abort();
751
+ return (coalesced_group(0));
752
+ }
753
+ if (size() <= tilesz) {
754
+ return (*this);
755
+ }
756
+
757
+ if ((_data.coalesced.is_tiled == true) && pow2_tilesz) {
758
+ unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
759
+ unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
760
+ unsigned int mask = (unsigned int)(-1) >> (32 - masklength);
761
+
762
+ mask <<= (details::laneid() & ~(tilesz - 1));
763
+ coalesced_group coalesced_tile = coalesced_group(mask);
764
+ coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz;
765
+ coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
766
+ coalesced_tile._data.coalesced.is_tiled = true;
767
+ return (coalesced_tile);
768
+ }
769
+ else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) {
770
+ unsigned int mask = 0;
771
+ unsigned int member_rank = 0;
772
+ int seen_lanes = (thread_rank() / tilesz) * tilesz;
773
+ for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
774
+ unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
775
+ if (lane_bit) {
776
+ if (seen_lanes <= 0 && member_rank < tilesz) {
777
+ mask |= lane_bit;
778
+ member_rank++;
779
+ }
780
+ seen_lanes--;
781
+ }
782
+ }
783
+ coalesced_group coalesced_tile = coalesced_group(mask);
784
+ // Override parent with the size of this group
785
+ coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz;
786
+ coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
787
+ return coalesced_tile;
788
+ }
789
+ else {
790
+ // None in _CG_VERSION 1000
791
+ details::abort();
792
+ }
793
+
794
+ return (coalesced_group(0));
795
+ }
796
+
797
+ protected:
798
+ _CG_QUALIFIER coalesced_group(unsigned int mask) {
799
+ _data.coalesced.mask = mask;
800
+ _data.coalesced.size = __popc(mask);
801
+ _data.coalesced.metaGroupRank = 0;
802
+ _data.coalesced.metaGroupSize = 1;
803
+ _data.coalesced.is_tiled = false;
804
+ }
805
+
806
+ _CG_QUALIFIER unsigned int get_mask() const {
807
+ return (_data.coalesced.mask);
808
+ }
809
+
810
+ public:
811
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
812
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
813
+
814
+ _CG_QUALIFIER unsigned int num_threads() const {
815
+ return _data.coalesced.size;
816
+ }
817
+
818
+ _CG_QUALIFIER unsigned int size() const {
819
+ return num_threads();
820
+ }
821
+
822
+ _CG_QUALIFIER unsigned int thread_rank() const {
823
+ return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
824
+ }
825
+
826
+ // Rank of this group in the upper level of the hierarchy
827
+ _CG_QUALIFIER unsigned int meta_group_rank() const {
828
+ return _data.coalesced.metaGroupRank;
829
+ }
830
+
831
+ // Total num partitions created out of all CTAs when the group was created
832
+ _CG_QUALIFIER unsigned int meta_group_size() const {
833
+ return _data.coalesced.metaGroupSize;
834
+ }
835
+
836
+ _CG_QUALIFIER void sync() const {
837
+ __syncwarp(_data.coalesced.mask);
838
+ }
839
+
840
+ #ifdef _CG_CPP11_FEATURES
841
+ template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
842
+ _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
843
+ unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 :
844
+ (size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1));
845
+
846
+ return details::tile::shuffle_dispatch<TyElem>::shfl(
847
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
848
+ }
849
+
850
+ template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
851
+ _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
852
+ if (size() == 32) {
853
+ return details::tile::shuffle_dispatch<TyElem>::shfl_down(
854
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
855
+ }
856
+
857
+ unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
858
+
859
+ if (lane >= 32)
860
+ lane = details::laneid();
861
+
862
+ return details::tile::shuffle_dispatch<TyElem>::shfl(
863
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
864
+ }
865
+
866
+ template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
867
+ _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
868
+ if (size() == 32) {
869
+ return details::tile::shuffle_dispatch<TyElem>::shfl_up(
870
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
871
+ }
872
+
873
+ unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
874
+ if (lane >= 32)
875
+ lane = details::laneid();
876
+
877
+ return details::tile::shuffle_dispatch<TyElem>::shfl(
878
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
879
+ }
880
+ #else
881
+ template <typename TyIntegral>
882
+ _CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {
883
+ details::assert_if_not_arithmetic<TyIntegral>();
884
+ unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
885
+ (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
886
+ return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
887
+ }
888
+
889
+ template <typename TyIntegral>
890
+ _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
891
+ details::assert_if_not_arithmetic<TyIntegral>();
892
+ if (size() == 32) {
893
+ return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
894
+ }
895
+ unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
896
+ if (lane >= 32) lane = details::laneid();
897
+ return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
898
+ }
899
+
900
+ template <typename TyIntegral>
901
+ _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
902
+ details::assert_if_not_arithmetic<TyIntegral>();
903
+ if (size() == 32) {
904
+ return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
905
+ }
906
+ unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
907
+ if (lane >= 32) lane = details::laneid();
908
+ return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
909
+ }
910
+ #endif
911
+
912
+ _CG_QUALIFIER int any(int predicate) const {
913
+ return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
914
+ }
915
+ _CG_QUALIFIER int all(int predicate) const {
916
+ return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
917
+ }
918
+ _CG_QUALIFIER unsigned int ballot(int predicate) const { // Returns the group's ballot for `predicate`.
919
+ if (size() == 32) { // 32 members: the raw full-mask ballot is returned unchanged
920
+ return (__ballot_sync(0xFFFFFFFF, predicate));
921
+ }
922
+ unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
923
+ return (_packLanes(lane_ballot)); // NOTE(review): _packLanes is defined outside this view; presumably it compacts lane bits into rank order - confirm
924
+ }
925
+
926
+ #ifdef _CG_HAS_MATCH_COLLECTIVE
927
+
928
+ template <typename TyIntegral> // Returns the result of __match_any_sync on `val` over the group's members.
929
+ _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
930
+ details::assert_if_not_arithmetic<TyIntegral>();
931
+ if (size() == 32) { // 32 members: the raw full-mask result is returned unchanged
932
+ return (__match_any_sync(0xFFFFFFFF, val));
933
+ }
934
+ unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
935
+ return (_packLanes(lane_match)); // NOTE(review): _packLanes is defined outside this view; presumably it compacts lane bits into rank order - confirm
936
+ }
937
+
938
+ template <typename TyIntegral> // Returns the result of __match_all_sync on `val`; `pred` receives the intrinsic's predicate output.
939
+ _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
940
+ details::assert_if_not_arithmetic<TyIntegral>();
941
+ if (size() == 32) { // 32 members: the raw full-mask result is returned unchanged
942
+ return (__match_all_sync(0xFFFFFFFF, val, &pred));
943
+ }
944
+ unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
945
+ return (_packLanes(lane_match)); // NOTE(review): _packLanes is defined outside this view; presumably it compacts lane bits into rank order - confirm
946
+ }
947
+
948
+ #endif /* _CG_HAS_MATCH_COLLECTIVE */
949
+
950
+ };
951
+
952
+ _CG_QUALIFIER coalesced_group coalesced_threads() // Builds a coalesced_group from the mask returned by __activemask() at the call site.
953
+ {
954
+ return (coalesced_group(__activemask()));
955
+ }
956
+
957
+ namespace details { // Compile-time checks on static tile sizes.
958
+ template <unsigned int Size> struct verify_thread_block_tile_size; // primary template is declared only; just the sizes below expose `OK`
959
+ template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
960
+ template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
961
+ template <> struct verify_thread_block_tile_size<8> { typedef void OK; };
962
+ template <> struct verify_thread_block_tile_size<4> { typedef void OK; };
963
+ template <> struct verify_thread_block_tile_size<2> { typedef void OK; };
964
+ template <> struct verify_thread_block_tile_size<1> { typedef void OK; };
965
+
966
+ #ifdef _CG_CPP11_FEATURES
967
+ template <unsigned int Size>
968
+ using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>; // true when at most one bit of Size is set (also true for Size == 0)
969
+
970
+ template <unsigned int Size>
971
+ using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>;
972
+ template <unsigned int Size>
973
+ using _is_multi_warp =
974
+ _CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;
975
+
976
+ template <unsigned int Size>
977
+ using _is_valid_single_warp_tile =
978
+ _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;
979
+ template <unsigned int Size>
980
+ using _is_valid_multi_warp_tile =
981
+ _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
982
+ #else
983
+ template <unsigned int Size>
984
+ struct _is_multi_warp { // without _CG_CPP11_FEATURES no size is ever classified as multi-warp
985
+ static const bool value = false;
986
+ };
987
+ #endif
988
+ }
989
+
990
+ template <unsigned int Size> // Base for statically sized tiles: exposes the tile size and the caller's rank inside its tile.
991
+ class __static_size_tile_base
992
+ {
993
+ protected:
994
+ _CG_STATIC_CONST_DECL unsigned int numThreads = Size;
995
+
996
+ public:
997
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
998
+
999
+ // Rank of thread within tile: the block-level rank masked with (numThreads - 1)
1000
+ _CG_STATIC_QUALIFIER unsigned int thread_rank() {
1001
+ return (details::cta::thread_rank() & (numThreads - 1));
1002
+ }
1003
+
1004
+ // Number of threads within tile
1005
+ _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
1006
+ return numThreads;
1007
+ }
1008
+
1009
+ _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() { // same value as num_threads()
1010
+ return num_threads();
1011
+ }
1012
+ };
1013
+
1014
+ template <unsigned int Size> // Single-warp static tile: wraps the *_sync warp intrinsics with a mask computed by build_mask().
1015
+ class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
1016
+ {
1017
+ friend class details::_coalesced_group_data_access;
1018
+ typedef details::tile::tile_helpers<Size> th; // NOTE(review): supplies tileMask / laneMask; defined outside this view
1019
+
1020
+ #ifdef _CG_CPP11_FEATURES
1021
+ static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
1022
+ #else
1023
+ typedef typename details::verify_thread_block_tile_size<Size>::OK valid; // fails to compile for sizes without an `OK` typedef
1024
+ #endif
1025
+ using __static_size_tile_base<Size>::numThreads;
1026
+ _CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;
1027
+
1028
+ protected:
1029
+ _CG_STATIC_QUALIFIER unsigned int build_mask() { // Mask passed to every *_sync call below; fullMask when the tile has 32 threads.
1030
+ unsigned int mask = fullMask;
1031
+ if (numThreads != 32) {
1032
+ // [0,31] representing the current active thread in the warp
1033
+ unsigned int laneId = details::laneid();
1034
+ // shift mask according to the partition it belongs to
1035
+ mask = th::tileMask << (laneId & ~(th::laneMask));
1036
+ }
1037
+ return (mask);
1038
+ }
1039
+
1040
+ public:
1041
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
1042
+
1043
+ _CG_STATIC_QUALIFIER void sync() { // __syncwarp restricted to this tile's mask
1044
+ __syncwarp(build_mask());
1045
+ }
1046
+
1047
+ #ifdef _CG_CPP11_FEATURES
1048
+ // PTX supported collectives: each forwards to shuffle_dispatch with the tile mask and numThreads as the width argument
1049
+ template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
1050
+ _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
1051
+ return details::tile::shuffle_dispatch<TyElem>::shfl(
1052
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
1053
+ }
1054
+
1055
+ template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
1056
+ _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
1057
+ return details::tile::shuffle_dispatch<TyElem>::shfl_down(
1058
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
1059
+ }
1060
+
1061
+ template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
1062
+ _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
1063
+ return details::tile::shuffle_dispatch<TyElem>::shfl_up(
1064
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
1065
+ }
1066
+
1067
+ template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
1068
+ _CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
1069
+ return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
1070
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
1071
+ }
1072
+ #else // without _CG_CPP11_FEATURES: arithmetic-only overloads calling the intrinsics directly
1073
+ template <typename TyIntegral>
1074
+ _CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {
1075
+ details::assert_if_not_arithmetic<TyIntegral>();
1076
+ return (__shfl_sync(build_mask(), var, srcRank, numThreads));
1077
+ }
1078
+
1079
+ template <typename TyIntegral>
1080
+ _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
1081
+ details::assert_if_not_arithmetic<TyIntegral>();
1082
+ return (__shfl_down_sync(build_mask(), var, delta, numThreads));
1083
+ }
1084
+
1085
+ template <typename TyIntegral>
1086
+ _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
1087
+ details::assert_if_not_arithmetic<TyIntegral>();
1088
+ return (__shfl_up_sync(build_mask(), var, delta, numThreads));
1089
+ }
1090
+
1091
+ template <typename TyIntegral>
1092
+ _CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
1093
+ details::assert_if_not_arithmetic<TyIntegral>();
1094
+ return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
1095
+ }
1096
+ #endif //_CG_CPP11_FEATURES
1097
+
1098
+ _CG_QUALIFIER int any(int predicate) const { // non-zero when any bit of the tile's ballot is set
1099
+ unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
1100
+ return (lane_ballot != 0);
1101
+ }
1102
+ _CG_QUALIFIER int all(int predicate) const { // non-zero when the tile's ballot equals the tile mask
1103
+ unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
1104
+ return (lane_ballot == build_mask());
1105
+ }
1106
+ _CG_QUALIFIER unsigned int ballot(int predicate) const {
1107
+ unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
1108
+ return (lane_ballot >> (details::laneid() & (~(th::laneMask)))); // shift right by the same amount build_mask() shifted the tile mask left
1109
+ }
1110
+
1111
+ #ifdef _CG_HAS_MATCH_COLLECTIVE
1112
+ template <typename TyIntegral>
1113
+ _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
1114
+ details::assert_if_not_arithmetic<TyIntegral>();
1115
+ unsigned int lane_match = __match_any_sync(build_mask(), val);
1116
+ return (lane_match >> (details::laneid() & (~(th::laneMask)))); // same right shift as ballot()
1117
+ }
1118
+
1119
+ template <typename TyIntegral>
1120
+ _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
1121
+ details::assert_if_not_arithmetic<TyIntegral>();
1122
+ unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
1123
+ return (lane_match >> (details::laneid() & (~(th::laneMask)))); // same right shift as ballot()
1124
+ }
1125
+ #endif
1126
+
1127
+ };
1128
+
1129
+ template <unsigned int Size, typename ParentT> // Derives meta-group rank and size from the static parent type ParentT.
1130
+ class __static_parent_thread_block_tile_base
1131
+ {
1132
+ public:
1133
+ // Rank of this group in the upper level of the hierarchy: ParentT::thread_rank() / Size
1134
+ _CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
1135
+ return ParentT::thread_rank() / Size;
1136
+ }
1137
+
1138
+ // Total num partitions created out of all CTAs when the group was created: ParentT::size() / Size, rounded up
1139
+ _CG_STATIC_QUALIFIER unsigned int meta_group_size() {
1140
+ return (ParentT::size() + Size - 1) / Size;
1141
+ }
1142
+ };
1143
+
1144
+ /**
1145
+ * class thread_block_tile<unsigned int Size, ParentT = void>
1146
+ *
1147
+ * Statically-sized group type, representing one tile of a thread block.
1148
+ * The only specializations currently supported are those with native
1149
+ * hardware support (1/2/4/8/16/32)
1150
+ *
1151
+ * This group exposes warp-synchronous builtins.
1152
+ * Can only be constructed via tiled_partition<Size>(ParentT&)
1153
+ */
1154
+
1155
+ template <unsigned int Size, typename ParentT = void> // Single-warp tile with a static parent type: both base classes are stateless here.
1156
+ class __single_warp_thread_block_tile :
1157
+ public __static_size_thread_block_tile_base<Size>,
1158
+ public __static_parent_thread_block_tile_base<Size, ParentT>
1159
+ {
1160
+ typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
1161
+ friend class details::_coalesced_group_data_access;
1162
+
1163
+ protected:
1164
+ _CG_QUALIFIER __single_warp_thread_block_tile() { };
1165
+ _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { }; // both arguments are ignored
1166
+
1167
+ _CG_STATIC_QUALIFIER unsigned int get_mask() { // recomputed on each call via build_mask()
1168
+ return __static_size_thread_block_tile_base<Size>::build_mask();
1169
+ }
1170
+ };
1171
+
1172
+ template <unsigned int Size> // ParentT = void specialization: mask, size and meta-group values are stored in _data.coalesced.
1173
+ class __single_warp_thread_block_tile<Size, void> :
1174
+ public __static_size_thread_block_tile_base<Size>,
1175
+ public thread_group_base<details::coalesced_group_id>
1176
+ {
1177
+ _CG_STATIC_CONST_DECL unsigned int numThreads = Size;
1178
+
1179
+ template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
1180
+ friend class details::_coalesced_group_data_access;
1181
+
1182
+ typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;
1183
+
1184
+ protected:
1185
+ _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank = 0, unsigned int meta_group_size = 1) { // defaults describe a lone group: rank 0 of 1
1186
+ _data.coalesced.mask = staticSizeBaseT::build_mask();
1187
+ _data.coalesced.size = numThreads;
1188
+ _data.coalesced.metaGroupRank = meta_group_rank;
1189
+ _data.coalesced.metaGroupSize = meta_group_size;
1190
+ _data.coalesced.is_tiled = true;
1191
+ }
1192
+
1193
+ _CG_QUALIFIER unsigned int get_mask() const { // returns the mask stored by the constructor
1194
+ return (_data.coalesced.mask);
1195
+ }
1196
+
1197
+ public:
1198
+ using staticSizeBaseT::sync; // name the static-size base's members explicitly (thread_group_base is also a base)
1199
+ using staticSizeBaseT::size;
1200
+ using staticSizeBaseT::num_threads;
1201
+ using staticSizeBaseT::thread_rank;
1202
+
1203
+ _CG_QUALIFIER unsigned int meta_group_rank() const {
1204
+ return _data.coalesced.metaGroupRank;
1205
+ }
1206
+
1207
+ _CG_QUALIFIER unsigned int meta_group_size() const {
1208
+ return _data.coalesced.metaGroupSize;
1209
+ }
1210
+ };
1211
+
1212
+ /**
1213
+ * Outer level API calls
1214
+ * void sync(GroupT) - see <group_type>.sync()
1215
+ * auto thread_rank(GroupT) - returns <group_type>.thread_rank() (unsigned long long before C++11)
1216
+ * auto group_size(GroupT) - returns <group_type>.num_threads() (unsigned long long before C++11)
1217
+ */
1218
+ template <class GroupT> // Free-function form of g.sync().
1219
+ _CG_QUALIFIER void sync(GroupT const &g)
1220
+ {
1221
+ g.sync();
1222
+ }
1223
+
1224
+ // TODO: Use a static dispatch to determine appropriate return type
1225
+ // C++03 is stuck with unsigned long long for now
1226
+ #ifdef _CG_CPP11_FEATURES // C++11: the return type follows the group's own member function
1227
+ template <class GroupT>
1228
+ _CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {
1229
+ return g.thread_rank();
1230
+ }
1231
+
1232
+
1233
+ template <class GroupT>
1234
+ _CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) { // forwards to num_threads(), not size()
1235
+ return g.num_threads();
1236
+ }
1237
+ #else // otherwise: results are cast to unsigned long long
1238
+ template <class GroupT>
1239
+ _CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {
1240
+ return static_cast<unsigned long long>(g.thread_rank());
1241
+ }
1242
+
1243
+
1244
+ template <class GroupT>
1245
+ _CG_QUALIFIER unsigned long long group_size(GroupT const &g) {
1246
+ return static_cast<unsigned long long>(g.num_threads());
1247
+ }
1248
+ #endif
1249
+
1250
+
1251
+ /**
1252
+ * tiled_partition
1253
+ *
1254
+ * The tiled_partition(parent, tilesz) method is a collective operation that
1255
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
1256
+ *
1257
+ * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
1258
+ * be created where threads having identical k = (thread_rank(parent)/tilesz)
1259
+ * will be members of the same subgroup.
1260
+ *
1261
+ * The implementation may cause the calling thread to wait until all the members
1262
+ * of the parent group have invoked the operation before resuming execution.
1263
+ *
1264
+ * Functionality is limited to power-of-two sized subgroup instances of at most
1265
+ * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
1266
+ * tiled_partition() in _CG_VERSION 1000.
1267
+ */
1268
+ _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz) // Generic overload: dispatches on the parent's runtime type tag.
1269
+ {
1270
+ if (parent.get_type() == details::coalesced_group_id) {
1271
+ const coalesced_group *_cg = static_cast<const coalesced_group*>(&parent);
1272
+ return _cg->_get_tiled_threads(tilesz);
1273
+ }
1274
+ else { // every other type tag is treated as a thread_block
1275
+ const thread_block *_tb = static_cast<const thread_block*>(&parent);
1276
+ return _tb->_get_tiled_threads(tilesz);
1277
+ }
1278
+ }
1279
+
1280
+ // Thread block type overload: returns a basic thread_group for now (may be specialized later)
1281
+ _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz) // forwards to thread_block::_get_tiled_threads
1282
+ {
1283
+ return (parent._get_tiled_threads(tilesz));
1284
+ }
1285
+
1286
+ // Coalesced group type overload: retains its ability to stay coalesced
1287
+ _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz) // forwards to coalesced_group::_get_tiled_threads
1288
+ {
1289
+ return (parent._get_tiled_threads(tilesz));
1290
+ }
1291
+
1292
+ namespace details { // Internal tile type plus free-function accessors that forward to a multi-warp group's members.
1293
+ template <unsigned int Size, typename ParentT>
1294
+ class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {}; // adds no members of its own
1295
+
1296
+ template <unsigned int Size, typename ParentT>
1297
+ _CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() { // default-constructs the tile; no parent object is needed
1298
+ return internal_thread_block_tile<Size, ParentT>();
1299
+ }
1300
+
1301
+ template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
1302
+ _CG_QUALIFIER TyVal multi_warp_collectives_helper( // forwards to group.collectives_scheme<TyVal>(...)
1303
+ const GroupT& group,
1304
+ WarpLambda warp_lambda,
1305
+ InterWarpLambda inter_warp_lambda) {
1306
+ return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
1307
+ }
1308
+
1309
+ template <typename T, typename GroupT>
1310
+ _CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) { // forwards to group.get_scratch_location<T>(warp_id)
1311
+ return group.template get_scratch_location<T>(warp_id);
1312
+ }
1313
+
1314
+ template <typename GroupT>
1315
+ _CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) { // forwards to group.get_sync_location()
1316
+ return group.get_sync_location();
1317
+ }
1318
+
1319
+ }
1320
+ /**
1321
+ * tiled_partition<tilesz>
1322
+ *
1323
+ * The tiled_partition<tilesz>(parent) method is a collective operation that
1324
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
1325
+ *
1326
+ * A total of (size(parent)/tilesz) subgroups will be created,
1327
+ * therefore the parent group size must be evenly divisible by the tilesz.
1328
+ * The allowed parent groups are thread_block or thread_block_tile<size>.
1329
+ *
1330
+ * The implementation may cause the calling thread to wait until all the members
1331
+ * of the parent group have invoked the operation before resuming execution.
1332
+ *
1333
+ * Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
1334
+ * The size(parent) must be greater than the template Size parameter
1335
+ * otherwise the results are undefined.
1336
+ */
1337
+
1338
+ #if defined(_CG_CPP11_FEATURES)
1339
+ template <unsigned int Size> // Tile spanning several warps: collectives go through tile_memory (barriers plus per-warp scratch slots).
1340
+ class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
1341
+ {
1342
+ static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512"); // NOTE(review): _is_multi_warp also admits 1024, which this message omits - confirm
1343
+
1344
+ template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
1345
+ friend __device__ TyVal details::multi_warp_collectives_helper(
1346
+ const GroupT& group,
1347
+ WarpLambda warp_lambda,
1348
+ InterWarpLambda inter_warp_lambda);
1349
+ template <typename T, typename GroupT>
1350
+ friend __device__ T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
1351
+ template <typename GroupT>
1352
+ friend __device__ details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
1353
+ template <unsigned int OtherSize>
1354
+ friend class __static_size_multi_warp_tile_base;
1355
+ using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
1356
+ using ThisType = __static_size_multi_warp_tile_base<Size>;
1357
+ _CG_STATIC_CONST_DECL int numWarps = Size / 32;
1358
+
1359
+ protected:
1360
+ details::multi_warp_scratch* const tile_memory; // copied from the group this tile is built from
1361
+
1362
+ template <typename GroupT>
1363
+ _CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : tile_memory(g.tile_memory) {
1364
+ #if !defined(_CG_USER_PROVIDED_SHARED_MEMORY)
1365
+ NV_IF_TARGET(NV_PROVIDES_SM_80,
1366
+ details::sync_warps_reset(get_sync_location(), details::cta::thread_rank());
1367
+ g.sync();
1368
+ )
1369
+ #endif
1370
+ }
1371
+
1372
+
1373
+ private:
1374
+ _CG_QUALIFIER details::barrier_t* get_sync_location() const {
1375
+ // Different group sizes use different barriers, all groups of a given size share one barrier.
1376
+ unsigned int sync_id = details::log2(Size / 64);
1377
+ return &tile_memory->barriers[sync_id];
1378
+ }
1379
+
1380
+ template <typename T>
1381
+ _CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const { // slot index = (block rank of this tile's first thread) / 32 + warp_id
1382
+ unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;
1383
+ return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
1384
+ }
1385
+
1386
+ template <typename T>
1387
+ _CG_QUALIFIER T* get_scratch_location() const { // slot index = the caller's block rank / 32
1388
+ unsigned int scratch_id = details::cta::thread_rank() / 32;
1389
+ return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
1390
+ }
1391
+
1392
+ template <typename TyVal>
1393
+ _CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const { // Every caller returns the value written by the thread of tile rank `src`.
1394
+ unsigned int src_warp = src / 32;
1395
+ auto warp = details::tiled_partition_internal<32, ThisType>();
1396
+ details::barrier_t* sync_location = get_sync_location();
1397
+
1398
+ // Get warp slot of the source threads warp.
1399
+ TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
1400
+
1401
+ if (warp.meta_group_rank() == src_warp) {
1402
+ warp.sync();
1403
+ // Put shuffled value into my warp slot and let my warp arrive at the barrier.
1404
+ if (thread_rank() == src) {
1405
+ *warp_scratch_location = val;
1406
+ }
1407
+ details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
1408
+ TyVal result = *warp_scratch_location;
1409
+ details::sync_warps_wait(sync_location, details::cta::thread_rank());
1410
+ return result;
1411
+ }
1412
+ else {
1413
+ // Wait for the source warp to arrive on the barrier.
1414
+ details::sync_warps_wait_for_specific_warp(sync_location,
1415
+ (details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp));
1416
+ TyVal result = *warp_scratch_location;
1417
+ details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
1418
+ return result;
1419
+ }
1420
+ }
1421
+
1422
+ template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
1423
+ _CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const { // Two stages: warp_lambda per warp, then inter_warp_lambda over the per-warp slots.
1424
+ static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
1425
+ "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
1426
+ auto warp = details::tiled_partition_internal<32, ThisType>();
1427
+ details::barrier_t* sync_location = get_sync_location();
1428
+ TyVal* warp_scratch_location = get_scratch_location<TyVal>();
1429
+
1430
+ warp_lambda(warp, warp_scratch_location);
1431
+
1432
+ if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) { // NOTE(review): presumably true only in the last warp to arrive - helper is outside this view
1433
+ auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
1434
+ if (subwarp.meta_group_rank() == 0) { // only the first numWarps threads of the warp take part, one per warp slot
1435
+ TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());
1436
+ inter_warp_lambda(subwarp, thread_scratch_location);
1437
+ }
1438
+ warp.sync();
1439
+ details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
1440
+ }
1441
+ TyVal result = *warp_scratch_location; // each thread reads its own warp's slot
1442
+ return result;
1443
+ }
1444
+
1445
+ public:
1446
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
1447
+
1448
+ using __static_size_tile_base<Size>::thread_rank;
1449
+
1450
+ template <typename TyVal>
1451
+ _CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const { // size-checked entry point for shfl_impl
1452
+ static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
1453
+ "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
1454
+ return shfl_impl(val, src);
1455
+ }
1456
+
1457
+ _CG_QUALIFIER void sync() const { // barrier across the tile's numWarps warps
1458
+ details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
1459
+ }
1460
+
1461
+ _CG_QUALIFIER int any(int predicate) const { // __any_sync per warp, then __any_sync across the per-warp results
1462
+ auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
1463
+ *warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);
1464
+ };
1465
+ auto inter_warp_lambda =
1466
+ [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
1467
+ *thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location); // mask of the low numWarps lanes
1468
+ };
1469
+ return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
1470
+ }
1471
+
1472
+ _CG_QUALIFIER int all(int predicate) const { // __all_sync per warp, then __all_sync across the per-warp results
1473
+ auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
1474
+ *warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);
1475
+ };
1476
+ auto inter_warp_lambda =
1477
+ [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
1478
+ *thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location); // mask of the low numWarps lanes
1479
+ };
1480
+ return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
1481
+ }
1482
+ };
1483
+
1484
+
1485
+ template <unsigned int Size, typename ParentT = void> // Multi-warp tile with a static parent: meta-group values come from the static parent base.
1486
+ class __multi_warp_thread_block_tile :
1487
+ public __static_size_multi_warp_tile_base<Size>,
1488
+ public __static_parent_thread_block_tile_base<Size, ParentT>
1489
+ {
1490
+ typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
1491
+ typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
1492
+ protected:
1493
+ _CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) : // the parent is handed to the size base, which copies its tile_memory
1494
+ __static_size_multi_warp_tile_base<Size>(g) {}
1495
+ };
1496
+
1497
+ template <unsigned int Size> // ParentT = void specialization: meta-group rank and size are captured at construction.
1498
+ class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
1499
+ {
1500
+ const unsigned int metaGroupRank;
1501
+ const unsigned int metaGroupSize;
1502
+
1503
+ protected:
1504
+ template <unsigned int OtherSize, typename ParentT>
1505
+ _CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) : // copies g's meta_group_rank() / meta_group_size()
1506
+ __static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
1507
+
1508
+ public:
1509
+ _CG_QUALIFIER unsigned int meta_group_rank() const {
1510
+ return metaGroupRank;
1511
+ }
1512
+
1513
+ _CG_QUALIFIER unsigned int meta_group_size() const {
1514
+ return metaGroupSize;
1515
+ }
1516
+ };
1517
+ #endif
1518
+
1519
+ template <unsigned int Size, typename ParentT = void>
1520
+ class thread_block_tile;
1521
+
1522
+ namespace details { // thread_block_tile_impl selects the single-warp or multi-warp implementation via IsMultiWarp.
1523
+ template <unsigned int Size, typename ParentT, bool IsMultiWarp>
1524
+ class thread_block_tile_impl;
1525
+
1526
+ template <unsigned int Size, typename ParentT>
1527
+ class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
1528
+ {
1529
+ protected:
1530
+ template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
1531
+ _CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) : // passes g's meta-group rank and size to the base
1532
+ __single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}
1533
+
1534
+ _CG_QUALIFIER thread_block_tile_impl(const thread_block& g) : // g itself is not read
1535
+ __single_warp_thread_block_tile<Size, ParentT>() {}
1536
+ };
1537
+
1538
+ #if defined(_CG_CPP11_FEATURES)
1539
+ template <unsigned int Size, typename ParentT>
1540
+ class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
1541
+ {
1542
+ protected:
1543
+ template <typename GroupT>
1544
+ _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
1545
+ __multi_warp_thread_block_tile<Size, ParentT>(g) {}
1546
+ };
1547
+ #else // without _CG_CPP11_FEATURES the multi-warp variant is an empty class
1548
+ template <unsigned int Size, typename ParentT>
1549
+ class thread_block_tile_impl<Size, ParentT, true>
1550
+ {
1551
+ protected:
1552
+ template <typename GroupT>
1553
+ _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
1554
+ };
1555
+ #endif
1556
+ }
1557
+
1558
+ template <unsigned int Size, typename ParentT> // Public tile type with a static parent; convertible to thread_block_tile<Size, void>.
1559
+ class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
1560
+ {
1561
+ friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();
1562
+
1563
+ protected:
1564
+ _CG_QUALIFIER thread_block_tile(const ParentT& g) : // protected: reachable from derived classes and the friend this_thread()
1565
+ details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
1566
+
1567
+ public:
1568
+ _CG_QUALIFIER operator thread_block_tile<Size, void>() const { // converts to the ParentT = void form of the same size
1569
+ return thread_block_tile<Size, void>(*this);
1570
+ }
1571
+ };
1572
+
1573
+ template <unsigned int Size> // ParentT = void specialization: built from a thread_block_tile that has a static parent.
1574
+ class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
1575
+ {
1576
+ template <unsigned int, typename ParentT>
1577
+ friend class thread_block_tile;
1578
+
1579
+ protected:
1580
+ template <unsigned int OtherSize, typename OtherParentT>
1581
+ _CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) : // protected: accepts a tile of any size
1582
+ details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
1583
+
1584
+ public:
1585
+ template <typename ParentT>
1586
+ _CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) : // public: accepts only a tile of the same Size
1587
+ details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
1588
+ };
1589
+
1590
+ namespace details { // tiled_partition_impl: derived types whose public constructors reach thread_block_tile's protected one.
1591
+ template <unsigned int Size, typename ParentT>
1592
+ struct tiled_partition_impl; // only the two parent kinds specialized below are defined
1593
+
1594
+ template <unsigned int Size>
1595
+ struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
1596
+ _CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
1597
+ thread_block_tile<Size, thread_block>(g) {}
1598
+ };
1599
+
1600
+ // ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
1601
+ template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
1602
+ struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
1603
+ public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
1604
+ #ifdef _CG_CPP11_FEATURES
1605
+ static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size"); // the child tile must be strictly smaller than its parent
1606
+ #endif
1607
+ _CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
1608
+ thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
1609
+ };
1610
+
1611
+ }
1612
+
1613
+ template <unsigned int Size, typename ParentT> // Static-size partition: builds a tiled_partition_impl and returns it as its thread_block_tile base.
1614
+ _CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
1615
+ {
1616
+ return details::tiled_partition_impl<Size, ParentT>(g);
1617
+ }
1618
+
1619
+ /**
1620
+ * thread_block_tile<1, void> this_thread()
1621
+ *
1622
+ * Constructs a generic thread_group containing only the calling thread
1623
+ */
1624
+ _CG_QUALIFIER thread_block_tile<1, void> this_thread() // Returns a tile of size 1 for the calling thread.
1625
+ {
1626
+ // Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its
1627
+ // meta group rank and size set to 0 and 1 respectively.
1628
+ return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block()); // converted to thread_block_tile<1, void> on return
1629
+ }
1630
+
1631
+ /**
1632
+ * <group_type>.sync()
1633
+ *
1634
+ * Executes a barrier across the group
1635
+ *
1636
+ * Implements both a compiler fence and an architectural fence to prevent
1637
+ * memory reordering around the barrier.
1638
+ */
1639
+ _CG_QUALIFIER void thread_group::sync() const // Dispatches on the stored type tag to the concrete group's sync().
1640
+ {
1641
+ switch (_data.group.type) {
1642
+ case details::coalesced_group_id:
1643
+ cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
1644
+ break;
1645
+ case details::thread_block_id:
1646
+ cooperative_groups::sync(*static_cast<const thread_block*>(this));
1647
+ break;
1648
+ case details::grid_group_id:
1649
+ cooperative_groups::sync(*static_cast<const grid_group*>(this));
1650
+ break;
1651
+ #if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1652
+ case details::multi_grid_group_id:
1653
+ cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
1654
+ break;
1655
+ #endif
1656
+ #if defined(_CG_HAS_CLUSTER_GROUP)
1657
+ case details::cluster_group_id:
1658
+ cooperative_groups::sync(*static_cast<const cluster_group*>(this));
1659
+ break;
1660
+ #endif
1661
+ default: // unrecognised type tag: no synchronisation is performed
1662
+ break;
1663
+ }
1664
+ }
1665
+
1666
+ /**
1667
+ * <group_type>.size()
1668
+ *
1669
+ * Returns the total number of threads in the group.
1670
+ */
1671
+ _CG_QUALIFIER unsigned long long thread_group::size() const // Dispatches on the stored type tag to group_size() of the concrete group.
1672
+ {
1673
+ unsigned long long size = 0; // returned unchanged when the type tag matches no case
1674
+ switch (_data.group.type) {
1675
+ case details::coalesced_group_id:
1676
+ size = cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
1677
+ break;
1678
+ case details::thread_block_id:
1679
+ size = cooperative_groups::group_size(*static_cast<const thread_block*>(this));
1680
+ break;
1681
+ case details::grid_group_id:
1682
+ size = cooperative_groups::group_size(*static_cast<const grid_group*>(this));
1683
+ break;
1684
+ #if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1685
+ case details::multi_grid_group_id:
1686
+ size = cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
1687
+ break;
1688
+ #endif
1689
+ #if defined(_CG_HAS_CLUSTER_GROUP)
1690
+ case details::cluster_group_id:
1691
+ size = cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
1692
+ break;
1693
+ #endif
1694
+ default:
1695
+ break;
1696
+ }
1697
+ return size;
1698
+ }
1699
+
1700
+ /**
1701
+ * <group_type>.thread_rank()
1702
+ *
1703
+ * Returns the linearized rank of the calling thread along the interval [0, size()).
1704
+ */
1705
+ _CG_QUALIFIER unsigned long long thread_group::thread_rank() const // Dispatches on the stored type tag to thread_rank() of the concrete group.
1706
+ {
1707
+ unsigned long long rank = 0; // returned unchanged when the type tag matches no case
1708
+ switch (_data.group.type) {
1709
+ case details::coalesced_group_id:
1710
+ rank = cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
1711
+ break;
1712
+ case details::thread_block_id:
1713
+ rank = cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
1714
+ break;
1715
+ case details::grid_group_id:
1716
+ rank = cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
1717
+ break;
1718
+ #if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1719
+ case details::multi_grid_group_id:
1720
+ rank = cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
1721
+ break;
1722
+ #endif
1723
+ #if defined(_CG_HAS_CLUSTER_GROUP)
1724
+ case details::cluster_group_id:
1725
+ rank = cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
1726
+ break;
1727
+ #endif
1728
+ default:
1729
+ break;
1730
+ }
1731
+ return rank;
1732
+ }
1733
+
1734
+ _CG_END_NAMESPACE
1735
+
1736
+ #include <cooperative_groups/details/partitioning.h>
1737
+ #if (!defined(_MSC_VER) || defined(_WIN64))
1738
+ # include <cooperative_groups/details/invoke.h>
1739
+ #endif
1740
+
1741
+ # endif /* ! (__cplusplus, __CUDACC__) */
1742
+
1743
+ #endif /* !_COOPERATIVE_GROUPS_H_ */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/async.h ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_ASYNC_H
50
+ #define _CG_ASYNC_H
51
+
52
+ #include "helpers.h"
53
+ #include "info.h"
54
+
55
+ #include <cuda_pipeline.h>
56
+
57
+ _CG_BEGIN_NAMESPACE
58
+
59
+ namespace details {
60
+ // Trait marking the group types memcpy_async accepts: thread_block_tile, coalesced_group and thread_block are true_type, everything else false_type
61
+ template <class TyGroup>
62
+ struct _async_copy_group_supported : public _CG_STL_NAMESPACE::false_type {}; // primary template: unsupported by default
63
+
64
+ template <unsigned int Sz, typename TyPar>
65
+ struct _async_copy_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>>
66
+ : public _CG_STL_NAMESPACE::true_type {}; // any tile size and parent type
67
+ template <>
68
+ struct _async_copy_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
69
+ template <>
70
+ struct _async_copy_group_supported<cooperative_groups::thread_block> : public _CG_STL_NAMESPACE::true_type {};
71
+
72
+ template <class TyGroup>
73
+ using async_copy_group_supported = _async_copy_group_supported<details::remove_qual<TyGroup>>; // lookup is done on details::remove_qual<TyGroup>, not on TyGroup itself
74
+
75
+ // Trait selecting the tile-optimized accelerated_async_copy overload: true_type only for thread_block_tile with a size other than 1
76
+ template <class TyGroup>
77
+ struct _async_copy_optimize_tile : public _CG_STL_NAMESPACE::false_type {}; // primary template: no tile optimization
78
+
79
+ template <typename TyPar>
80
+ struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<1, TyPar>>
81
+ : public _CG_STL_NAMESPACE::false_type {}; // size-1 tiles are explicitly excluded; this is more specialized than the Sz case below
82
+
83
+ template <unsigned int Sz, typename TyPar>
84
+ struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<Sz, TyPar>>
85
+ : public _CG_STL_NAMESPACE::true_type {}; // every other tile size
86
+
87
+ template <class TyGroup>
88
+ using async_copy_optimize_tile = _async_copy_optimize_tile<details::remove_qual<TyGroup>>; // lookup is done on details::remove_qual<TyGroup>
89
+
90
+ // SFINAE helpers for tile optimizations: each alias names void* only when its condition holds, so it can be used as a defaulted non-type template parameter (= nullptr)
91
+ template <class TyGroup>
92
+ using enable_tile_optimization =
93
+ typename _CG_STL_NAMESPACE::enable_if<async_copy_optimize_tile<TyGroup>::value, void *>::type; // valid only when the trait is true
94
+
95
+ template <class TyGroup>
96
+ using disable_tile_optimization =
97
+ typename _CG_STL_NAMESPACE::enable_if<!async_copy_optimize_tile<TyGroup>::value, void *>::type; // exact complement of enable_tile_optimization
98
+
99
+ // Segment for punning to aligned types: plain storage of N ints with no alignment attribute of its own
100
+ template <unsigned int N>
101
+ struct _Segment {
102
+ int _seg[N];
103
+ };
104
+
105
+ // Trivial layout guaranteed-aligned copy-async compatible segments; only N = 1, 2 and 4 are defined
106
+ template <unsigned int N>
107
+ struct Segment; // declared only: any other N fails to compile when used
108
+ template <>
109
+ struct __align__(4) Segment<1> : public _Segment<1>{}; // one int, declared 4-byte aligned
110
+ template <>
111
+ struct __align__(8) Segment<2> : public _Segment<2>{}; // two ints, declared 8-byte aligned
112
+ template <>
113
+ struct __align__(16) Segment<4> : public _Segment<4>{}; // four ints, declared 16-byte aligned
114
+
115
+ // Interleaved element by element copies from source to dest: each thread copies elements rank, rank + size(), rank + 2 * size(), ... below count
116
+ template <typename TyGroup, typename TyElem>
117
+ _CG_STATIC_QUALIFIER void inline_copy(TyGroup &group, TyElem *__restrict__ dst, const TyElem *__restrict__ src,
118
+ size_t count) { // count is a number of TyElem elements, not bytes
119
+ const unsigned int rank = group.thread_rank(); // first index handled by this thread
120
+ const unsigned int stride = group.size(); // distance between consecutive indices handled by this thread
121
+
122
+ for (size_t idx = rank; idx < count; idx += stride) { // a thread whose rank is >= count copies nothing
123
+ dst[idx] = src[idx]; // plain assignment; no __pipeline_* call is involved
124
+ }
125
+ }
126
+
127
+ template <typename TyGroup, typename TyElem, enable_tile_optimization<TyGroup> = nullptr> // tile-optimized overload: participates only when async_copy_optimize_tile<TyGroup> is true
128
+ _CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
129
+ const TyElem *__restrict__ src, size_t count) { // count is a number of TyElem elements
130
+ static_assert(async_copy_group_supported<TyGroup>::value,
131
+ "Async copy is only supported for groups that represent private shared memory");
132
+
133
+ if (count == 0) { // nothing to do; also keeps the remainder arithmetic below away from the empty case
134
+ return;
135
+ }
136
+
137
+ const bool dstIsNotShared = !__isShared(dst);
138
+ const bool srcIsNotGlobal = !__isGlobal(src);
139
+
140
+ if (dstIsNotShared || srcIsNotGlobal) { // anything other than a global -> shared copy falls back to plain element assignment
141
+ inline_copy(group, dst, src, count);
142
+ return;
143
+ }
144
+
145
+ const unsigned int stride = group.size();
146
+ const unsigned int rank = group.thread_rank();
147
+ // Efficient copies require warps to operate on the same amount of work at each step.
148
+ // Remainders are handled in a separate stage to prevent branching
149
+ const unsigned int subWarpMask = (stride - 1); // used as a low-bit mask, which assumes stride is a power of two - TODO confirm for every tile size
150
+ const unsigned int subwarpCopies = (subWarpMask & (unsigned int)count); // elements left over after the full-stride passes (count % stride under that assumption)
151
+ const unsigned int maxSubwarpRank = min(rank, subwarpCopies - 1); // clamps rank to the last leftover element; wraps when subwarpCopies == 0, but is then unused (see the guard below)
152
+
153
+ const size_t warpCopies = (count & (~subWarpMask)); // NOTE(review): ~subWarpMask is a 32-bit value, so bits of count above bit 31 are cleared as well
154
+
155
+ for (size_t idx = 0; idx < warpCopies; idx += stride) { // same trip count for every thread of the group
156
+ size_t _srcIdx = rank + idx;
157
+ size_t _dstIdx = rank + idx;
158
+ __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem)); // one element per thread per pass
159
+ }
160
+
161
+ if (subwarpCopies) { // remainder stage: every thread issues one copy; threads with rank >= subwarpCopies repeat the last leftover element
162
+ size_t _srcIdx = warpCopies + maxSubwarpRank;
163
+ size_t _dstIdx = warpCopies + maxSubwarpRank;
164
+ __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
165
+ }
166
+ }
167
+
168
+ template <typename TyGroup, typename TyElem, disable_tile_optimization<TyGroup> = nullptr> // generic overload: participates only when async_copy_optimize_tile<TyGroup> is false
169
+ _CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
170
+ const TyElem *__restrict__ src, size_t count) { // count is a number of TyElem elements
171
+ static_assert(async_copy_group_supported<TyGroup>::value,
172
+ "Async copy is only supported for groups that represent private shared memory");
173
+
174
+ const bool dstIsNotShared = !__isShared(dst);
175
+ const bool srcIsNotGlobal = !__isGlobal(src);
176
+
177
+ if (dstIsNotShared || srcIsNotGlobal) { // anything other than a global -> shared copy falls back to plain element assignment
178
+ inline_copy(group, dst, src, count);
179
+ return;
180
+ }
181
+
182
+ unsigned int stride = group.size();
183
+ unsigned int rank = group.thread_rank();
184
+
185
+ for (size_t idx = rank; idx < count; idx += stride) { // same index pattern as inline_copy; count == 0 simply skips the loop
186
+ size_t _srcIdx = idx;
187
+ size_t _dstIdx = idx;
188
+ __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem)); // one element per iteration
189
+ }
190
+ }
191
+
192
+ // Determine best possible alignment given an input and initial conditions: returns the lowest power of two in [MinAlignment, MaxAlignment) at which the dst and src addresses differ, or MaxAlignment when none do
193
+ // Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments
194
+ template <unsigned int MinAlignment, unsigned int MaxAlignment>
195
+ _CG_STATIC_QUALIFIER uint32_t find_best_alignment(void *__restrict__ dst, const void *__restrict__ src) {
196
+ // Narrowing conversion intentional
197
+ uint32_t base1 = (uint32_t) reinterpret_cast<uintptr_t>(src);
198
+ uint32_t base2 = (uint32_t) reinterpret_cast<uintptr_t>(dst);
199
+
200
+ uint32_t diff = ((base1) ^ (base2)) & (MaxAlignment - 1); // low address bits in which the two pointers disagree
201
+
202
+ // range [MaxAlignment, alignof(elem)], step: x >> 1
203
+ // over range of possible alignments, choose best available out of range
204
+ uint32_t out = MaxAlignment; // result when no examined bit differs
205
+ #pragma unroll
206
+ for (uint32_t alignment = (MaxAlignment >> 1); alignment >= MinAlignment; alignment >>= 1) { // walks downward, so the lowest differing bit is the last one written; the loop only ends when MinAlignment >= 1
207
+ if (alignment & diff)
208
+ out = alignment;
209
+ }
210
+
211
+ return out; // only dst-vs-src agreement was compared; neither pointer was checked against this alignment by itself
212
+ }
213
+
214
+ // Copies count bytes in three stages: a byte-wise head that advances src to an alignof(TyType) boundary,
215
+ // an accelerated_async_copy of whole TyType elements, and a byte-wise tail for whatever is left over
216
+ template <typename TyType, typename TyGroup>
217
+ _CG_STATIC_QUALIFIER void copy_like(const TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
218
+ size_t count) { // count is in bytes
219
+ const char *src = reinterpret_cast<const char *>(_src);
220
+ char *dst = reinterpret_cast<char *>(_dst);
221
+
222
+ constexpr uint32_t targetAlignment = (uint32_t)alignof(TyType);
223
+
224
+ uint32_t base = (uint32_t) reinterpret_cast<uintptr_t>(src);
225
+ uint32_t alignOffset = ((~base) + 1) & (targetAlignment - 1); // (-base) mod targetAlignment: bytes needed to bring src up to the next boundary (0 if already aligned)
226
+
227
+ inline_copy(group, dst, src, alignOffset); // head: byte-wise
228
+ count -= alignOffset; // unsigned subtraction: assumes count >= alignOffset - TODO confirm every caller guarantees it
229
+ src += alignOffset;
230
+ dst += alignOffset; // dst is advanced by the same amount; only src was used to compute the offset
231
+
232
+ // Copy using the best available alignment, async_copy expects n-datums, not bytes
233
+ size_t asyncCount = count / sizeof(TyType);
234
+ accelerated_async_copy(group, reinterpret_cast<TyType *>(dst), reinterpret_cast<const TyType *>(src), asyncCount);
235
+ asyncCount *= sizeof(TyType); // back to a byte count for the pointer arithmetic below
236
+
237
+ count -= asyncCount; // remaining bytes, fewer than sizeof(TyType)
238
+ src += asyncCount;
239
+ dst += asyncCount;
240
+ inline_copy(group, dst, src, count); // tail: byte-wise
241
+ }
242
+
243
+ // We must determine alignment and manually align src/dst ourselves: generic dispatcher used for every AlignHint without a dedicated specialization
244
+ template <size_t AlignHint>
245
+ struct _memcpy_async_align_dispatch {
246
+ template <typename TyGroup>
247
+ _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ dst, const void *__restrict__ src, size_t count) { // count is in bytes
248
+ uint32_t alignment = find_best_alignment<AlignHint, 16>(dst, src); // a power of two no larger than 16, taken from the runtime addresses
249
+
250
+ // Avoid copying the extra bytes if desired copy count is smaller
251
+ alignment = count < alignment ? AlignHint : alignment; // this also keeps count >= alignment for the copy_like cases below
252
+
253
+ switch (alignment) {
254
+ default: // any value without its own case is handled byte-wise
255
+ case 1:
256
+ inline_copy(group, reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), count);
257
+ break;
258
+ case 2: // NOTE(review): the copy below moves count >> 1 shorts, so an odd count leaves the last byte uncopied; find_best_alignment only compares dst with src, so 2-byte alignment of each pointer is assumed - TODO confirm callers guarantee both
259
+ inline_copy(group, reinterpret_cast<short *>(dst), reinterpret_cast<const short *>(src), count >> 1);
260
+ break;
261
+ case 4:
262
+ copy_like<Segment<1>>(group, dst, src, count); // byte head / 4-byte body / byte tail
263
+ break;
264
+ case 8:
265
+ copy_like<Segment<2>>(group, dst, src, count); // byte head / 8-byte body / byte tail
266
+ break;
267
+ case 16:
268
+ copy_like<Segment<4>>(group, dst, src, count); // byte head / 16-byte body / byte tail
269
+ break;
270
+ }
271
+ }
272
+ };
273
+
274
+ // Specialization for 4 byte alignments: no runtime alignment probing, both pointers are reinterpreted as Segment<1> directly
275
+ template <>
276
+ struct _memcpy_async_align_dispatch<4> {
277
+ template <typename TyGroup>
278
+ _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
279
+ size_t count) { // count is in bytes
280
+ const Segment<1> *src = reinterpret_cast<const Segment<1> *>(_src);
281
+ Segment<1> *dst = reinterpret_cast<Segment<1> *>(_dst);
282
+
283
+ // Dispatch straight to aligned LDGSTS calls
284
+ accelerated_async_copy(group, dst, src, count / sizeof(*dst)); // integer division: a remainder of count % sizeof(*dst) bytes is not copied
285
+ }
286
+ };
287
+
288
+ // Specialization for 8 byte alignments: no runtime alignment probing, both pointers are reinterpreted as Segment<2> directly
289
+ template <>
290
+ struct _memcpy_async_align_dispatch<8> {
291
+ template <typename TyGroup>
292
+ _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
293
+ size_t count) { // count is in bytes
294
+ const Segment<2> *src = reinterpret_cast<const Segment<2> *>(_src);
295
+ Segment<2> *dst = reinterpret_cast<Segment<2> *>(_dst);
296
+
297
+ // Dispatch straight to aligned LDGSTS calls
298
+ accelerated_async_copy(group, dst, src, count / sizeof(*dst)); // integer division: a remainder of count % sizeof(*dst) bytes is not copied
299
+ }
300
+ };
301
+
302
+ // Alignments over 16 are truncated to 16 and bypass alignment (the truncation itself is done by callers such as _memcpy_async_bytes)
303
+ // This is the highest performing memcpy available
304
+ template <>
305
+ struct _memcpy_async_align_dispatch<16> {
306
+ template <typename TyGroup>
307
+ _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
308
+ size_t count) { // count is in bytes
309
+ const Segment<4> *src = reinterpret_cast<const Segment<4> *>(_src);
310
+ Segment<4> *dst = reinterpret_cast<Segment<4> *>(_dst);
311
+
312
+ // Dispatch straight to aligned LDGSTS calls
313
+ accelerated_async_copy(group, dst, src, count / sizeof(*dst)); // integer division: a remainder of count % sizeof(*dst) bytes is not copied
314
+ }
315
+ };
316
+
317
+ // byte-wide API: forwards to the _memcpy_async_align_dispatch instantiation chosen by the compile-time Alignment
318
+ template <size_t Alignment, class TyGroup>
319
+ _CG_STATIC_QUALIFIER void _memcpy_async_dispatch_to_aligned_copy(const TyGroup &group, void *__restrict__ _dst,
320
+ const void *__restrict__ _src, size_t count) { // count is in bytes
321
+ static_assert(!(Alignment & (Alignment - 1)), "Known static alignment dispatch must be a power of 2");
322
+ details::_memcpy_async_align_dispatch<Alignment>::copy(group, _dst, _src, count); // 4, 8 and 16 have dedicated specializations; any other value uses the generic dispatcher
323
+ }
324
+
325
+ // Internal dispatch APIs
326
+ // These deduce the alignments and sizes necessary to invoke the underlying copy engine
327
+ template <typename Ty>
328
+ using is_void = _CG_STL_NAMESPACE::is_same<Ty, void>; // exact match against void only; no qualifier stripping is applied here
329
+
330
+ template <typename Ty>
331
+ using enable_if_not_void = typename _CG_STL_NAMESPACE::enable_if<!is_void<Ty>::value, void *>::type; // names void* only when Ty is not void
332
+
333
+ template <typename Ty>
334
+ using enable_if_void = typename _CG_STL_NAMESPACE::enable_if<is_void<Ty>::value, void *>::type; // names void* only when Ty is void
335
+
336
+ template <typename Ty>
337
+ using enable_if_integral =
338
+ typename _CG_STL_NAMESPACE::enable_if<_CG_STL_NAMESPACE::is_integral<Ty>::value, void *>::type; // names void* only when Ty is an integral type
339
+
340
+ // byte-wide API using aligned_size_t: the alignment comes from the Hint carried by the count's type
341
+ template <class TyGroup, template <size_t> typename Alignment, size_t Hint>
342
+ _CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, void *__restrict__ _dst,
343
+ const void *__restrict__ _src, const Alignment<Hint> &count) {
344
+ constexpr size_t _align = (Hint > 16) ? 16 : Hint; // hints above 16 are clamped to 16
345
+
346
+ details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, (size_t)count); // the count object is converted to a plain byte count
347
+ }
348
+
349
+ // byte-wide API using type for alignment: Hint defaults to alignof(TyElem)
350
+ template <class TyGroup, typename TyElem, typename TySize, size_t Hint = alignof(TyElem),
351
+ enable_if_not_void<TyElem> = nullptr, enable_if_integral<TySize> = nullptr> // participates only for non-void TyElem and an integral count type
352
+ _CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
353
+ const TyElem *__restrict__ _src, const TySize& count) { // count is forwarded unchanged as a byte count, not multiplied by sizeof(TyElem)
354
+ constexpr size_t _align = (Hint > 16) ? 16 : Hint; // hints above 16 are clamped to 16
355
+
356
+ details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, count);
357
+ }
358
+
359
+ // byte-wide API with full alignment deduction required: void pointers carry no alignment information, so the hint is 1
360
+ template <class TyGroup, typename TyElem, typename TySize, enable_if_void<TyElem> = nullptr,
361
+ enable_if_integral<TySize> = nullptr> // participates only when TyElem is void and the count type is integral
362
+ _CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
363
+ const TyElem *__restrict__ _src, const TySize& count) { // count is in bytes
364
+ details::_memcpy_async_dispatch_to_aligned_copy<1>(group, _dst, _src, count); // the generic dispatcher then derives a usable alignment from the addresses at run time
365
+ }
366
+
367
+ // 1d-datum API: counts are numbers of TyElem elements; the smaller of the two bounds the copy
368
+ template <class TyGroup, typename TyElem, size_t Hint = alignof(TyElem)>
369
+ _CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const size_t dstCount,
370
+ const TyElem *__restrict__ src, const size_t srcCount) {
371
+ constexpr unsigned int _align = Hint; // not clamped to 16 here, unlike _memcpy_async_bytes
372
+ const size_t totalCount = min(dstCount, srcCount) * sizeof(TyElem); // converted to bytes for the byte-wide dispatcher
373
+
374
+ details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
375
+ }
376
+
377
+ // 1d-datum API using aligned_size_t: both counts must carry the same Hint, which is used as the alignment
378
+ template <class TyGroup, typename TyElem, template <size_t> typename Alignment, size_t Hint>
379
+ _CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const Alignment<Hint> &dstCount,
380
+ const TyElem *__restrict__ src, const Alignment<Hint> &srcCount) {
381
+ constexpr unsigned int _align = Hint; // not clamped to 16 here, unlike _memcpy_async_bytes
382
+ const size_t totalCount = min((size_t)dstCount, (size_t)srcCount) * sizeof(TyElem); // smaller element count, converted to bytes
383
+
384
+ details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
385
+ }
386
+
387
+ } // namespace details
388
+
389
+ /*
390
+ * Group submit batch of async-copy to cover contiguous 1D array
391
+ * and commit that batch to eventually wait for completion.
392
+ */
393
+ template <class TyGroup, typename TyElem, typename TySizeT>
394
+ _CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ _dst, const TyElem *__restrict__ _src,
395
+ const TySizeT &count) { // count is passed to the byte-wide implementation; its type selects the _memcpy_async_bytes overload
396
+ details::_memcpy_async_bytes(group, _dst, _src, count);
397
+ __pipeline_commit(); // commits the copies issued above; completion is awaited separately via wait() / wait_prior()
398
+ }
399
+
400
+ /*
401
+ * Group submit batch of async-copy to cover contiguous 1D array
402
+ * and commit that batch to eventually wait for completion.
403
+ * Object counts are in datum sized chunks, not bytes.
404
+ */
405
+ template <class TyGroup, class TyElem, typename DstLayout, typename SrcLayout>
406
+ _CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ dst, const DstLayout &dstLayout,
407
+ const TyElem *__restrict__ src, const SrcLayout &srcLayout) { // the layout types select the _memcpy_async_datum overload
408
+ details::_memcpy_async_datum(group, dst, dstLayout, src, srcLayout); // copies the smaller of the two element counts
409
+ __pipeline_commit(); // commits the copies issued above; completion is awaited separately via wait() / wait_prior()
410
+ }
411
+
412
+ /* Group wait for prior Nth stage of memcpy_async to complete. */
413
+ template <unsigned int Stage, class TyGroup>
414
+ _CG_STATIC_QUALIFIER void wait_prior(const TyGroup &group) {
415
+ __pipeline_wait_prior(Stage); // Stage is a compile-time constant passed straight to the pipeline primitive
416
+ group.sync(); // the group is synchronized after the calling thread's own wait returns
417
+ }
418
+
419
+ /* Group wait for all previously submitted memcpy_async to complete. */
420
+ template <class TyGroup>
421
+ _CG_STATIC_QUALIFIER void wait(const TyGroup &group) {
422
+ __pipeline_wait_prior(0); // same call as wait_prior<0>
423
+ group.sync(); // the group is synchronized after the calling thread's own wait returns
424
+ }
425
+
426
+ /***************** CG APIs including pipeline are deprecated *****************/
427
+
428
+ /* Group submit batch of async-copy to cover a contiguous 1D array
429
+ to a pipeline and commit the batch. Deprecated overload taking an explicit pipeline object. */
430
+ template <class TyGroup, class TyElem>
431
+ _CG_DEPRECATED _CG_STATIC_QUALIFIER void memcpy_async(TyGroup &group, TyElem *dst, size_t dstCount, const TyElem *src, size_t srcCount,
432
+ nvcuda::experimental::pipeline &pipe) { // counts are numbers of TyElem elements
433
+ details::_memcpy_async_datum(group, dst, dstCount, src, srcCount); // same datum path as the non-deprecated overload
434
+ pipe.commit(); // commit goes through the caller's pipeline object instead of __pipeline_commit()
435
+ }
436
+
437
+ /* Group wait for prior Nth stage of memcpy_async to complete. Deprecated overload taking an explicit pipeline object. */
438
+ template <unsigned int Stage, class TyGroup>
439
+ _CG_DEPRECATED _CG_STATIC_QUALIFIER void wait_prior(TyGroup &group, nvcuda::experimental::pipeline &pipe) {
440
+ pipe.wait_prior<Stage>(); // the wait is delegated to the caller's pipeline object
441
+ group.sync(); // the group is synchronized after the calling thread's own wait returns
442
+ }
443
+
444
+ /* Group wait for stage-S of memcpy_async to complete. Deprecated overload taking an explicit pipeline object; here the stage is a runtime argument. */
445
+ template <class TyGroup>
446
+ _CG_DEPRECATED _CG_STATIC_QUALIFIER void wait(TyGroup &group, nvcuda::experimental::pipeline &pipe, size_t stage) {
447
+ pipe.wait(stage); // the wait is delegated to the caller's pipeline object
448
+ group.sync(); // the group is synchronized after the calling thread's own wait returns
449
+ }
450
+ _CG_END_NAMESPACE
451
+
452
+ #endif // _CG_ASYNC_H
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_COALESCED_REDUCE_H_
50
+ #define _CG_COALESCED_REDUCE_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "cooperative_groups.h"
55
+ #include "partitioning.h"
56
+ #include "coalesced_scan.h"
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <typename TyVal, typename TyOp, unsigned int TySize, typename ParentT> // reduction over a single-warp tile using XOR shuffles
63
+ _CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group,
64
+ TyVal&& val,
65
+ TyOp&& op) -> decltype(op(val, val)) {
66
+ auto out = val; // running partial result, starting from this thread's own value
67
+ for (int mask = TySize >> 1; mask > 0; mask >>= 1) { // lane distance halves each step: TySize/2, TySize/4, ..., 1
68
+ out = op(out, group.shfl_xor(out, mask)); // combine with the partner lane's partial result; presumably op is expected to be associative and commutative - TODO confirm
69
+ }
70
+
71
+ return out;
72
+ }
73
+
74
+ template <typename TyVal, typename TyOp> // reduction over a coalesced_group, which may cover any subset of the warp's lanes
75
+ _CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
76
+ if (group.size() == 32) {
77
+ // Full coalesced group can go through faster path by being treated as a tile of size 32
78
+ auto tile = details::tiled_partition_internal<32, void>();
79
+ return coalesced_reduce(tile, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
80
+ }
81
+ else { // partial group: run an inclusive scan, then hand the last member's scan value to every member
82
+ auto scan_result =
83
+ inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
84
+ unsigned int group_mask = _coalesced_group_data_access::get_mask(group);
85
+ unsigned int last_thread_id = 31 - __clz(group_mask); // index of the highest set bit in the group's mask
86
+ return details::tile::shuffle_dispatch<TyVal>::shfl(
87
+ _CG_STL_NAMESPACE::forward<TyVal>(scan_result), group_mask, last_thread_id, 32); // every member reads the value held by that lane
88
+ }
89
+ }
90
+
91
+ } // details
92
+
93
+ _CG_END_NAMESPACE
94
+
95
+ #endif // _CG_COALESCED_REDUCE_H_
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_scan.h ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_COALESCED_SCAN_H_
50
+ #define _CG_COALESCED_SCAN_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "cooperative_groups.h"
55
+ #include "partitioning.h"
56
+ #include "functional.h"
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <typename TyGroup, typename TyVal, typename TyOp> // inclusive scan using shfl_up with a doubling distance
63
+ _CG_QUALIFIER auto inclusive_scan_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
64
+ auto out = val; // running prefix, starting from this thread's own value
65
+ for (int mask = 1; mask < group.size(); mask <<= 1) { // distance doubles each step: 1, 2, 4, ...
66
+ auto tmp = group.shfl_up(out, mask); // executed by every thread, including those that discard the result
67
+ if (mask <= group.thread_rank()) { // only threads that have a source `mask` ranks below them fold the value in
68
+ out = op(out, tmp);
69
+ }
70
+ }
71
+
72
+ return out;
73
+ }
74
+
75
+ template <typename TyGroup, typename TyVal, typename TyOp> // inclusive scan for a group whose member lanes need not be adjacent: each step locates the source lane from the group's lane mask
76
+ _CG_QUALIFIER auto inclusive_scan_non_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
77
+ const unsigned int groupSize = group.size();
78
+ auto out = val; // running prefix, starting from this thread's own value
79
+
80
+ const unsigned int mask = details::_coalesced_group_data_access::get_mask(group); // lane mask of the group's members
81
+ unsigned int lanemask = details::lanemask32_lt() & mask; // presumably the member lanes below the calling lane - confirm against lanemask32_lt
82
+ unsigned int srcLane = details::laneid(); // starts at the caller's own lane and is moved downward each step
83
+
84
+ const unsigned int base = __ffs(mask)-1; /* lane with rank == 0 */
85
+ const unsigned int rank = __popc(lanemask); // count of set bits in lanemask, used as this thread's rank
86
+
87
+ for (unsigned int i = 1, j = 1; i < groupSize; i <<= 1) { // i is the rank distance for this step; j carries the previous step's distance
88
+ if (i <= rank) { // only threads with a source i ranks below them search for a lane
89
+ srcLane -= j;
90
+ j = i; /* maximum possible lane */
91
+
92
+ unsigned int begLane = base + rank - i; /* minimum possible lane */
93
+
94
+ /* Next source lane is in the range [ begLane .. srcLane ]
95
+ * If begLane < srcLane then do a binary search.
96
+ */
97
+ while (begLane < srcLane) {
98
+ const unsigned int halfLane = (begLane + srcLane) >> 1;
99
+ const unsigned int halfMask = lanemask >> halfLane;
100
+ const unsigned int d = __popc(halfMask); // set bits of lanemask at or above halfLane
101
+ if (d < i) {
102
+ srcLane = halfLane - 1; /* halfLane too large */
103
+ }
104
+ else if ((i < d) || !(halfMask & 0x01)) {
105
+ begLane = halfLane + 1; /* halfLane too small */
106
+ }
107
+ else {
108
+ begLane = srcLane = halfLane; /* happen to hit */
109
+ }
110
+ }
111
+ }
112
+
113
+ auto tmp = details::tile::shuffle_dispatch<TyVal>::shfl(out, mask, srcLane, 32); // issued by every thread, outside the rank check
114
+ if (i <= rank) { // threads without a source discard tmp
115
+ out = op(out, tmp);
116
+ }
117
+ }
118
+ return out;
119
+ }
120
+
121
+ template <unsigned int TySize, typename ParentT, typename TyVal, typename TyOp> // single-warp tile overload: always takes the contiguous scan
122
+ _CG_QUALIFIER auto coalesced_inclusive_scan(const __single_warp_thread_block_tile<TySize, ParentT>& group,
123
+ TyVal&& val,
124
+ TyOp&& op) -> decltype(op(val, val)) {
125
+ return inclusive_scan_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
126
+ }
127
+
128
+ template <typename TyVal, typename TyOp> // coalesced_group overload: picks the scan variant from the group size
129
+ _CG_QUALIFIER auto coalesced_inclusive_scan(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
130
+ if (group.size() == 32) { // a group of 32 is scanned with the contiguous shfl_up variant
131
+ return inclusive_scan_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
132
+ }
133
+ else { // any smaller group uses the lane-mask search variant
134
+ return inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
135
+ }
136
+ }
137
+
138
+ template <bool IntegralOptimized> // strategy selector for turning an inclusive scan result into an exclusive one
139
+ struct scan_choose_convertion; // declared only; the two specializations below cover both values
140
+
141
+ template<>
142
+ struct scan_choose_convertion<true> { // arithmetic shortcut: no shuffle needed
143
+ template <typename TyGroup, typename TyRes, typename TyVal>
144
+ _CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) {
145
+ return result - val; // removes this thread's own contribution; group is unused on this path
146
+ }
147
+ };
148
+
149
+ template<>
150
+ struct scan_choose_convertion<false> { // general case: take the inclusive result of the thread one rank below
151
+ template <typename TyGroup, typename TyRes, typename TyVal>
152
+ _CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) {
153
+ auto ret = group.shfl_up(result, 1); // executed by every thread, including rank 0; val is unused on this path
154
+ if (group.thread_rank() == 0) {
155
+ return {}; // rank 0 has no predecessor: value-initialized result
156
+ }
157
+ else {
158
+ return ret;
159
+ }
160
+ }
161
+ };
162
+
163
+ template <typename TyGroup, typename TyRes, typename TyVal, typename TyFn> // picks the conversion strategy at compile time; op is used only for its type
164
+ _CG_QUALIFIER auto convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
165
+ using conversion = scan_choose_convertion<_CG_STL_NAMESPACE::is_same<remove_qual<TyFn>, cooperative_groups::plus<remove_qual<TyVal>>>::value
166
+ && _CG_STL_NAMESPACE::is_integral<remove_qual<TyVal>>::value>; // subtraction shortcut only for cooperative_groups::plus on an integral value type; everything else shuffles
167
+ return conversion::convert_inclusive_to_exclusive(group, result, _CG_STL_NAMESPACE::forward<TyVal>(val));
168
+ }
169
+
170
+ } // details
171
+
172
+ _CG_END_NAMESPACE
173
+
174
+ #endif // _CG_COALESCED_SCAN_H_
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/driver_abi.h ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_DRIVER_API_H
50
+ #define _CG_DRIVER_API_H
51
+
52
+ #include "info.h"
53
+
54
+ _CG_BEGIN_NAMESPACE
55
+
56
+ namespace details {
57
+ template <unsigned int RegId>
58
+ _CG_QUALIFIER unsigned int load_env_reg() {
59
+ // Abort by default
60
+ _CG_ABORT();
61
+ return 0;
62
+ }
63
+
64
+ template <unsigned int HiReg, unsigned int LoReg>
65
+ _CG_QUALIFIER unsigned long long load_env_reg64() {
66
+ unsigned long long registerLo = load_env_reg<LoReg>();
67
+ unsigned long long registerHi = load_env_reg<HiReg>();
68
+
69
+ return (registerHi << 32) | registerLo;
70
+ }
71
+
72
+ // inline PTX for accessing registers requires an immediate for the special reg
73
+ # define LOAD_ENVREG(NUMBER) \
74
+ template <> _CG_QUALIFIER unsigned int load_env_reg<NUMBER>() { \
75
+ unsigned int r; \
76
+ asm ("mov.u32 %0, %%envreg" #NUMBER ";" : "=r"(r)); \
77
+ return r; \
78
+ }
79
+
80
+ // Instantiate loaders for registers used
81
+ LOAD_ENVREG(0);
82
+ LOAD_ENVREG(1);
83
+ LOAD_ENVREG(2);
84
+ # undef LOAD_ENVREG
85
+
86
// Layout that get_grid_workspace() imposes on the address read from the environment registers.
// NOTE(review): field meanings are inferred from their names only (a workspace size and a
// barrier word) — confirm against the driver ABI before relying on them.
struct grid_workspace {
    unsigned int wsSize;
    unsigned int barrier;
};
90
+
91
+ _CG_QUALIFIER grid_workspace* get_grid_workspace() {
92
+ unsigned long long gridWsAbiAddress = load_env_reg64<1, 2>();
93
+ // Interpret the address from envreg 1 and 2 as the driver's grid workspace
94
+ return (reinterpret_cast<grid_workspace*>(gridWsAbiAddress));
95
+ }
96
+ }
97
+ _CG_END_NAMESPACE
98
+
99
+ #endif // _CG_DRIVER_API_H
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_FUNCTIONAL_H
50
+ #define _CG_FUNCTIONAL_H
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES
56
+ #ifdef _CG_USE_CUDA_STL
57
+ # include <cuda/std/functional>
58
+ #endif
59
+
60
+ _CG_BEGIN_NAMESPACE
61
+
62
+ namespace details {
63
+ #ifdef _CG_USE_CUDA_STL
64
+ using cuda::std::plus;
65
+ using cuda::std::bit_and;
66
+ using cuda::std::bit_xor;
67
+ using cuda::std::bit_or;
68
+ #else
69
+ template <typename Ty> struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}};
70
+ template <typename Ty> struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}};
71
+ template <typename Ty> struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}};
72
+ template <typename Ty> struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}};
73
+ #endif // _CG_USE_PLATFORM_STL
74
+ } // details
75
+
76
+ template <typename Ty>
77
+ struct plus : public details::plus<Ty> {};
78
+
79
+ template <typename Ty>
80
+ struct less {
81
+ __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
82
+ return (arg2 < arg1) ? arg2 : arg1;
83
+ }
84
+ };
85
+
86
+ template <typename Ty>
87
+ struct greater {
88
+ __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
89
+ return (arg1 < arg2) ? arg2 : arg1;
90
+ }
91
+ };
92
+
93
+ template <typename Ty>
94
+ struct bit_and : public details::bit_and<Ty> {};
95
+
96
+ template <typename Ty>
97
+ struct bit_xor : public details::bit_xor<Ty> {};
98
+
99
+ template <typename Ty>
100
+ struct bit_or : public details::bit_or<Ty> {};
101
+
102
+ #if defined(_CG_HAS_STL_ATOMICS)
103
+ namespace details {
104
+ template <class Ty>
105
+ using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant<bool,
106
+ _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>;
107
+
108
+ template <typename TyOp> struct _atomic_op_supported : public _CG_STL_NAMESPACE::false_type {};
109
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::plus<Ty>> : public _atomic_is_type_supported<Ty> {};
110
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::less<Ty>> : public _atomic_is_type_supported<Ty> {};
111
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::greater<Ty>> : public _atomic_is_type_supported<Ty> {};
112
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_and<Ty>> : public _atomic_is_type_supported<Ty> {};
113
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_or<Ty>> : public _atomic_is_type_supported<Ty> {};
114
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_xor<Ty>> : public _atomic_is_type_supported<Ty> {};
115
+
116
+ template<typename TyAtomic, typename TyVal, typename TyOp>
117
+ _CG_QUALIFIER remove_qual<TyVal> atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) {
118
+ auto old = atomic.load(cuda::std::memory_order_relaxed);
119
+ while(!atomic.compare_exchange_weak(old, op(old, val), cuda::std::memory_order_relaxed));
120
+ return old;
121
+ }
122
+
123
+ template<typename TyOp>
124
+ struct op_picker;
125
+
126
+ template<typename TyVal>
127
+ struct op_picker<cooperative_groups::plus<TyVal>> {
128
+ template<typename TyAtomic>
129
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
130
+ return atomic.fetch_add(val, cuda::std::memory_order_relaxed);
131
+ }
132
+ };
133
+
134
+ template<typename TyVal>
135
+ struct op_picker<cooperative_groups::less<TyVal>> {
136
+ template<typename TyAtomic>
137
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
138
+ return atomic.fetch_min(val, cuda::std::memory_order_relaxed);
139
+ }
140
+ };
141
+
142
+ template<typename TyVal>
143
+ struct op_picker<cooperative_groups::greater<TyVal>> {
144
+ template<typename TyAtomic>
145
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
146
+ return atomic.fetch_max(val, cuda::std::memory_order_relaxed);
147
+ }
148
+ };
149
+
150
+ template<typename TyVal>
151
+ struct op_picker<cooperative_groups::bit_and<TyVal>> {
152
+ template<typename TyAtomic>
153
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
154
+ return atomic.fetch_and(val, cuda::std::memory_order_relaxed);
155
+ }
156
+ };
157
+
158
+ template<typename TyVal>
159
+ struct op_picker<cooperative_groups::bit_xor<TyVal>> {
160
+ template<typename TyAtomic>
161
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
162
+ return atomic.fetch_xor(val, cuda::std::memory_order_relaxed);
163
+ }
164
+ };
165
+
166
+ template<typename TyVal>
167
+ struct op_picker<cooperative_groups::bit_or<TyVal>> {
168
+ template<typename TyAtomic>
169
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
170
+ return atomic.fetch_or(val, cuda::std::memory_order_relaxed);
171
+ }
172
+ };
173
+
174
// Selects how an atomic update is carried out; the two explicit specializations
// (true / false) provide the actual atomic_update implementations.
template<bool atomic_supported>
struct atomic_update_dispatch {
};
176
+
177
+ template<>
178
+ struct atomic_update_dispatch<false> {
179
+ template<typename TyAtomic, typename TyVal, typename TyOp>
180
+ _CG_STATIC_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
181
+ return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
182
+ }
183
+ };
184
+
185
+ template<>
186
+ struct atomic_update_dispatch<true> {
187
+ template<typename TyAtomic, typename TyVal, typename TyOp>
188
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) {
189
+ using dispatch = op_picker<details::remove_qual<TyOp>>;
190
+
191
+ return dispatch::atomic_update(atomic, val);
192
+ }
193
+ };
194
+
195
+ template<typename TyAtomic, typename TyVal, typename TyOp>
196
+ _CG_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
197
+ using dispatch = atomic_update_dispatch<_atomic_op_supported<details::remove_qual<TyOp>>::value>;
198
+
199
+ return dispatch::atomic_update(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
200
+ }
201
+
202
+ template<typename TyAtomic, typename TyVal>
203
+ _CG_QUALIFIER void atomic_store(TyAtomic& atomic, TyVal&& val) {
204
+ atomic.store(val, cuda::std::memory_order_relaxed);
205
+ }
206
+ }
207
+ #endif
208
+
209
+ _CG_END_NAMESPACE
210
+
211
+ #endif
212
+ #endif //_CG_FUNCTIONAL_H
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h ADDED
@@ -0,0 +1,693 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_HELPERS_H_
50
+ # define _COOPERATIVE_GROUPS_HELPERS_H_
51
+
52
+ #include "info.h"
53
+ #include "sync.h"
54
+
55
+ _CG_BEGIN_NAMESPACE
56
+
57
+ namespace details {
58
+ #ifdef _CG_CPP11_FEATURES
59
+ template <typename Ty> struct _is_float_or_half : public _CG_STL_NAMESPACE::is_floating_point<Ty> {};
60
+ # ifdef _CG_HAS_FP16_COLLECTIVE
61
+ template <> struct _is_float_or_half<__half> : public _CG_STL_NAMESPACE::true_type {};
62
+ template <> struct _is_float_or_half<__half2> : public _CG_STL_NAMESPACE::true_type {};
63
+ # endif
64
+ template <typename Ty>
65
+ using is_float_or_half = _is_float_or_half<typename _CG_STL_NAMESPACE::remove_cv<Ty>::type>;
66
+
67
+ // Non-STL utility templates
68
+ template <typename Ty>
69
+ using remove_qual = typename _CG_STL_NAMESPACE::remove_cv<typename _CG_STL_NAMESPACE::remove_reference<Ty>::type>::type;
70
+
71
+ template <typename TyLhs, typename TyRhs>
72
+ using is_op_type_same = _CG_STL_NAMESPACE::is_same<remove_qual<TyLhs>, remove_qual<TyRhs>
73
+ >;
74
+ #endif
75
+
76
+ template <typename TyTrunc>
77
+ _CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) {
78
+ return ((TyTrunc)index.z * nIndex.y * nIndex.x) +
79
+ ((TyTrunc)index.y * nIndex.x) +
80
+ (TyTrunc)index.x;
81
+ }
82
+
83
+ namespace cta {
84
+
85
+ _CG_STATIC_QUALIFIER void sync()
86
+ {
87
+ __barrier_sync(0);
88
+ }
89
+
90
+ _CG_STATIC_QUALIFIER unsigned int num_threads()
91
+ {
92
+ return static_cast<unsigned int>(blockDim.x * blockDim.y * blockDim.z);
93
+ }
94
+
95
+ _CG_STATIC_QUALIFIER unsigned int thread_rank()
96
+ {
97
+ return vec3_to_linear<unsigned int>(threadIdx, blockDim);
98
+ }
99
+
100
+ _CG_STATIC_QUALIFIER dim3 group_index()
101
+ {
102
+ return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
103
+ }
104
+
105
+ _CG_STATIC_QUALIFIER dim3 thread_index()
106
+ {
107
+ return dim3(threadIdx.x, threadIdx.y, threadIdx.z);
108
+ }
109
+
110
+ _CG_STATIC_QUALIFIER dim3 dim_threads()
111
+ {
112
+ return dim3(blockDim.x, blockDim.y, blockDim.z);
113
+ }
114
+
115
+ // Legacy aliases
116
+ _CG_STATIC_QUALIFIER unsigned int size()
117
+ {
118
+ return num_threads();
119
+ }
120
+
121
+ _CG_STATIC_QUALIFIER dim3 block_dim()
122
+ {
123
+ return dim_threads();
124
+ }
125
+
126
+ };
127
+
128
+ class _coalesced_group_data_access {
129
+ public:
130
+ // Retrieve mask of coalesced groups and tiles
131
+ template <typename TyGroup>
132
+ _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) {
133
+ return group.get_mask();
134
+ }
135
+
136
+ template <typename TyGroup>
137
+ _CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) {
138
+ return TyGroup(mask);
139
+ }
140
+
141
+ template <typename TyGroup>
142
+ _CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) {
143
+ group._data.coalesced.metaGroupRank = mgRank;
144
+ group._data.coalesced.metaGroupSize = mgSize;
145
+ }
146
+ };
147
+
148
+ namespace tile {
149
+ template <unsigned int TileCount, unsigned int TileMask, unsigned int LaneMask, unsigned int ShiftCount>
150
+ struct _tile_helpers{
151
+ _CG_STATIC_CONST_DECL unsigned int tileCount = TileCount;
152
+ _CG_STATIC_CONST_DECL unsigned int tileMask = TileMask;
153
+ _CG_STATIC_CONST_DECL unsigned int laneMask = LaneMask;
154
+ _CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount;
155
+ };
156
+
157
+ template <unsigned int> struct tile_helpers;
158
+ template <> struct tile_helpers<32> : public _tile_helpers<1, 0xFFFFFFFF, 0x1F, 5> {};
159
+ template <> struct tile_helpers<16> : public _tile_helpers<2, 0x0000FFFF, 0x0F, 4> {};
160
+ template <> struct tile_helpers<8> : public _tile_helpers<4, 0x000000FF, 0x07, 3> {};
161
+ template <> struct tile_helpers<4> : public _tile_helpers<8, 0x0000000F, 0x03, 2> {};
162
+ template <> struct tile_helpers<2> : public _tile_helpers<16, 0x00000003, 0x01, 1> {};
163
+ template <> struct tile_helpers<1> : public _tile_helpers<32, 0x00000001, 0x00, 0> {};
164
+
165
+ #ifdef _CG_CPP11_FEATURES
166
+ namespace shfl {
167
+ /***********************************************************************************
168
+ * Recursively Sliced Shuffle
169
+ * Purpose:
170
+ * Slices an input type a number of times into integral types so that shuffles
171
+ * are well defined
172
+ * Expectations:
173
+ * This object *should not* be used from a reinterpret_cast pointer unless
174
+ * some alignment guarantees can be met. Use a memcpy to guarantee that loads
175
+ * from the integral types stored within are aligned and correct.
176
+ **********************************************************************************/
177
+ template <unsigned int count, bool intSized = (count <= sizeof(int))>
178
+ struct recursive_sliced_shuffle_helper;
179
+
180
+ template <unsigned int count>
181
+ struct recursive_sliced_shuffle_helper<count, true> {
182
+ int val;
183
+
184
+ template <typename TyFn>
185
+ _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
186
+ val = shfl(val);
187
+ }
188
+ };
189
+
190
+ template <unsigned int count>
191
+ struct recursive_sliced_shuffle_helper<count, false> {
192
+ int val;
193
+ recursive_sliced_shuffle_helper<count - sizeof(int)> next;
194
+
195
+ template <typename TyFn>
196
+ _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
197
+ val = shfl(val);
198
+ next.invoke_shuffle(shfl);
199
+ }
200
+ };
201
+ }
202
+
203
+ struct _memory_shuffle {
204
+ template <typename TyElem, typename TyShflFn>
205
+ _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
206
+ static_assert(sizeof(TyElem) <= 32, "Cooperative groups collectives are limited to types smaller than 32B");
207
+ return TyElem{};
208
+ }
209
+
210
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
211
+ _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
212
+ auto shfl = [=](int val) -> int {
213
+ return 0;
214
+ };
215
+
216
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
217
+ }
218
+
219
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
220
+ _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
221
+ auto shfl = [=](int val) -> int {
222
+ return 0;
223
+ };
224
+
225
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
226
+ }
227
+
228
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
229
+ _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
230
+ auto shfl = [=](int val) -> int {
231
+ return 0;
232
+ };
233
+
234
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
235
+ }
236
+
237
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
238
+ _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
239
+ auto shfl = [=](int val) -> int {
240
+ return 0;
241
+ };
242
+
243
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
244
+ }
245
+ };
246
+
247
+ /***********************************************************************************
248
+ * Intrinsic Device Function Shuffle
249
+ * Purpose:
250
+ * Uses a shuffle helper that has characteristics best suited for moving
251
+ * elements between threads
252
+ * Expectations:
253
+ * Object given will be forced into an l-value type so that it can be used
254
+ * with a helper structure that reinterprets the data into intrinsic compatible
255
+ * types
256
+ * Notes:
257
+ * !! TyRet is required so that objects are returned by value and not as
258
+ * dangling references depending on the value category of the passed object
259
+ **********************************************************************************/
260
+ struct _intrinsic_compat_shuffle {
261
+ template <unsigned int count>
262
+ using shfl_helper = shfl::recursive_sliced_shuffle_helper<count>;
263
+
264
+ template <typename TyElem, typename TyShflFn>
265
+ _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
266
+ static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle");
267
+ shfl_helper<sizeof(TyElem)> helper;
268
+ memcpy(&helper, &elem, sizeof(TyElem));
269
+ helper.invoke_shuffle(fn);
270
+ memcpy(&elem, &helper, sizeof(TyElem));
271
+ return elem;
272
+ }
273
+
274
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
275
+ _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
276
+ auto shfl = [=](int val) -> int {
277
+ return __shfl_sync(gMask, val, srcRank, threads);
278
+ };
279
+
280
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
281
+ }
282
+
283
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
284
+ _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
285
+ auto shfl = [=](int val) -> int {
286
+ return __shfl_down_sync(gMask, val, delta, threads);
287
+ };
288
+
289
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
290
+ }
291
+
292
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
293
+ _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
294
+ auto shfl = [=](int val) -> int {
295
+ return __shfl_up_sync(gMask, val, delta, threads);
296
+ };
297
+
298
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
299
+ }
300
+
301
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
302
+ _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
303
+ auto shfl = [=](int val) -> int {
304
+ return __shfl_xor_sync(gMask, val, lMask, threads);
305
+ };
306
+
307
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
308
+ }
309
+ };
310
+
311
+ struct _native_shuffle {
312
+ template <typename TyElem>
313
+ _CG_STATIC_QUALIFIER TyElem shfl(
314
+ TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
315
+ return static_cast<TyElem>(__shfl_sync(gMask, elem, srcRank, threads));
316
+ }
317
+
318
+ template <typename TyElem>
319
+ _CG_STATIC_QUALIFIER TyElem shfl_down(
320
+ TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
321
+ return static_cast<TyElem>(__shfl_down_sync(gMask, elem, delta, threads));
322
+ }
323
+
324
+ template <typename TyElem>
325
+ _CG_STATIC_QUALIFIER TyElem shfl_up(
326
+ TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
327
+ return static_cast<TyElem>(__shfl_up_sync(gMask, elem, delta, threads));
328
+ }
329
+
330
+ template <typename TyElem>
331
+ _CG_STATIC_QUALIFIER TyElem shfl_xor(
332
+ TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
333
+ return static_cast<TyElem>(__shfl_xor_sync(gMask, elem, lMask, threads));
334
+ }
335
+ };
336
+
337
+ // Almost all arithmetic types are supported by native shuffle
338
+ // Vector types are the exception
339
+ template <typename TyElem>
340
+ using use_native_shuffle = _CG_STL_NAMESPACE::integral_constant<
341
+ bool,
342
+ _CG_STL_NAMESPACE::is_integral<
343
+ remove_qual<TyElem>>::value ||
344
+ details::is_float_or_half<
345
+ remove_qual<TyElem>>::value
346
+ >;
347
+
348
+ constexpr unsigned long long _MemoryShuffleCutoff = 32;
349
+
350
+ template <typename TyElem,
351
+ bool IsNative = use_native_shuffle<TyElem>::value,
352
+ bool InMem = (sizeof(TyElem) > _MemoryShuffleCutoff)>
353
+ struct shuffle_dispatch;
354
+
355
+ template <typename TyElem>
356
+ struct shuffle_dispatch<TyElem, true, false> : public _native_shuffle {};
357
+
358
+ template <typename TyElem>
359
+ struct shuffle_dispatch<TyElem, false, false> : public _intrinsic_compat_shuffle {};
360
+
361
+ template <typename TyElem>
362
+ struct shuffle_dispatch<TyElem, false, true> : public _memory_shuffle {};
363
+
364
+ #endif //_CG_CPP11_FEATURES
365
+ };
366
+
367
+ namespace multi_grid {
368
+ struct multi_grid_functions;
369
+ };
370
+
371
+ namespace grid {
372
+ _CG_STATIC_QUALIFIER unsigned int barrier_arrive(unsigned int *bar) {
373
+ return details::sync_grids_arrive(bar);
374
+ }
375
+
376
+ _CG_STATIC_QUALIFIER void barrier_wait(unsigned int token, unsigned int *bar) {
377
+ details::sync_grids_wait(token, bar);
378
+ }
379
+
380
+ _CG_STATIC_QUALIFIER void sync(unsigned int *bar) {
381
+ unsigned int token = details::sync_grids_arrive(bar);
382
+ details::sync_grids_wait(token, bar);
383
+ }
384
+
385
+ _CG_STATIC_QUALIFIER unsigned long long num_blocks()
386
+ {
387
+ // grid.y * grid.z -> [max(65535) * max(65535)] fits within 4b, promote after multiplication
388
+ // grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)] exceeds 4b, promote before multiplication
389
+ return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z);
390
+ }
391
+
392
+ _CG_STATIC_QUALIFIER unsigned long long num_threads()
393
+ {
394
+ return num_blocks() * cta::num_threads();
395
+ }
396
+
397
+ _CG_STATIC_QUALIFIER unsigned long long block_rank()
398
+ {
399
+ return vec3_to_linear<unsigned long long>(blockIdx, gridDim);
400
+ }
401
+
402
+ _CG_STATIC_QUALIFIER unsigned long long thread_rank()
403
+ {
404
+ return block_rank() * cta::num_threads() + cta::thread_rank();
405
+ }
406
+
407
+ _CG_STATIC_QUALIFIER dim3 dim_blocks()
408
+ {
409
+ return dim3(gridDim.x, gridDim.y, gridDim.z);
410
+ }
411
+
412
+ _CG_STATIC_QUALIFIER dim3 block_index()
413
+ {
414
+ return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
415
+ }
416
+
417
+ _CG_STATIC_QUALIFIER dim3 dim_threads()
418
+ {
419
+ return dim3(gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z);
420
+ }
421
+
422
+ _CG_STATIC_QUALIFIER dim3 thread_index()
423
+ {
424
+ return dim3(blockIdx.x * blockDim.x + threadIdx.x,
425
+ blockIdx.y * blockDim.y + threadIdx.y,
426
+ blockIdx.z * blockDim.z + threadIdx.z);
427
+ }
428
+
429
+ #if defined(_CG_HAS_CLUSTER_GROUP)
430
+ _CG_STATIC_QUALIFIER dim3 dim_clusters() {
431
+ return __clusterGridDimInClusters();
432
+ }
433
+
434
+ // Total number of clusters in the grid: the product of the three extents returned by dim_clusters().
+ // The first operand is widened to 64 bits so the whole product is evaluated in 64-bit arithmetic
+ // and cannot wrap at 32 bits, in line with the promotion grid::num_blocks() performs.
+ _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
435
+ const dim3 dimClusters = dim_clusters();
436
+ return (unsigned long long)dimClusters.x * dimClusters.y * dimClusters.z;
437
+ }
438
+
439
+ _CG_STATIC_QUALIFIER dim3 cluster_index() {
440
+ return __clusterIdx();
441
+ }
442
+
443
+ _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
444
+ return vec3_to_linear<unsigned long long>(cluster_index(), dim_clusters());
445
+ }
446
+ #endif
447
+
448
+ // Legacy aliases
449
+ _CG_STATIC_QUALIFIER unsigned long long size()
450
+ {
451
+ return num_threads();
452
+ }
453
+
454
+ _CG_STATIC_QUALIFIER dim3 grid_dim()
455
+ {
456
+ return dim_blocks();
457
+ }
458
+ };
459
+
460
+
461
+ #if defined(_CG_HAS_MULTI_GRID_GROUP)
462
+
463
+ namespace multi_grid {
464
+ _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
465
+ {
466
+ #if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
467
+ //this function is defined in device runtime library
468
+ //which requires separate compilation mode (__CUDACC_RDC__)
469
+ //or extended whole program mode (__CUDACC_EWP__)
470
+ return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
471
+ #else /* !(__CUDACC_RDC__ || __CUDACC_EWP__) */
472
+ return 0;
473
+ #endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
474
+ }
475
+
476
+ // Synchronizes on the multi-grid handle via cudaCGSynchronize(handle, 0).
+ // Compiles to an empty body unless __CUDACC_RDC__ or __CUDACC_EWP__ is defined.
+ _CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
477
+ {
478
+ #if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
479
+ //this function is defined in device runtime library
480
+ //which requires separate compilation mode (__CUDACC_RDC__)
481
+ //or extended whole program mode (__CUDACC_EWP__)
482
+ (void)cudaCGSynchronize(handle, 0); // status deliberately discarded: this function returns void
483
+ #endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
484
+ }
485
+
486
+ _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
487
+ {
488
+ unsigned int numThreads = 0;
489
+ #if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
490
+ //this function is defined in device runtime library
491
+ //which requires separate compilation mode (__CUDACC_RDC__)
492
+ //or extended whole program mode (__CUDACC_EWP__)
493
+ cudaCGGetSize(&numThreads, NULL, handle);
494
+ #endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
495
+ return numThreads;
496
+ }
497
+
498
+ _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
499
+ {
500
+ unsigned int threadRank = 0;
501
+ #if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
502
+ //this function is defined in device runtime library
503
+ //which requires separate compilation mode (__CUDACC_RDC__)
504
+ //or extended whole program mode (__CUDACC_EWP__)
505
+ cudaCGGetRank(&threadRank, NULL, handle);
506
+ #endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
507
+ return threadRank;
508
+ }
509
+
510
+ _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle)
511
+ {
512
+ unsigned int gridRank = 0;
513
+ #if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
514
+ //this function is defined in device runtime library
515
+ //which requires separate compilation mode (__CUDACC_RDC__)
516
+ //or extended whole program mode (__CUDACC_EWP__)
517
+ cudaCGGetRank(NULL, &gridRank, handle);
518
+ #endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
519
+ return gridRank;
520
+ }
521
+
522
+ _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle)
523
+ {
524
+ unsigned int numGrids = 0;
525
+ #if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
526
+ //this function is defined in device runtime library
527
+ //which requires separate compilation mode (__CUDACC_RDC__)
528
+ //or extended whole program mode (__CUDACC_EWP__)
529
+ cudaCGGetSize(NULL, &numGrids, handle);
530
+ #endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
531
+ return numGrids;
532
+ }
533
+
534
+ # ifdef _CG_CPP11_FEATURES
535
+ struct multi_grid_functions {
536
+ decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle;
537
+ decltype(multi_grid::sync) *sync;
538
+ decltype(multi_grid::size) *size;
539
+ decltype(multi_grid::thread_rank) *thread_rank;
540
+ decltype(multi_grid::grid_rank) *grid_rank;
541
+ decltype(multi_grid::num_grids) *num_grids;
542
+ };
543
+
544
+ template <typename = void>
545
+ _CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() {
546
+ __constant__ static const multi_grid_functions mgf {
547
+ &multi_grid::get_intrinsic_handle,
548
+ &multi_grid::sync,
549
+ &multi_grid::size,
550
+ &multi_grid::thread_rank,
551
+ &multi_grid::grid_rank,
552
+ &multi_grid::num_grids
553
+ };
554
+
555
+ return &mgf;
556
+ }
557
+ # endif
558
+ };
559
+ #endif
560
+
561
+ #if defined(_CG_HAS_CLUSTER_GROUP)
562
+ namespace cluster {
563
+
564
+ _CG_STATIC_QUALIFIER bool isReal()
565
+ {
566
+ return __clusterDimIsSpecified();
567
+ }
568
+
569
+ _CG_STATIC_QUALIFIER void barrier_arrive()
570
+ {
571
+ __cluster_barrier_arrive();
572
+ }
573
+
574
+ _CG_STATIC_QUALIFIER void barrier_wait()
575
+ {
576
+ __cluster_barrier_wait();
577
+ }
578
+
579
+ _CG_STATIC_QUALIFIER void sync()
580
+ {
581
+ barrier_arrive();
582
+ barrier_wait();
583
+ }
584
+
585
+ _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
586
+ {
587
+ return __cluster_query_shared_rank(addr);
588
+ }
589
+
590
+ template <typename T>
591
+ _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
592
+ {
593
+ return static_cast<T*>(__cluster_map_shared_rank(addr, rank));
594
+ }
595
+
596
+ _CG_STATIC_QUALIFIER dim3 block_index()
597
+ {
598
+ return __clusterRelativeBlockIdx();
599
+ }
600
+
601
+ _CG_STATIC_QUALIFIER unsigned int block_rank()
602
+ {
603
+ return __clusterRelativeBlockRank();
604
+ }
605
+
606
+ _CG_STATIC_QUALIFIER dim3 thread_index()
607
+ {
608
+ const dim3 blockIndex = block_index();
609
+ return dim3(blockIndex.x * blockDim.x + threadIdx.x,
610
+ blockIndex.y * blockDim.y + threadIdx.y,
611
+ blockIndex.z * blockDim.z + threadIdx.z);
612
+ }
613
+
614
+ _CG_STATIC_QUALIFIER unsigned int thread_rank()
615
+ {
616
+ return block_rank() * cta::num_threads() + cta::thread_rank();
617
+ }
618
+
619
+ _CG_STATIC_QUALIFIER dim3 dim_blocks()
620
+ {
621
+ return __clusterDim();
622
+ }
623
+
624
+ _CG_STATIC_QUALIFIER unsigned int num_blocks()
625
+ {
626
+ return __clusterSizeInBlocks();
627
+ }
628
+
629
+ _CG_STATIC_QUALIFIER dim3 dim_threads()
630
+ {
631
+ const dim3 dimBlocks = dim_blocks();
632
+ const unsigned int x = dimBlocks.x * blockDim.x;
633
+ const unsigned int y = dimBlocks.y * blockDim.y;
634
+ const unsigned int z = dimBlocks.z * blockDim.z;
635
+ return dim3(x, y, z);
636
+ }
637
+
638
+ _CG_STATIC_QUALIFIER unsigned int num_threads()
639
+ {
640
+ return num_blocks() * cta::num_threads();
641
+ }
642
+
643
+ };
644
+ #endif
645
+
646
+ _CG_STATIC_QUALIFIER unsigned int laneid()
647
+ {
648
+ unsigned int laneid;
649
+ asm ("mov.u32 %0, %%laneid;" : "=r"(laneid));
650
+ return laneid;
651
+ }
652
+
653
+ _CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
654
+ {
655
+ unsigned int lanemask32_eq;
656
+ asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq));
657
+ return (lanemask32_eq);
658
+ }
659
+
660
+ _CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
661
+ {
662
+ unsigned int lanemask32_lt;
663
+ asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt));
664
+ return (lanemask32_lt);
665
+ }
666
+
667
+ _CG_STATIC_QUALIFIER void abort()
668
+ {
669
+ _CG_ABORT();
670
+ }
671
+
672
+ template <typename Ty>
673
+ _CG_QUALIFIER void assert_if_not_arithmetic() {
674
+ #ifdef _CG_CPP11_FEATURES
675
+ static_assert(
676
+ _CG_STL_NAMESPACE::is_integral<Ty>::value ||
677
+ details::is_float_or_half<Ty>::value,
678
+ "Error: Ty is neither integer or float"
679
+ );
680
+ #endif //_CG_CPP11_FEATURES
681
+ }
682
+
683
+ #ifdef _CG_CPP11_FEATURES
684
+ // Compile-time floor(log2(x)) for x >= 1, computed by repeated halving.
+ // x == 0 has no logarithm; it yields 0 here rather than recursing without end (0 / 2 stays 0).
+ _CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) {
685
+ return x <= 1 ? 0 : 1 + log2(x / 2);
686
+ }
687
+ #endif //_CG_CPP11_FEATURES
688
+
689
+ }; // !Namespace internal
690
+
691
+ _CG_END_NAMESPACE
692
+
693
+ #endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+
50
+ #include <nv/target>
51
+
52
+ #ifndef _CG_INFO_H_
53
+ #define _CG_INFO_H_
54
+ /*
55
+ ** Define: _CG_VERSION
56
+ */
57
+ #define _CG_VERSION 1000
58
+
59
+ /*
60
+ ** Define: _CG_ABI_VERSION
61
+ */
62
+ #ifndef _CG_ABI_VERSION
63
+ # define _CG_ABI_VERSION 1
64
+ #endif
65
+
66
+ /*
67
+ ** Define: _CG_ABI_EXPERIMENTAL
68
+ ** Desc: If defined, enables all features, including ABI-breaking and experimental ones
69
+ */
70
+ #if defined(_CG_ABI_EXPERIMENTAL)
71
+ #endif
72
+
73
+ #define _CG_CONCAT_INNER(x, y) x ## y
74
+ #define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y)
75
+ #define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION)
76
+
77
+ #define _CG_BEGIN_NAMESPACE \
78
+ namespace cooperative_groups { namespace _CG_NAMESPACE {
79
+ #define _CG_END_NAMESPACE \
80
+ }; using namespace _CG_NAMESPACE; };
81
+
82
+ #if (defined(__cplusplus) && (__cplusplus >= 201103L)) || (defined(_MSC_VER) && (_MSC_VER >= 1900))
83
+ # define _CG_CPP11_FEATURES
84
+ #endif
85
+
86
+ #if !defined(_CG_QUALIFIER)
87
+ # define _CG_QUALIFIER __forceinline__ __device__
88
+ #endif
89
+ #if !defined(_CG_STATIC_QUALIFIER)
90
+ # define _CG_STATIC_QUALIFIER static __forceinline__ __device__
91
+ #endif
92
+ #if !defined(_CG_CONSTEXPR_QUALIFIER)
93
+ # if defined(_CG_CPP11_FEATURES)
94
+ # define _CG_CONSTEXPR_QUALIFIER constexpr __forceinline__ __device__
95
+ # else
96
+ # define _CG_CONSTEXPR_QUALIFIER _CG_QUALIFIER
97
+ # endif
98
+ #endif
99
+ #if !defined(_CG_STATIC_CONSTEXPR_QUALIFIER)
100
+ # if defined(_CG_CPP11_FEATURES)
101
+ # define _CG_STATIC_CONSTEXPR_QUALIFIER static constexpr __forceinline__ __device__
102
+ # else
103
+ # define _CG_STATIC_CONSTEXPR_QUALIFIER _CG_STATIC_QUALIFIER
104
+ # endif
105
+ #endif
106
+
107
+ #if defined(_MSC_VER)
108
+ # define _CG_DEPRECATED __declspec(deprecated)
109
+ #else
110
+ # define _CG_DEPRECATED __attribute__((deprecated))
111
+ #endif
112
+
113
+ #if defined(__CUDA_MINIMUM_ARCH__)
114
+ # define _CG_CUDA_ARCH __CUDA_MINIMUM_ARCH__
115
+ #elif defined(__CUDA_ARCH__)
116
+ # define _CG_CUDA_ARCH __CUDA_ARCH__
117
+ #endif
118
+
119
+ #if (_CG_CUDA_ARCH >= 600) || !defined(_CG_CUDA_ARCH)
120
+ # define _CG_HAS_GRID_GROUP
121
+ #endif
122
+ #if (_CG_CUDA_ARCH >= 600) || !defined(_CG_CUDA_ARCH)
123
+ # define _CG_HAS_MULTI_GRID_GROUP
124
+ #endif
125
+ #if (_CG_CUDA_ARCH >= 700) || !defined(_CG_CUDA_ARCH)
126
+ # define _CG_HAS_MATCH_COLLECTIVE
127
+ #endif
128
+
129
+ #if ((_CG_CUDA_ARCH >= 800) || !defined(_CG_CUDA_ARCH)) && !defined(_CG_USER_PROVIDED_SHARED_MEMORY)
130
+ # define _CG_HAS_RESERVED_SHARED
131
+ #endif
132
+
133
+ #if ((_CG_CUDA_ARCH >= 900) || !defined(_CG_CUDA_ARCH)) && \
134
+ (defined(__NVCC__) || defined(__CUDACC_RTC__) || defined(_CG_CLUSTER_INTRINSICS_AVAILABLE)) && \
135
+ defined(_CG_CPP11_FEATURES)
136
+ # define _CG_HAS_CLUSTER_GROUP
137
+ #endif
138
+
139
+ #if (_CG_CUDA_ARCH >= 900) || !defined(_CG_CUDA_ARCH)
140
+ # define _CG_HAS_INSTR_ELECT
141
+ #endif
142
+
143
+ // Has __half and __half2
144
+ // Only usable if you include the cuda_fp16.h extension, and
145
+ // _before_ including cooperative_groups.h
146
+ #ifdef __CUDA_FP16_TYPES_EXIST__
147
+ # define _CG_HAS_FP16_COLLECTIVE
148
+ #endif
149
+
150
+ // Include libcu++ where supported.
151
+ #if defined(_CG_CPP11_FEATURES) && !defined(__ibmxl__) && (!defined(_MSC_VER) || defined(_WIN64)) && \
152
+ !defined(_CG_LIMIT_INCLUDED_DEPENDENCIES)
153
+ # define _CG_USE_CUDA_STL
154
+ #else
155
+ # define _CG_USE_OWN_TRAITS
156
+ #endif
157
+
158
+ #if defined(_CG_USE_CUDA_STL) && !defined(__QNX__) && (!defined(__CUDA_ARCH__) || \
159
+ ((!defined(_MSC_VER) && __CUDA_ARCH__ >= 600) || (defined(_MSC_VER) && __CUDA_ARCH__ >= 700)))
160
+ # define _CG_HAS_STL_ATOMICS
161
+ #endif
162
+
163
+ #ifdef _CG_CPP11_FEATURES
164
+ // Use cuda::std:: for type_traits
165
+ # if defined(_CG_USE_CUDA_STL)
166
+ # define _CG_STL_NAMESPACE cuda::std
167
+ # include <cuda/std/type_traits>
168
+ // Use CG's implementation of type traits
169
+ # else
170
+ # define _CG_STL_NAMESPACE cooperative_groups::details::templates
171
+ # endif
172
+ #endif
173
+
174
+ #ifdef _CG_CPP11_FEATURES
175
+ # define _CG_STATIC_CONST_DECL static constexpr
176
+ # define _CG_CONST_DECL constexpr
177
+ #else
178
+ # define _CG_STATIC_CONST_DECL static const
179
+ # define _CG_CONST_DECL const
180
+ #endif
181
+
182
+ #if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
183
+ # define _CG_ASM_PTR_CONSTRAINT "r"
184
+ #else
185
+ # define _CG_ASM_PTR_CONSTRAINT "l"
186
+ #endif
187
+
188
+ /*
189
+ ** Define: CG_DEBUG
190
+ ** What: Enables various runtime safety checks (only takes effect when __CUDACC_DEBUG__ is defined and NDEBUG is not)
191
+ */
192
+ #if defined(__CUDACC_DEBUG__) && defined(CG_DEBUG) && !defined(NDEBUG)
193
+ # define _CG_DEBUG
194
+ #endif
195
+
196
+ #if defined(_CG_DEBUG)
197
+ # include <assert.h>
198
+ # define _CG_ASSERT(x) assert((x));
199
+ # define _CG_ABORT() assert(0);
200
+ #else
201
+ # define _CG_ASSERT(x)
202
+ # define _CG_ABORT() __trap();
203
+ #endif
204
+
205
+ _CG_BEGIN_NAMESPACE
206
+
207
+ namespace details {
208
+ _CG_STATIC_CONST_DECL unsigned int default_max_block_size = 1024;
209
+
210
+ #if defined(_CG_CPP11_FEATURES) && !defined(_CG_USE_CUDA_STL)
211
+ namespace templates {
212
+
213
+ /**
214
+ * Integral constants
215
+ **/
216
+ template <typename Ty, Ty Val>
217
+ struct integral_constant {
218
+ static constexpr Ty value = Val;
219
+ typedef Ty type;
220
+
221
+ _CG_QUALIFIER constexpr operator type() const noexcept { return value; }
222
+ _CG_QUALIFIER constexpr type operator()() const noexcept { return value; }
223
+ };
224
+
225
+ typedef integral_constant<bool, true> true_type;
226
+ typedef integral_constant<bool, false> false_type;
227
+
228
+ /**
229
+ * Reference and CV qualifiers
230
+ **/
231
+ template <class Ty> struct is_lvalue_reference : public details::templates::false_type {};
232
+ template <class Ty> struct is_lvalue_reference<Ty&> : public details::templates::true_type {};
233
+
234
+ template <class Ty> struct remove_reference {typedef Ty type;};
235
+ template <class Ty> struct remove_reference<Ty&> {typedef Ty type;};
236
+ template <class Ty> struct remove_reference<Ty&&> {typedef Ty type;};
237
+
238
+ template <class Ty>
239
+ using remove_reference_t = typename details::templates::remove_reference<Ty>::type;
240
+
241
+ template <class Ty> struct remove_const {typedef Ty type;};
242
+ template <class Ty> struct remove_const<const Ty> {typedef Ty type;};
243
+
244
+ template <class Ty> struct remove_volatile {typedef Ty type;};
245
+ template <class Ty> struct remove_volatile<volatile Ty> {typedef Ty type;};
246
+
247
+ template <class Ty> struct remove_cv {typedef typename details::templates::remove_volatile<typename details::templates::remove_const<Ty>::type>::type type;};
248
+
249
+ template <class Ty>
250
+ using remove_cv_t = typename details::templates::remove_cv<Ty>::type;
251
+
252
+ template <class Ty>
253
+ _CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &t) noexcept {
254
+ return static_cast<Ty&&>(t);
255
+ }
256
+
257
+ template <class Ty>
258
+ _CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &&t) noexcept {
259
+ static_assert(!details::templates::is_lvalue_reference<Ty>::value, "Forwarding an rvalue as an lvalue is not allowed.");
260
+ return static_cast<Ty&&>(t);
261
+ }
262
+
263
+ /**
264
+ * is_integral
265
+ **/
266
+ // Fallback integral-type trait used when libcu++ is not available: false by default,
+ // true for each explicitly listed type. char, signed char and unsigned char are three
+ // distinct types, so each needs its own specialization.
+ template <class Ty> struct _is_integral : public details::templates::false_type {};
267
+ template <> struct _is_integral<bool> : public details::templates::true_type {};
268
+ template <> struct _is_integral<char> : public details::templates::true_type {};
+ template <> struct _is_integral<signed char> : public details::templates::true_type {};
269
+ template <> struct _is_integral<unsigned char> : public details::templates::true_type {};
270
+ template <> struct _is_integral<short> : public details::templates::true_type {};
271
+ template <> struct _is_integral<unsigned short> : public details::templates::true_type {};
272
+ template <> struct _is_integral<int> : public details::templates::true_type {};
273
+ template <> struct _is_integral<unsigned int> : public details::templates::true_type {};
274
+ template <> struct _is_integral<long> : public details::templates::true_type {};
275
+ template <> struct _is_integral<long long> : public details::templates::true_type {};
276
+ template <> struct _is_integral<unsigned long> : public details::templates::true_type {};
277
+ template <> struct _is_integral<unsigned long long> : public details::templates::true_type {};
278
+ //Vector type support?
279
+
280
+ template <typename Ty>
281
+ struct is_integral : public details::templates::_is_integral<typename details::templates::remove_cv<Ty>::type> {};
282
+
283
+ /**
284
+ * is_floating_point
285
+ **/
286
+ template <class Ty> struct _is_floating_point : public details::templates::false_type {};
287
+ template <> struct _is_floating_point<float> : public details::templates::true_type {};
288
+ template <> struct _is_floating_point<double> : public details::templates::true_type {};
289
+ template <> struct _is_floating_point<long double> : public details::templates::true_type {};
290
+ # ifdef __CUDA_FP16_TYPES_EXIST__
291
+ template <> struct _is_floating_point<__half> : public details::templates::true_type {};
292
+ template <> struct _is_floating_point<__half2> : public details::templates::true_type {};
293
+ # endif
294
+ //Vector type support?
295
+
296
+ template <typename Ty>
297
+ struct is_floating_point : public details::templates::_is_floating_point<typename details::templates::remove_cv<Ty>::type> {};
298
+
299
+ template <class T>
300
+ struct is_arithmetic : details::templates::integral_constant<
301
+ bool,
302
+ details::templates::is_integral<T>::value ||
303
+ details::templates::is_floating_point<T>::value> {};
304
+
305
+ template <typename Ty, bool = details::templates::is_arithmetic<Ty>::value>
306
+ struct _is_unsigned : details::templates::integral_constant<bool, Ty(0) < Ty(-1)> {};
307
+
308
+ template <typename Ty>
309
+ struct _is_unsigned<Ty,false> : details::templates::false_type {};
310
+
311
+ template <typename Ty>
312
+ struct is_unsigned : _is_unsigned<typename details::templates::remove_cv<Ty>::type> {};
313
+
314
+ template <typename Ty> struct _is_pointer : public details::templates::false_type {};
315
+ template <typename Ty> struct _is_pointer<Ty*> : public details::templates::true_type {};
316
+
317
+ template <typename Ty>
318
+ struct is_pointer : _is_pointer<typename details::templates::remove_cv<Ty>::type> {};
319
+
320
+ /**
321
+ * programmatic type traits
322
+ **/
323
+ template<bool B, class Ty = void>
324
+ struct enable_if {};
325
+
326
+ template<class Ty>
327
+ struct enable_if<true, Ty> { typedef Ty type; };
328
+
329
+ template<bool Cond, typename Ty = void>
330
+ using enable_if_t = typename details::templates::enable_if<Cond, Ty>::type;
331
+
332
+ template<class Ty1, class Ty2>
333
+ struct is_same : details::templates::false_type {};
334
+
335
+ template<class Ty>
336
+ struct is_same<Ty, Ty> : details::templates::true_type {};
337
+
338
+ } // templates
339
+ #endif // _CG_CPP11_FEATURES
340
+
341
+ } // details
342
+ _CG_END_NAMESPACE
343
+
344
+
345
+ #endif // _CG_INFO_H_
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/invoke.h ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CG_INVOKE_H
51
+ #define _CG_INVOKE_H
52
+
53
+ #include "info.h"
54
+ #include "helpers.h"
55
+
56
+ #if defined(_CG_CPP11_FEATURES)
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <typename Group>
63
+ struct _elect_group_supported : _CG_STL_NAMESPACE::false_type {};
64
+ #ifdef _CG_HAS_INSTR_ELECT
65
+ template<>
66
+ struct _elect_group_supported<coalesced_group> : _CG_STL_NAMESPACE::true_type {};
67
+ template<unsigned int Size, typename Parent>
68
+ struct _elect_group_supported<thread_block_tile<Size, Parent>> :
69
+ _CG_STL_NAMESPACE::integral_constant<bool, (Size <= 32)> {};
70
+ #endif
71
+
72
+ template <typename Group>
73
+ struct elect_group_supported : public _elect_group_supported<details::remove_qual<Group>> {};
74
+
75
+ template<typename Group>
76
+ _CG_STATIC_QUALIFIER bool elect_one(const Group& group, unsigned int mask, unsigned int& leader_lane) {
77
+ int is_leader = 0;
78
+ #ifdef _CG_HAS_INSTR_ELECT
79
+ asm("{\n\t"
80
+ " .reg .pred p;\n\t"
81
+ " elect.sync %0|p, %2;\n\t"
82
+ " @p mov.s32 %1, 1;\n\t"
83
+ "}"
84
+ : "+r"(leader_lane), "+r"(is_leader) : "r" (mask));
85
+ #endif
86
+ return is_leader;
87
+ }
88
+
89
+ template<bool UseElect>
90
+ struct invoke_one_impl {};
91
+
92
+ template<>
93
+ struct invoke_one_impl<true> {
94
+ template<typename Group, typename Fn, typename... Args>
95
+ _CG_STATIC_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) {
96
+ auto mask = details::_coalesced_group_data_access::get_mask(group);
97
+ unsigned int leader_lane = 0;
98
+
99
+ if (elect_one(group, mask, leader_lane)) {
100
+ _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
101
+ }
102
+ }
103
+
104
+ template<typename Group, typename Fn, typename... Args>
105
+ _CG_STATIC_QUALIFIER auto invoke_one_broadcast(const Group& group, Fn&& fn, Args&&... args)
106
+ -> typename _CG_STL_NAMESPACE::remove_reference<
107
+ decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
108
+
109
+ using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
110
+ details::remove_qual<ResultType> result;
111
+ auto mask = details::_coalesced_group_data_access::get_mask(group);
112
+ unsigned int leader_lane = 0;
113
+
114
+ if (elect_one(group, mask, leader_lane)) {
115
+ result = _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
116
+ }
117
+
118
+ // Need to use low level api instead of group.shfl, because elect_one returns lane id, not group rank.
119
+ return tile::shuffle_dispatch<ResultType>::shfl(result, mask, leader_lane, 32);
120
+ }
121
+ };
122
+
123
+ template<>
124
+ struct invoke_one_impl<false> {
125
+ template<typename Group, typename Fn, typename... Args>
126
+ _CG_STATIC_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) {
127
+ if (group.thread_rank() == 0) {
128
+ _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
129
+ }
130
+ }
131
+
132
+ template<typename Group, typename Fn, typename... Args>
133
+ _CG_STATIC_QUALIFIER auto invoke_one_broadcast(const Group& group, Fn&& fn, Args&&... args)
134
+ -> typename _CG_STL_NAMESPACE::remove_reference<
135
+ decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
136
+
137
+ using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
138
+ details::remove_qual<ResultType> result;
139
+
140
+ if (group.thread_rank() == 0) {
141
+ result = _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
142
+ }
143
+
144
+ return group.shfl(result, 0);
145
+ }
146
+ };
147
+
148
+
149
+ }; // namespace details
150
+
151
+ template<typename Group, typename Fn, typename... Args>
152
+ _CG_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) {
153
+ using impl = details::invoke_one_impl<details::elect_group_supported<Group>::value>;
154
+ impl::invoke_one(group, _CG_STL_NAMESPACE::forward<Fn>(fn), _CG_STL_NAMESPACE::forward<Args>(args)...);
155
+ }
156
+
157
+ template<typename Fn, typename... Args>
158
+ _CG_QUALIFIER auto invoke_one_broadcast(const coalesced_group& group, Fn&& fn, Args&&... args)
159
+ -> typename _CG_STL_NAMESPACE::remove_reference<
160
+ decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
161
+
162
+ using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
163
+ static_assert(!_CG_STL_NAMESPACE::is_same<ResultType, void>::value,
164
+ "For invocables returning void invoke_one should be used instead");
165
+ using impl = details::invoke_one_impl<details::elect_group_supported<coalesced_group>::value>;
166
+ return impl::invoke_one_broadcast(group,
167
+ _CG_STL_NAMESPACE::forward<Fn>(fn),
168
+ _CG_STL_NAMESPACE::forward<Args>(args)...);
169
+ }
170
+
171
+ template<unsigned int Size, typename Parent, typename Fn, typename... Args>
172
+ _CG_QUALIFIER auto invoke_one_broadcast(const thread_block_tile<Size, Parent>& group, Fn&& fn, Args&&... args)
173
+ -> typename _CG_STL_NAMESPACE::remove_reference<
174
+ decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
175
+
176
+ using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
177
+ static_assert(!_CG_STL_NAMESPACE::is_same<ResultType, void>::value,
178
+ "For invocables returning void invoke_one should be used instead");
179
+ using impl = details::invoke_one_impl<details::elect_group_supported<thread_block_tile<Size, Parent>>::value>;
180
+ return impl::invoke_one_broadcast(group,
181
+ _CG_STL_NAMESPACE::forward<Fn>(fn),
182
+ _CG_STL_NAMESPACE::forward<Args>(args)...);
183
+ }
184
+
185
+ _CG_END_NAMESPACE
186
+
187
+ #endif //_CG_CPP11_FEATURES
188
+
189
+ #endif // _CG_INVOKE_H
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/memory.h ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_MEMORY_H_
50
+ # define _COOPERATIVE_GROUPS_MEMORY_H_
51
+
52
+ #include "info.h"
53
+
54
+ _CG_BEGIN_NAMESPACE
55
+
56
+ #if defined(_CG_CPP11_FEATURES)
57
+ namespace details {
58
+ _CG_STATIC_CONST_DECL int scratch_num_reserved_bytes = 12;
59
+
60
+ // Should only be called for SM80+
61
+ _CG_STATIC_QUALIFIER void* reserved_shared_ptr()
62
+ {
63
+ unsigned long long ptr = 0;
64
+ NV_IF_TARGET(NV_PROVIDES_SM_80,
65
+ (asm ("{\n\t"
66
+ " .reg .u32 start;\n\t"
67
+ " .reg .u64 extended;\n\t"
68
+ " mov.u32 start, %%reserved_smem_offset_1;\n\t"
69
+ " cvt.u64.u32 extended, start;\n\t"
70
+ " cvta.shared.u64 %0, extended;\n\t"
71
+ "}"
72
+ : "=l"(ptr));)
73
+ )
74
+ return reinterpret_cast<void*>(ptr);
75
+ }
76
+
77
+ struct multi_warp_scratch {
78
+ // One barrier per possible size of the group.
79
+ _CG_STATIC_CONST_DECL unsigned int memory_barriers_count = 5;
80
+ _CG_STATIC_CONST_DECL size_t sync_memory_size = memory_barriers_count * sizeof(barrier_t);
81
+
82
+ using communication_type = unsigned long long;
83
+ _CG_STATIC_CONST_DECL size_t communication_size = sizeof(communication_type);
84
+
85
+ // Layout of the scratch space:
86
+ barrier_t barriers[memory_barriers_count];
87
+ char reserved[scratch_num_reserved_bytes]; // Reserve 12 bytes for future use
88
+ communication_type communication_memory[default_max_block_size / 32];
89
+
90
+ _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int max_block_size) {
91
+ // One slot of collectives memory per warp.
92
+ return scratch_num_reserved_bytes + (unsigned int)sync_memory_size + max_block_size / 32 * (unsigned int)communication_size;
93
+ }
94
+
95
+ _CG_QUALIFIER void init_barriers(unsigned int thread_rank) {
96
+ if (thread_rank < memory_barriers_count) {
97
+ barriers[thread_rank] = 0;
98
+ }
99
+ }
100
+ };
101
+
102
+ #if defined(_CG_HAS_RESERVED_SHARED)
103
+ // CG can expect at least 288 bytes available in reserved shared
104
+ static_assert(sizeof(multi_warp_scratch) <= 288, "multi-warp scratch size is too large");
105
+ #endif
106
+
107
+ // Make sure the structure can fit into the user provided memory
108
+ static_assert(sizeof(multi_warp_scratch) <= multi_warp_scratch::scratch_size_needed(default_max_block_size),
109
+ "multi-warp scratch size is too large");
110
+
111
+
112
+ _CG_QUALIFIER multi_warp_scratch* get_scratch_ptr(void* user_scratch) {
113
+ void *ptr;
114
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
115
+ (ptr = reserved_shared_ptr();)
116
+ ,
117
+ (ptr = user_scratch;)
118
+ )
119
+ return static_cast<multi_warp_scratch*>(ptr);
120
+
121
+ }
122
+
123
+ }
124
+
125
+ template <unsigned int MaxBlockSize = details::default_max_block_size>
126
+ struct __align__(details::multi_warp_scratch::communication_size) block_tile_memory {
127
+ private:
128
+ #if !defined(_CG_HAS_RESERVED_SHARED)
129
+ char scratch[details::multi_warp_scratch::scratch_size_needed(MaxBlockSize)];
130
+ #endif
131
+ };
132
+ #endif
133
+
134
+ _CG_END_NAMESPACE
135
+
136
+ #endif /* !_COOPERATIVE_GROUPS_MEMORY_H_ */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CG_PARTITIONING_H
51
+ #define _CG_PARTITIONING_H
52
+
53
+ #include "info.h"
54
+ #include "helpers.h"
55
+
56
+ _CG_BEGIN_NAMESPACE
57
+
58
+ namespace details {
59
+
60
+ template <typename TyGroup>
61
+ _CG_STATIC_QUALIFIER coalesced_group _binary_partition(const TyGroup &tile, bool pred) {
62
+ const unsigned int fullMask = ~0u;
63
+
64
+ unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
65
+ unsigned int predMask = pred ? 0 : fullMask;
66
+ unsigned int setMask = __ballot_sync(thisMask, pred);
67
+
68
+ if (setMask == thisMask || setMask == 0) {
69
+ coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(thisMask);
70
+ _coalesced_group_data_access::modify_meta_group(subTile, 0, 1);
71
+ return subTile;
72
+ }
73
+ else {
74
+ unsigned int subMask = thisMask & (setMask ^ predMask);
75
+ coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
76
+ _coalesced_group_data_access::modify_meta_group(subTile, pred, 2);
77
+ return subTile;
78
+ }
79
+ }
80
+
81
+ #if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
82
+ template <typename TyPredicate>
83
+ struct _labeled_partition_dispatch {
84
+ template <typename TyGroup>
85
+ _CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, TyPredicate pred) {
86
+ unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
87
+ unsigned int subMask = __match_any_sync(thisMask, pred);
88
+ unsigned int laneId = details::laneid();
89
+
90
+ coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
91
+
92
+ int leaderLaneId = __ffs(subMask) - 1;
93
+ bool isLeader = leaderLaneId == laneId;
94
+ unsigned int leaderMask = __ballot_sync(thisMask, isLeader);
95
+
96
+ // Count leaders with lower laneid, that will be the meta rank of this tile
97
+ unsigned int tileRank = __popc(leaderMask & ((1 << leaderLaneId) - 1));
98
+
99
+ _coalesced_group_data_access::modify_meta_group(subTile, tileRank, __popc(leaderMask));
100
+
101
+ return subTile;
102
+ }
103
+ };
104
+
105
+ template <>
106
+ struct _labeled_partition_dispatch<bool> {
107
+ template <typename TyGroup>
108
+ _CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, bool pred) {
109
+ return _binary_partition(tile, pred);
110
+ }
111
+ };
112
+
113
+ template <typename TyPredicate>
114
+ struct _labeled_partition_dispatch<TyPredicate*> {
115
+ template <typename TyGroup>
116
+ _CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, TyPredicate* pred) {
117
+ auto impl = _labeled_partition_dispatch<unsigned long long>();
118
+ return impl(tile, reinterpret_cast<unsigned long long>(pred));
119
+ }
120
+ };
121
+ #endif
122
+ }; // namespace details
123
+
124
+ _CG_STATIC_QUALIFIER coalesced_group binary_partition(const coalesced_group &tile, bool pred) {
125
+ return details::_binary_partition(tile, pred);
126
+ }
127
+
128
+ template <unsigned int Size, typename ParentT>
129
+ _CG_STATIC_QUALIFIER coalesced_group binary_partition(const thread_block_tile<Size, ParentT> &tile, bool pred) {
130
+ #ifdef _CG_CPP11_FEATURES
131
+ static_assert(Size <= 32, "Binary partition is available only for tiles of size smaller or equal to 32");
132
+ #endif
133
+ return details::_binary_partition(tile, pred);
134
+ }
135
+
136
+
137
+ #if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
138
+ template <typename TyPredicate>
139
+ _CG_STATIC_QUALIFIER coalesced_group labeled_partition(const coalesced_group &tile, TyPredicate pred) {
140
+ static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value ||
141
+ _CG_STL_NAMESPACE::is_pointer<TyPredicate>::value,
142
+ "labeled_partition predicate must be an integral or pointer type");
143
+ auto dispatch = details::_labeled_partition_dispatch<details::remove_qual<TyPredicate>>();
144
+ return dispatch(tile, pred);
145
+ }
146
+
147
+ template <typename TyPredicate, unsigned int Size, typename ParentT>
148
+ _CG_STATIC_QUALIFIER coalesced_group labeled_partition(const thread_block_tile<Size, ParentT> &tile, TyPredicate pred) {
149
+ static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value ||
150
+ _CG_STL_NAMESPACE::is_pointer<TyPredicate>::value,
151
+ "labeled_partition predicate must be an integral or pointer type");
152
+ static_assert(Size <= 32, "Labeled partition is available only for tiles of size smaller or equal to 32");
153
+ auto dispatch = details::_labeled_partition_dispatch<details::remove_qual<TyPredicate>>();
154
+ return dispatch(tile, pred);
155
+ }
156
+ #endif
157
+
158
+ _CG_END_NAMESPACE
159
+
160
+ #endif // _CG_PARTITIONING_H
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_REDUCE_H_
50
+ #define _CG_REDUCE_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "coalesced_reduce.h"
55
+ #include "functional.h"
56
+ #include "cooperative_groups.h"
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <class Ty>
63
+ using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant<
64
+ bool,
65
+ _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>;
66
+
67
+ template <class Ty>
68
+ using redux_is_add_supported = _redux_is_add_supported<Ty>;
69
+
70
+ // A specialization for 64 bit logical operations is possible
71
+ // but for now only accelerate 32 bit bitwise ops
72
+ template <class Ty>
73
+ using redux_is_logical_supported = redux_is_add_supported<Ty>;
74
+
75
+ // Base operator support case
76
+ template <class TyOp, class Ty> struct _redux_op_supported : public _CG_STL_NAMESPACE::false_type {};
77
+ template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>, Ty> : public redux_is_add_supported<Ty> {};
78
+ template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>, Ty> : public redux_is_add_supported<Ty> {};
79
+ template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {};
80
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
81
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
82
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
83
+
84
+ template <class Ty, template <class> class TyOp>
85
+ using redux_op_supported = _redux_op_supported<
86
+ typename details::remove_qual<TyOp<Ty>>,
87
+ Ty>;
88
+
89
+ // Groups smaller than 16 actually have worse performance characteristics when used with redux
90
+ // tiles of size 16 and 32 perform the same or better and have better code generation profiles
91
+ template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {};
92
+
93
+ template <unsigned int Sz, typename TyPar>
94
+ struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
95
+ bool,
96
+ (Sz >= 16)> {};
97
+ template <unsigned int Sz, typename TyPar>
98
+ struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
99
+ bool,
100
+ (Sz >= 16)> {};
101
+ template <>
102
+ struct _redux_group_optimized<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
103
+
104
+ template <typename TyGroup>
105
+ using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>;
106
+
107
+ template <template <class> class TyOp>
108
+ _CG_STATIC_QUALIFIER int pick_redux(int mask, int val);
109
+ template <template <class> class TyOp>
110
+ _CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val);
111
+
112
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) {
113
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_add_sync(mask, val);), return 0;)
114
+ }
115
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) {
116
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_min_sync(mask, val);), return 0;)
117
+ }
118
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) {
119
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_max_sync(mask, val);), return 0;)
120
+ }
121
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) {
122
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return static_cast<int>(__reduce_and_sync(mask, val));), return 0;)
123
+ }
124
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_xor>(int mask, int val) {
125
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return static_cast<int>(__reduce_xor_sync(mask, val));), return 0;)
126
+ }
127
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) {
128
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return static_cast<int>(__reduce_or_sync(mask, val));), return 0;)
129
+ }
130
+
131
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) {
132
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_add_sync(mask, val);), return 0;)
133
+ }
134
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) {
135
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_min_sync(mask, val);), return 0;)
136
+ }
137
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) {
138
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_max_sync(mask, val);), return 0;)
139
+ }
140
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) {
141
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_and_sync(mask, val);), return 0;)
142
+ }
143
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) {
144
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_xor_sync(mask, val);), return 0;)
145
+ }
146
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) {
147
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __reduce_or_sync(mask, val);), return 0;)
148
+ }
149
+
150
+ template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value>
151
+ struct _accelerated_op;
152
+
153
+ // Signed type redux intrinsic dispatch
154
+ template <typename TyVal>
155
+ struct _accelerated_op<TyVal, false> {
156
+ template <template <class> class TyOp>
157
+ _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
158
+ return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val)));
159
+ }
160
+ };
161
+
162
+ // Unsigned type redux intrinsic dispatch
163
+ template <typename TyVal>
164
+ struct _accelerated_op<TyVal, true> {
165
+ template <template <class> class TyOp>
166
+ _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
167
+ return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val)));
168
+ }
169
+ };
170
+
171
+ template <typename TyVal>
172
+ using accelerated_op = _accelerated_op<TyVal>;
173
+
174
+
175
+ template <typename TyVal, typename TyFnInput, typename TyGroup>
176
+ class _redux_dispatch {
177
+ template <class Ty, template <class> class TyOp>
178
+ using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool,
179
+ redux_op_supported<Ty, TyOp>::value &&
180
+ redux_group_optimized<TyGroup>::value>;
181
+
182
+ template <class Ty, template <class> class TyOp>
183
+ using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*;
184
+
185
+ template <class Ty, template <class> class TyOp>
186
+ using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*;
187
+
188
+ public:
189
+ // Dispatch to redux if the combination of op and args are supported
190
+ template<
191
+ template <class> class TyOp,
192
+ redux_is_usable<TyFnInput, TyOp> = nullptr>
193
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
194
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
195
+ // Retrieve the mask for the group and dispatch to redux
196
+ return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
197
+ ,
198
+ // Arch does not support redux, fallback to shuffles
199
+ return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
200
+ )
201
+ }
202
+
203
+ template<
204
+ template <class> class TyOp,
205
+ redux_is_usable<TyFnInput, TyOp> = nullptr>
206
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
207
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
208
+ // Retrieve the mask for the group and dispatch to redux
209
+ return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
210
+ ,
211
+ // Arch does not support redux, fallback to shuffles
212
+ return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
213
+ )
214
+ }
215
+
216
+ // Fallback shuffle sync reduction
217
+ template <
218
+ template <class> class TyOp,
219
+ redux_is_not_usable<TyFnInput, TyOp> = nullptr>
220
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
221
+ //Dispatch to fallback shuffle sync accelerated reduction
222
+ return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
223
+ }
224
+
225
+ };
226
+
227
+ // Group support for reduce.
228
+ template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {};
229
+
230
+ template <unsigned int Sz, typename TyPar>
231
+ struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
232
+ template <unsigned int Sz, typename TyPar>
233
+ struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
234
+ template <>
235
+ struct _reduce_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
236
+
237
+ template <typename TyGroup>
238
+ using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>;
239
+
240
+ template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
241
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
242
+ static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
243
+
244
+ using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
245
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
246
+ }
247
+
248
+ template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
249
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
250
+ static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
251
+
252
+ using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
253
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
254
+ }
255
+
256
+
257
+ template <typename TyVal, typename TyOp, typename TyGroup>
258
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
259
+ return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
260
+ }
261
+
262
+ template <unsigned int GroupId>
263
+ struct tile_reduce_dispatch;
264
+
265
+ template <>
266
+ struct tile_reduce_dispatch<details::coalesced_group_id> {
267
+ template <typename TyGroup, typename TyVal, typename TyFn>
268
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
269
+ return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
270
+ }
271
+ };
272
+
273
+ #if defined(_CG_CPP11_FEATURES)
274
+ template <>
275
+ struct tile_reduce_dispatch<details::multi_tile_group_id> {
276
+ template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
277
+ _CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
278
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
279
+ using TyRet = details::remove_qual<TyVal>;
280
+ const unsigned int num_warps = Size / 32;
281
+
282
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
283
+ *warp_scratch_location =
284
+ details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
285
+ };
286
+ auto inter_warp_lambda =
287
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
288
+ *thread_scratch_location =
289
+ details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op));
290
+ };
291
+ return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
292
+ }
293
+ };
294
+
295
+ template <unsigned int GroupId>
296
+ struct tile_async_reduce_dispatch;
297
+
298
+ template <>
299
+ struct tile_async_reduce_dispatch<details::coalesced_group_id> {
300
+ template <typename GroupT, typename TyDst, typename TyVal, typename TyFn, typename TyResHandler>
301
+ _CG_STATIC_QUALIFIER void reduce(const GroupT& group, TyDst& dst, TyVal&& val, TyFn&& op, TyResHandler& res_handler) {
302
+ // Do regular, in group reduction
303
+ auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
304
+
305
+ // One thread stores/updates the destination
306
+ if (group.thread_rank() == 0) {
307
+ res_handler(result);
308
+ }
309
+ }
310
+ };
311
+
312
+ template <>
313
+ struct tile_async_reduce_dispatch<details::multi_tile_group_id> {
314
+ template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn, typename TyResHandler>
315
+ _CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op, TyResHandler& res_handler) {
316
+ using TyVal = remove_qual<TyInputVal>;
317
+ const unsigned int num_warps = TySize / 32;
318
+ details::barrier_t* sync_location = multi_warp_sync_location_getter(group);
319
+ auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32);
320
+
321
+ // Do in warp reduce
322
+ auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>();
323
+ *warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op);
324
+
325
+ // Tile of size num_warps from the last warp to arrive does final reduction step
326
+ if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) {
327
+ auto subwarp = details::tiled_partition_internal<num_warps, decltype(warp)>();
328
+ if (subwarp.meta_group_rank() == 0) {
329
+ auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank());
330
+ auto thread_val = *thread_scratch_location;
331
+ // Release other warps, we read their contribution already.
332
+ subwarp.sync();
333
+ details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps);
334
+ TyVal result = details::reduce(subwarp, thread_val, op);
335
+ // One thread stores the result or updates the atomic
336
+ if (subwarp.thread_rank() == 0) {
337
+ res_handler(result);
338
+ }
339
+ }
340
+ warp.sync();
341
+ }
342
+ }
343
+ };
344
+ #endif
345
+
346
+ template <typename TyGroup, typename TyInputVal, typename TyRetVal>
347
+ _CG_QUALIFIER void check_reduce_params() {
348
+ static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
349
+ static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
350
+ };
351
+
352
+ template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
353
+ _CG_QUALIFIER void check_async_reduce_params() {
354
+ check_reduce_params<TyGroup, TyInputVal, TyRetVal>();
355
+ static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
356
+ }
357
+ } // details
358
+
359
+ template <typename TyGroup, typename TyVal, typename TyFn>
360
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
361
+ details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>();
362
+
363
+ using dispatch = details::tile_reduce_dispatch<TyGroup::_group_id>;
364
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
365
+ }
366
+
367
+ #if defined(_CG_CPP11_FEATURES)
368
+
369
+ # if defined(_CG_HAS_STL_ATOMICS)
370
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
371
+ void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
372
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
373
+ auto update_lambda = [&] (TyVal& result) {
374
+ details::atomic_update(dst, result, op);
375
+ };
376
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
377
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), update_lambda);
378
+ }
379
+
380
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
381
+ void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
382
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
383
+ auto update_lambda = [&] (TyVal& result) {
384
+ details::atomic_update(dst, result, op);
385
+ };
386
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
387
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), update_lambda);
388
+ }
389
+
390
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
391
+ void _CG_QUALIFIER reduce_store_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
392
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
393
+ auto store_lambda = [&] (TyVal& result) {
394
+ details::atomic_store(dst, result);
395
+ };
396
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
397
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
398
+ }
399
+
400
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
401
+ void _CG_QUALIFIER reduce_store_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
402
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
403
+ auto store_lambda = [&] (TyVal& result) {
404
+ details::atomic_store(dst, result);
405
+ };
406
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
407
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
408
+ }
409
+ # endif
410
+
411
+ template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn>
412
+ void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) {
413
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
414
+ auto store_lambda = [&] (TyVal& result) {
415
+ *dst = result;
416
+ };
417
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
418
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
419
+ }
420
+ #endif
421
+
422
+ _CG_END_NAMESPACE
423
+
424
+ #endif // _CG_REDUCE_H_
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_SCAN_H_
50
+ #define _CG_SCAN_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "functional.h"
55
+ #include "coalesced_scan.h"
56
+
57
+ _CG_BEGIN_NAMESPACE
58
+
59
+ namespace details {
60
+
61
+ // Group support for scan.
62
+ template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {};
63
+
64
+ template <unsigned int Sz, typename TyPar>
65
+ struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
66
+ template <unsigned int Sz, typename TyPar>
67
+ struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
68
+ template <>
69
+ struct _scan_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
70
+
71
+ template <typename TyGroup>
72
+ using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>;
73
+
74
+ template <bool IsIntegralPlus>
75
+ struct integral_optimized_scan;
76
+
77
+ enum class ScanType { exclusive, inclusive };
78
+
79
+ template <unsigned int GroupId, ScanType TyScan>
80
+ struct scan_dispatch;
81
+
82
+ template <ScanType TyScan>
83
+ struct scan_dispatch<details::coalesced_group_id, TyScan> {
84
+ template <typename TyGroup, typename TyVal, typename TyFn>
85
+ _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
86
+ auto scan_result = coalesced_inclusive_scan(group, val, op);
87
+ if (TyScan == ScanType::exclusive) {
88
+ scan_result = convert_inclusive_to_exclusive(group,
89
+ scan_result,
90
+ _CG_STL_NAMESPACE::forward<TyVal>(val),
91
+ _CG_STL_NAMESPACE::forward<TyFn>(op));
92
+ }
93
+ return scan_result;
94
+ }
95
+ };
96
+
97
+ #if defined(_CG_CPP11_FEATURES)
98
+ template <ScanType TyScan>
99
+ struct scan_dispatch<details::multi_tile_group_id, TyScan> {
100
+ template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
101
+ _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
102
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
103
+ using TyRet = details::remove_qual<TyVal>;
104
+ const unsigned int num_warps = Size / 32;
105
+ // In warp scan result, calculated in warp_lambda
106
+ TyRet warp_scan;
107
+
108
+ // In warp scan, put sum in the warp_scratch_location
109
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
110
+ warp_scan =
111
+ details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
112
+ if (warp.thread_rank() + 1 == warp.size()) {
113
+ *warp_scratch_location = warp_scan;
114
+ }
115
+ if (TyScan == ScanType::exclusive) {
116
+ warp_scan = warp.shfl_up(warp_scan, 1);
117
+ }
118
+ };
119
+
120
+ // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
121
+ // to its in-warp scan result
122
+ auto inter_warp_lambda =
123
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
124
+ auto thread_val = *thread_scratch_location;
125
+ auto result = coalesced_inclusive_scan(subwarp, thread_val, op);
126
+ *thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op);
127
+ };
128
+
129
+ TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
130
+ if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
131
+ return previous_warps_sum;
132
+ }
133
+ if (warpType::meta_group_rank() == 0) {
134
+ return warp_scan;
135
+ }
136
+ else {
137
+ return op(warp_scan, previous_warps_sum);
138
+ }
139
+ }
140
+ };
141
+
142
+ #if defined(_CG_HAS_STL_ATOMICS)
143
+ template <unsigned int GroupId, ScanType TyScan>
144
+ struct scan_update_dispatch;
145
+
146
+ template <ScanType TyScan>
147
+ struct scan_update_dispatch<details::coalesced_group_id, TyScan> {
148
+ template <typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn>
149
+ _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
150
+ details::remove_qual<TyVal> old;
151
+
152
+ // Do regular in group scan
153
+ auto scan_result = details::coalesced_inclusive_scan(group, val, op);
154
+
155
+ // Last thread updates the atomic and distributes its old value to other threads
156
+ if (group.thread_rank() == group.size() - 1) {
157
+ old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op));
158
+ }
159
+ old = group.shfl(old, group.size() - 1);
160
+ if (TyScan == ScanType::exclusive) {
161
+ scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
162
+ }
163
+ scan_result = op(old, scan_result);
164
+ return scan_result;
165
+ }
166
+ };
167
+
168
+ template <ScanType TyScan>
169
+ struct scan_update_dispatch<details::multi_tile_group_id, TyScan> {
170
+ template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn>
171
+ _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
172
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
173
+ using TyRet = details::remove_qual<TyVal>;
174
+ const unsigned int num_warps = Size / 32;
175
+ // In warp scan result, calculated in warp_lambda
176
+ TyRet warp_scan;
177
+
178
+ // In warp scan, put sum in the warp_scratch_location
179
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
180
+ warp_scan =
181
+ details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
182
+ if (warp.thread_rank() + 1 == warp.size()) {
183
+ *warp_scratch_location = warp_scan;
184
+ }
185
+ if (TyScan == ScanType::exclusive) {
186
+ warp_scan = warp.shfl_up(warp_scan, 1);
187
+ }
188
+ };
189
+
190
+ // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
191
+ // to its in-warp scan result
192
+ auto inter_warp_lambda =
193
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
194
+ auto thread_val = *thread_scratch_location;
195
+ auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op);
196
+ TyRet offset;
197
+ // Single thread does the atomic update with sum of all contributions and reads the old value.
198
+ if (subwarp.thread_rank() == subwarp.size() - 1) {
199
+ offset = details::atomic_update(dst, scan_result, op);
200
+ }
201
+ offset = subwarp.shfl(offset, subwarp.size() - 1);
202
+ scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op);
203
+ // Add offset read from the atomic to the scanned warp sum.
204
+ // Skipping first thread, since it got defautly constructed value from the conversion,
205
+ // it should just return the offset received from the thread that did the atomic update.
206
+ if (subwarp.thread_rank() != 0) {
207
+ offset = op(scan_result, offset);
208
+ }
209
+ *thread_scratch_location = offset;
210
+ };
211
+
212
+ TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
213
+ if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
214
+ return previous_warps_sum;
215
+ }
216
+ return op(warp_scan, previous_warps_sum);
217
+ }
218
+ };
219
+ #endif
220
+ #endif
221
+
222
+ template <typename TyGroup, typename TyInputVal, typename TyRetVal>
223
+ _CG_QUALIFIER void check_scan_params() {
224
+ static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
225
+ static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
226
+ }
227
+
228
+ #if defined(_CG_HAS_STL_ATOMICS)
229
+ template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
230
+ _CG_QUALIFIER void check_scan_update_params() {
231
+ check_scan_params<TyGroup, TyInputVal, TyRetVal>();
232
+ static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
233
+ }
234
+ #endif
235
+
236
+ } // details
237
+
238
+ template <typename TyGroup, typename TyVal, typename TyFn>
239
+ _CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
240
+ details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
241
+
242
+ using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
243
+ return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
244
+ }
245
+
246
+ template <typename TyGroup, typename TyVal>
247
+ _CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) {
248
+ return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
249
+ }
250
+
251
+ template <typename TyGroup, typename TyVal, typename TyFn>
252
+ _CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
253
+ details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
254
+
255
+ using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
256
+ return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
257
+ }
258
+
259
+ template <typename TyGroup, typename TyVal>
260
+ _CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) {
261
+ return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
262
+ }
263
+
264
+ #if defined(_CG_HAS_STL_ATOMICS)
265
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
266
+ _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
267
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
268
+
269
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
270
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
271
+ }
272
+
273
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
274
+ _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) {
275
+ return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
276
+ }
277
+
278
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
279
+ _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
280
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
281
+
282
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
283
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
284
+ }
285
+
286
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
287
+ _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) {
288
+ return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
289
+ }
290
+
291
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
292
+ _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
293
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
294
+
295
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
296
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
297
+ }
298
+
299
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
300
+ _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) {
301
+ return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
302
+ }
303
+
304
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
305
+ _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
306
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
307
+
308
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
309
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
310
+ }
311
+
312
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
313
+ _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) {
314
+ return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
315
+ }
316
+ #endif
317
+
318
+ _CG_END_NAMESPACE
319
+
320
+ #endif // _CG_SCAN_H_
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/sync.h ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_GRID_H
50
+ #define _CG_GRID_H
51
+
52
+ #include "info.h"
53
+
54
+ _CG_BEGIN_NAMESPACE
55
+
56
+ namespace details
57
+ {
58
+ typedef unsigned int barrier_t;
59
+
60
+ _CG_STATIC_QUALIFIER bool bar_has_flipped(unsigned int old_arrive, unsigned int current_arrive) {
61
+ return (((old_arrive ^ current_arrive) & 0x80000000) != 0);
62
+ }
63
+
64
+ _CG_STATIC_QUALIFIER bool is_cta_master() {
65
+ return (threadIdx.x + threadIdx.y + threadIdx.z == 0);
66
+ }
67
+
68
+ _CG_STATIC_QUALIFIER unsigned int sync_grids_arrive(volatile barrier_t *arrived) {
69
+ unsigned int oldArrive = 0;
70
+
71
+ __barrier_sync(0);
72
+
73
+ if (is_cta_master()) {
74
+ unsigned int expected = gridDim.x * gridDim.y * gridDim.z;
75
+ bool gpu_master = (blockIdx.x + blockIdx.y + blockIdx.z == 0);
76
+ unsigned int nb = 1;
77
+
78
+ if (gpu_master) {
79
+ nb = 0x80000000 - (expected - 1);
80
+ }
81
+
82
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
83
+ // Barrier update with release; polling with acquire
84
+ asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(oldArrive) : _CG_ASM_PTR_CONSTRAINT((unsigned int*)arrived), "r"(nb) : "memory");
85
+ ,
86
+ // Fence; barrier update; volatile polling; fence
87
+ __threadfence();
88
+ oldArrive = atomicAdd((unsigned int*)arrived, nb);
89
+ );
90
+ }
91
+
92
+ return oldArrive;
93
+ }
94
+
95
+
96
+ _CG_STATIC_QUALIFIER void sync_grids_wait(unsigned int oldArrive, volatile barrier_t *arrived) {
97
+ if (is_cta_master()) {
98
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
99
+ unsigned int current_arrive;
100
+ do {
101
+ asm volatile("ld.acquire.gpu.u32 %0,[%1];" : "=r"(current_arrive) : _CG_ASM_PTR_CONSTRAINT((unsigned int *)arrived) : "memory");
102
+ } while (!bar_has_flipped(oldArrive, current_arrive));
103
+ ,
104
+ while (!bar_has_flipped(oldArrive, *arrived));
105
+ __threadfence();
106
+ );
107
+ }
108
+
109
+ __barrier_sync(0);
110
+ }
111
+
112
+ /* - Multi warp groups synchronization routines - */
113
+
114
+ #ifdef _CG_CPP11_FEATURES
115
+ // Need both acquire and release for the last warp, since it won't be able to acquire with red.and
116
+ _CG_STATIC_QUALIFIER unsigned int atom_or_acq_rel_cta(unsigned int *addr, unsigned int val) {
117
+ unsigned int old;
118
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
119
+ (asm volatile("atom.or.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(old) : _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
120
+ ,
121
+ (__threadfence_block();
122
+ old = atomicOr(addr, val);)
123
+ );
124
+ return old;
125
+ }
126
+
127
+ // Special case where barrier is arrived, but not waited on
128
+ _CG_STATIC_QUALIFIER void red_or_release_cta(unsigned int *addr, unsigned int val) {
129
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
130
+ (asm volatile("red.or.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
131
+ ,
132
+ (__threadfence_block();
133
+ atomicOr(addr, val);)
134
+ );
135
+ }
136
+
137
+ // Usually called by last arriving warp to released other warps, can be relaxed, since or was already acq_rel
138
+ _CG_STATIC_QUALIFIER void red_and_relaxed_cta(unsigned int *addr, unsigned int val) {
139
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
140
+ (asm volatile("red.and.relaxed.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
141
+ ,
142
+ (atomicAnd(addr, val);)
143
+ );
144
+ }
145
+
146
+ // Special case of release, where last warp was doing extra work before releasing others, need to be release
147
+ // to ensure that extra work is visible
148
+ _CG_STATIC_QUALIFIER void red_and_release_cta(unsigned int *addr, unsigned int val) {
149
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
150
+ (asm volatile("red.and.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
151
+ ,
152
+ (__threadfence_block();
153
+ atomicAnd(addr, val);)
154
+ );
155
+ }
156
+
157
+ // Read the barrier, acquire to ensure all memory operations following the sync are correctly performed after it is released
158
+ _CG_STATIC_QUALIFIER unsigned int ld_acquire_cta(unsigned int *addr) {
159
+ unsigned int val;
160
+ NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
161
+ (asm volatile("ld.acquire.cta.u32 %0,[%1];" : "=r"(val) : _CG_ASM_PTR_CONSTRAINT(addr) : "memory");)
162
+ ,
163
+ (val = *((volatile unsigned int*) addr);
164
+ __threadfence_block();)
165
+ );
166
+ return val;
167
+ }
168
+
169
+ // Get synchronization bit mask of my thread_block_tile of size num_warps. Thread ranks 0..31 have the first bit assigned to them,
170
+ // thread ranks 32..63 second etc
171
+ // Bit masks are unique for each group, groups of the same size will have the same number of bits set, but on different positions
172
+ _CG_STATIC_QUALIFIER unsigned int get_group_mask(unsigned int thread_rank, unsigned int num_warps) {
173
+ return num_warps == 32 ? ~0 : ((1 << num_warps) - 1) << (num_warps * (thread_rank / (num_warps * 32)));
174
+ }
175
+
176
+ _CG_STATIC_QUALIFIER void barrier_wait(barrier_t *arrived, unsigned int warp_bit) {
177
+ while(ld_acquire_cta(arrived) & warp_bit);
178
+ }
179
+
180
+ // Default blocking sync.
181
+ _CG_STATIC_QUALIFIER void sync_warps(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
182
+ unsigned int warp_id = thread_rank / 32;
183
+ bool warp_master = (thread_rank % 32 == 0);
184
+ unsigned int warp_bit = 1 << warp_id;
185
+ unsigned int group_mask = get_group_mask(thread_rank, num_warps);
186
+
187
+ __syncwarp(0xFFFFFFFF);
188
+
189
+ if (warp_master) {
190
+ unsigned int old = atom_or_acq_rel_cta(arrived, warp_bit);
191
+ if (((old | warp_bit) & group_mask) == group_mask) {
192
+ red_and_relaxed_cta(arrived, ~group_mask);
193
+ }
194
+ else {
195
+ barrier_wait(arrived, warp_bit);
196
+ }
197
+ }
198
+
199
+ __syncwarp(0xFFFFFFFF);
200
+ }
201
+
202
+ // Blocking sync, except the last arriving warp, that releases other warps, returns to do other stuff first.
203
+ // Warp returning true from this function needs to call sync_warps_release.
204
+ _CG_STATIC_QUALIFIER bool sync_warps_last_releases(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
205
+ unsigned int warp_id = thread_rank / 32;
206
+ bool warp_master = (thread_rank % 32 == 0);
207
+ unsigned int warp_bit = 1 << warp_id;
208
+ unsigned int group_mask = get_group_mask(thread_rank, num_warps);
209
+
210
+ __syncwarp(0xFFFFFFFF);
211
+
212
+ unsigned int old = 0;
213
+ if (warp_master) {
214
+ old = atom_or_acq_rel_cta(arrived, warp_bit);
215
+ }
216
+ old = __shfl_sync(0xFFFFFFFF, old, 0);
217
+ if (((old | warp_bit) & group_mask) == group_mask) {
218
+ return true;
219
+ }
220
+ barrier_wait(arrived, warp_bit);
221
+
222
+ return false;
223
+ }
224
+
225
+ // Release my group from the barrier.
226
+ _CG_STATIC_QUALIFIER void sync_warps_release(barrier_t *arrived, bool is_master, unsigned int thread_rank, unsigned int num_warps) {
227
+ unsigned int group_mask = get_group_mask(thread_rank, num_warps);
228
+ if (is_master) {
229
+ red_and_release_cta(arrived, ~group_mask);
230
+ }
231
+ }
232
+
233
+ // Arrive at my group barrier, but don't block or release the barrier, even if every one arrives.
234
+ // sync_warps_release needs to be called by some warp after this one to reset the barrier.
235
+ _CG_STATIC_QUALIFIER void sync_warps_arrive(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
236
+ unsigned int warp_id = thread_rank / 32;
237
+ bool warp_master = (thread_rank % 32 == 0);
238
+ unsigned int warp_bit = 1 << warp_id;
239
+ unsigned int group_mask = get_group_mask(thread_rank, num_warps);
240
+
241
+ __syncwarp(0xFFFFFFFF);
242
+
243
+ if (warp_master) {
244
+ red_or_release_cta(arrived, warp_bit);
245
+ }
246
+ }
247
+
248
+ // Wait for my warp to be released from the barrier. Warp must have arrived first.
249
+ _CG_STATIC_QUALIFIER void sync_warps_wait(barrier_t *arrived, unsigned int thread_rank) {
250
+ unsigned int warp_id = thread_rank / 32;
251
+ unsigned int warp_bit = 1 << warp_id;
252
+
253
+ barrier_wait(arrived, warp_bit);
254
+ }
255
+
256
+ // Wait for specific warp to arrive at the barrier
257
+ _CG_QUALIFIER void sync_warps_wait_for_specific_warp(barrier_t *arrived, unsigned int wait_warp_id) {
258
+ unsigned int wait_mask = 1 << wait_warp_id;
259
+ while((ld_acquire_cta(arrived) & wait_mask) != wait_mask);
260
+ }
261
+
262
+ // Initialize the bit corresponding to my warp in the barrier
263
+ _CG_QUALIFIER void sync_warps_reset(barrier_t *arrived, unsigned int thread_rank) {
264
+ unsigned int warp_id = thread_rank / 32;
265
+ unsigned int warp_bit = 1 << warp_id;
266
+
267
+ __syncwarp(0xFFFFFFFF);
268
+
269
+ if (thread_rank % 32 == 0) {
270
+ red_and_release_cta(arrived, ~warp_bit);
271
+ }
272
+ // No need to sync after the atomic, there will be a sync of the group that is being partitioned right after this.
273
+ }
274
+
275
+ #endif
276
+
277
+ } // details
278
+
279
+ _CG_END_NAMESPACE
280
+
281
+ #endif // _CG_GRID_H
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/memcpy_async.h ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_MEMCPY_ASYNC // include guard
50
+ #define _COOPERATIVE_GROUPS_MEMCPY_ASYNC
51
+
52
+ #include "../cooperative_groups.h"
53
+ #include "details/info.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES // presumably set by one of the headers included above when C++11 is available
56
+ # include "details/async.h"
57
+ #else // no C++11 support: fail the build with an explanatory message
58
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
59
+ -std=c++11 compiler option.
60
+ #endif // _CG_CPP11_FEATURES
61
+
62
+ #endif // _COOPERATIVE_GROUPS_MEMCPY_ASYNC
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/reduce.h ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_REDUCE_H // include guard
50
+ #define _COOPERATIVE_GROUPS_REDUCE_H
51
+
52
+ #include "../cooperative_groups.h"
53
+ #include "details/info.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES // presumably set by one of the headers included above when C++11 is available
56
+ # include "details/reduce.h"
57
+ #else // no C++11 support: fail the build with an explanatory message
58
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
59
+ -std=c++11 compiler option.
60
+ #endif // _CG_CPP11_FEATURES
61
+
62
+
63
+ #endif // _COOPERATIVE_GROUPS_REDUCE_H
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia/cuda_runtime/include/cooperative_groups/scan.h ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_SCAN_H // include guard
50
+ #define _COOPERATIVE_GROUPS_SCAN_H
51
+
52
+ #include "../cooperative_groups.h"
53
+ #include "details/info.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES // presumably set by one of the headers included above when C++11 is available
56
+ # include "details/scan.h"
57
+ #else // no C++11 support: fail the build with an explanatory message
58
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
59
+ -std=c++11 compiler option.
60
+ #endif // _CG_CPP11_FEATURES
61
+
62
+
63
+ #endif // _COOPERATIVE_GROUPS_SCAN_H