koichi12 commited on
Commit
211e5eb
·
verified ·
1 Parent(s): dbf954e

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/LICENSE +29 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/METADATA +167 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/test_mpmath.py +7 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/test_str.py +14 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/test_visualization.py +32 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/__init__.py +0 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer_v8.h +658 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_train_v8.h +540 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend_v8.h +600 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_train.h +219 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_train_v8.h +219 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_train_v8.h +501 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_v8.h +78 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version_v8.h +70 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/__init__.py +0 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/__init__.py +0 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/__init__.py +0 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_kernel.h +1665 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_host.h +516 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_poisson.h +751 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_uniform.h +498 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverSp.h +923 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/locators.cpython-311.pyc +0 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/markers.cpython-311.pyc +0 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/metadata.cpython-311.pyc +0 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/resources.cpython-311.pyc +0 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/scripts.cpython-311.pyc +0 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/util.cpython-311.pyc +0 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/version.cpython-311.pyc +0 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/wheel.cpython-311.pyc +0 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distro/__pycache__/__init__.cpython-311.pyc +0 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distro/py.typed +0 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_emoji_codes.py +0 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_wrap.py +93 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/constrain.py +37 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/file_proxy.py +57 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/highlighter.py +232 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/json.py +139 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/layout.py +442 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/progress_bar.py +223 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/syntax.py +958 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/tree.py +249 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/INSTALLER +1 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/METADATA +220 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/intrinsic/__init__.py +36 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_fused.cpython-311.pyc +0 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-311.pyc +0 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/__init__.py +19 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/__init__.cpython-311.pyc +0 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py +180 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/LICENSE ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2018, Martin Durant
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ * Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ * Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ * Neither the name of the copyright holder nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/METADATA ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: fsspec
3
+ Version: 2024.2.0
4
+ Summary: File-system specification
5
+ Home-page: https://github.com/fsspec/filesystem_spec
6
+ Maintainer: Martin Durant
7
+ Maintainer-email: mdurant@anaconda.com
8
+ License: BSD
9
+ Project-URL: Changelog, https://filesystem-spec.readthedocs.io/en/latest/changelog.html
10
+ Project-URL: Documentation, https://filesystem-spec.readthedocs.io/en/latest/
11
+ Keywords: file
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: BSD License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Requires-Python: >=3.8
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Provides-Extra: abfs
24
+ Requires-Dist: adlfs ; extra == 'abfs'
25
+ Provides-Extra: adl
26
+ Requires-Dist: adlfs ; extra == 'adl'
27
+ Provides-Extra: arrow
28
+ Requires-Dist: pyarrow >=1 ; extra == 'arrow'
29
+ Provides-Extra: dask
30
+ Requires-Dist: dask ; extra == 'dask'
31
+ Requires-Dist: distributed ; extra == 'dask'
32
+ Provides-Extra: devel
33
+ Requires-Dist: pytest ; extra == 'devel'
34
+ Requires-Dist: pytest-cov ; extra == 'devel'
35
+ Provides-Extra: dropbox
36
+ Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
37
+ Requires-Dist: requests ; extra == 'dropbox'
38
+ Requires-Dist: dropbox ; extra == 'dropbox'
39
+ Provides-Extra: entrypoints
40
+ Provides-Extra: full
41
+ Requires-Dist: adlfs ; extra == 'full'
42
+ Requires-Dist: aiohttp !=4.0.0a0,!=4.0.0a1 ; extra == 'full'
43
+ Requires-Dist: dask ; extra == 'full'
44
+ Requires-Dist: distributed ; extra == 'full'
45
+ Requires-Dist: dropbox ; extra == 'full'
46
+ Requires-Dist: dropboxdrivefs ; extra == 'full'
47
+ Requires-Dist: fusepy ; extra == 'full'
48
+ Requires-Dist: gcsfs ; extra == 'full'
49
+ Requires-Dist: libarchive-c ; extra == 'full'
50
+ Requires-Dist: ocifs ; extra == 'full'
51
+ Requires-Dist: panel ; extra == 'full'
52
+ Requires-Dist: paramiko ; extra == 'full'
53
+ Requires-Dist: pyarrow >=1 ; extra == 'full'
54
+ Requires-Dist: pygit2 ; extra == 'full'
55
+ Requires-Dist: requests ; extra == 'full'
56
+ Requires-Dist: s3fs ; extra == 'full'
57
+ Requires-Dist: smbprotocol ; extra == 'full'
58
+ Requires-Dist: tqdm ; extra == 'full'
59
+ Provides-Extra: fuse
60
+ Requires-Dist: fusepy ; extra == 'fuse'
61
+ Provides-Extra: gcs
62
+ Requires-Dist: gcsfs ; extra == 'gcs'
63
+ Provides-Extra: git
64
+ Requires-Dist: pygit2 ; extra == 'git'
65
+ Provides-Extra: github
66
+ Requires-Dist: requests ; extra == 'github'
67
+ Provides-Extra: gs
68
+ Requires-Dist: gcsfs ; extra == 'gs'
69
+ Provides-Extra: gui
70
+ Requires-Dist: panel ; extra == 'gui'
71
+ Provides-Extra: hdfs
72
+ Requires-Dist: pyarrow >=1 ; extra == 'hdfs'
73
+ Provides-Extra: http
74
+ Requires-Dist: aiohttp !=4.0.0a0,!=4.0.0a1 ; extra == 'http'
75
+ Provides-Extra: libarchive
76
+ Requires-Dist: libarchive-c ; extra == 'libarchive'
77
+ Provides-Extra: oci
78
+ Requires-Dist: ocifs ; extra == 'oci'
79
+ Provides-Extra: s3
80
+ Requires-Dist: s3fs ; extra == 's3'
81
+ Provides-Extra: sftp
82
+ Requires-Dist: paramiko ; extra == 'sftp'
83
+ Provides-Extra: smb
84
+ Requires-Dist: smbprotocol ; extra == 'smb'
85
+ Provides-Extra: ssh
86
+ Requires-Dist: paramiko ; extra == 'ssh'
87
+ Provides-Extra: tqdm
88
+ Requires-Dist: tqdm ; extra == 'tqdm'
89
+
90
+ # filesystem_spec
91
+
92
+ [![PyPI version](https://badge.fury.io/py/fsspec.svg)](https://pypi.python.org/pypi/fsspec/)
93
+ [![Anaconda-Server Badge](https://anaconda.org/conda-forge/fsspec/badges/version.svg)](https://anaconda.org/conda-forge/fsspec)
94
+ ![Build](https://github.com/fsspec/filesystem_spec/workflows/CI/badge.svg)
95
+ [![Docs](https://readthedocs.org/projects/filesystem-spec/badge/?version=latest)](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest)
96
+ [![PyPi downloads](https://img.shields.io/pypi/dm/fsspec?label=pypi%20downloads&style=flat)](https://pepy.tech/project/fsspec)
97
+
98
+ A specification for pythonic filesystems.
99
+
100
+ ## Install
101
+
102
+ ```bash
103
+ pip install fsspec
104
+ ```
105
+
106
+ would install the base fsspec. Various optionally supported features might require specification of custom
107
+ extra require, e.g. `pip install fsspec[ssh]` will install dependencies for `ssh` backends support.
108
+ Use `pip install fsspec[full]` for installation of all known extra dependencies.
109
+
110
+ Up-to-date package also provided through conda-forge distribution:
111
+
112
+ ```bash
113
+ conda install -c conda-forge fsspec
114
+ ```
115
+
116
+
117
+ ## Purpose
118
+
119
+ To produce a template or specification for a file-system interface, that specific implementations should follow,
120
+ so that applications making use of them can rely on a common behaviour and not have to worry about the specific
121
+ internal implementation decisions with any given backend. Many such implementations are included in this package,
122
+ or in sister projects such as `s3fs` and `gcsfs`.
123
+
124
+ In addition, if this is well-designed, then additional functionality, such as a key-value store or FUSE
125
+ mounting of the file-system implementation may be available for all implementations "for free".
126
+
127
+ ## Documentation
128
+
129
+ Please refer to [RTD](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest)
130
+
131
+ ## Develop
132
+
133
+ fsspec uses GitHub Actions for CI. Environment files can be found
134
+ in the "ci/" directory. Note that the main environment is called "py38",
135
+ but it is expected that the version of python installed be adjustable at
136
+ CI runtime. For local use, pick a version suitable for you.
137
+
138
+ ### Testing
139
+
140
+ Tests can be run in the dev environment, if activated, via ``pytest fsspec``.
141
+
142
+ The full fsspec suite requires a system-level docker, docker-compose, and fuse
143
+ installation. If only making changes to one backend implementation, it is
144
+ not generally necessary to run all tests locally.
145
+
146
+ It is expected that contributors ensure that any change to fsspec does not
147
+ cause issues or regressions for either other fsspec-related packages such
148
+ as gcsfs and s3fs, nor for downstream users of fsspec. The "downstream" CI
149
+ run and corresponding environment file run a set of tests from the dask
150
+ test suite, and very minimal tests against pandas and zarr from the
151
+ test_downstream.py module in this repo.
152
+
153
+ ### Code Formatting
154
+
155
+ fsspec uses [Black](https://black.readthedocs.io/en/stable) to ensure
156
+ a consistent code format throughout the project.
157
+ Run ``black fsspec`` from the root of the filesystem_spec repository to
158
+ auto-format your code. Additionally, many editors have plugins that will apply
159
+ ``black`` as you edit files. ``black`` is included in the ``tox`` environments.
160
+
161
+ Optionally, you may wish to setup [pre-commit hooks](https://pre-commit.com) to
162
+ automatically run ``black`` when you make a git commit.
163
+ Run ``pre-commit install --install-hooks`` from the root of the
164
+ filesystem_spec repository to setup pre-commit hooks. ``black`` will now be run
165
+ before you commit, reformatting any changed files. You can format without
166
+ committing via ``pre-commit run`` or skip these checks with ``git commit
167
+ --no-verify``.
tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/test_mpmath.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from mpmath.libmp import *
2
+ from mpmath import *
3
+
4
+ def test_newstyle_classes():
5
+ for cls in [mp, fp, iv, mpf, mpc]:
6
+ for s in cls.__class__.__mro__:
7
+ assert isinstance(s, type)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/test_str.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mpmath import nstr, matrix, inf
2
+
3
+ def test_nstr():
4
+ m = matrix([[0.75, 0.190940654, -0.0299195971],
5
+ [0.190940654, 0.65625, 0.205663228],
6
+ [-0.0299195971, 0.205663228, 0.64453125e-20]])
7
+ assert nstr(m, 4, min_fixed=-inf) == \
8
+ '''[ 0.75 0.1909 -0.02992]
9
+ [ 0.1909 0.6563 0.2057]
10
+ [-0.02992 0.2057 0.000000000000000000006445]'''
11
+ assert nstr(m, 4) == \
12
+ '''[ 0.75 0.1909 -0.02992]
13
+ [ 0.1909 0.6563 0.2057]
14
+ [-0.02992 0.2057 6.445e-21]'''
tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/test_visualization.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Limited tests of the visualization module. Right now it just makes
3
+ sure that passing custom Axes works.
4
+
5
+ """
6
+
7
+ from mpmath import mp, fp
8
+
9
+ def test_axes():
10
+ try:
11
+ import matplotlib
12
+ version = matplotlib.__version__.split("-")[0]
13
+ version = version.split(".")[:2]
14
+ if [int(_) for _ in version] < [0,99]:
15
+ raise ImportError
16
+ import pylab
17
+ except ImportError:
18
+ print("\nSkipping test (pylab not available or too old version)\n")
19
+ return
20
+ fig = pylab.figure()
21
+ axes = fig.add_subplot(111)
22
+ for ctx in [mp, fp]:
23
+ ctx.plot(lambda x: x**2, [0, 3], axes=axes)
24
+ assert axes.get_xlabel() == 'x'
25
+ assert axes.get_ylabel() == 'f(x)'
26
+
27
+ fig = pylab.figure()
28
+ axes = fig.add_subplot(111)
29
+ for ctx in [mp, fp]:
30
+ ctx.cplot(lambda z: z, [-2, 2], [-10, 10], axes=axes)
31
+ assert axes.get_xlabel() == 'Re(z)'
32
+ assert axes.get_ylabel() == 'Im(z)'
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer_v8.h ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn_adv_infer : cuDNN's advanced and experimental features.
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_ADV_INFER_H_)
55
+ #define CUDNN_ADV_INFER_H_
56
+
57
+ #include <cuda_runtime.h>
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops_infer.h"
62
+
63
+ /* These version numbers are autogenerated, do not edit manually. */
64
+ #define CUDNN_ADV_INFER_MAJOR 8
65
+ #define CUDNN_ADV_INFER_MINOR 7
66
+ #define CUDNN_ADV_INFER_PATCH 0
67
+
68
+ #if (CUDNN_ADV_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_INFER_MINOR != CUDNN_MINOR) || \
69
+ (CUDNN_ADV_INFER_PATCH != CUDNN_PATCHLEVEL)
70
+ #error Version mismatch in cuDNN ADV INFER!!!
71
+ #endif
72
+
73
+ #if defined(__cplusplus)
74
+ extern "C" {
75
+ #endif
76
+
77
+ /* BASIC RNN API */
78
+
79
+ typedef enum {
80
+ CUDNN_FWD_MODE_INFERENCE = 0,
81
+ CUDNN_FWD_MODE_TRAINING = 1,
82
+ } cudnnForwardMode_t;
83
+
84
+ typedef enum {
85
+ CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */
86
+ CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */
87
+ CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */
88
+ CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
89
+ } cudnnRNNMode_t;
90
+
91
+ typedef enum {
92
+ CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */
93
+ CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */
94
+ CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */
95
+ CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */
96
+ } cudnnRNNBiasMode_t;
97
+
98
+ typedef enum {
99
+ CUDNN_UNIDIRECTIONAL = 0, /* single direction network */
100
+ CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */
101
+ } cudnnDirectionMode_t;
102
+
103
+ typedef enum {
104
+ CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */
105
+ CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */
106
+ } cudnnRNNInputMode_t;
107
+
108
+ typedef enum {
109
+ CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */
110
+ CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */
111
+ } cudnnRNNClipMode_t;
112
+
113
+ typedef enum {
114
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */
115
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */
116
+ CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */
117
+ } cudnnRNNDataLayout_t;
118
+
119
+ /* Legacy type for backward compatibility */
120
+ typedef unsigned cudnnRNNPaddingMode_t;
121
+
122
+ /* For auxFlags in cudnnSetRNNDescriptor_v8() and cudnnSetRNNPaddingMode() */
123
+ #define CUDNN_RNN_PADDED_IO_DISABLED 0
124
+ #define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0)
125
+
126
+ struct cudnnRNNStruct;
127
+ typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t;
128
+
129
+ struct cudnnPersistentRNNPlan;
130
+ typedef struct cudnnPersistentRNNPlan *cudnnPersistentRNNPlan_t;
131
+
132
+ struct cudnnRNNDataStruct;
133
+ typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t;
134
+
135
+ cudnnStatus_t CUDNNWINAPI
136
+ cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc);
140
+
141
+ cudnnStatus_t CUDNNWINAPI
142
+ cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
143
+ cudnnRNNAlgo_t algo,
144
+ cudnnRNNMode_t cellMode,
145
+ cudnnRNNBiasMode_t biasMode,
146
+ cudnnDirectionMode_t dirMode,
147
+ cudnnRNNInputMode_t inputMode,
148
+ cudnnDataType_t dataType,
149
+ cudnnDataType_t mathPrec,
150
+ cudnnMathType_t mathType,
151
+ int32_t inputSize,
152
+ int32_t hiddenSize,
153
+ int32_t projSize,
154
+ int32_t numLayers,
155
+ cudnnDropoutDescriptor_t dropoutDesc,
156
+ uint32_t auxFlags);
157
+
158
+ cudnnStatus_t CUDNNWINAPI
159
+ cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
160
+ cudnnRNNAlgo_t *algo,
161
+ cudnnRNNMode_t *cellMode,
162
+ cudnnRNNBiasMode_t *biasMode,
163
+ cudnnDirectionMode_t *dirMode,
164
+ cudnnRNNInputMode_t *inputMode,
165
+ cudnnDataType_t *dataType,
166
+ cudnnDataType_t *mathPrec,
167
+ cudnnMathType_t *mathType,
168
+ int32_t *inputSize,
169
+ int32_t *hiddenSize,
170
+ int32_t *projSize,
171
+ int32_t *numLayers,
172
+ cudnnDropoutDescriptor_t *dropoutDesc,
173
+ uint32_t *auxFlags);
174
+
175
+ /*
176
+ * mathPrec in cudnnSetRNNDescriptor_v6() specifies compute precision
177
+ * compute precision is further modified by cudnnSetRNNMatrixMathType()
178
+ * dataType in cudnnGetRNNParamsSize() and wDesc specify weight storage
179
+ * dropout is between RNN layers, not between recurrent steps
180
+ */
181
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
182
+ cudnnSetRNNDescriptor_v6(cudnnHandle_t handle,
183
+ cudnnRNNDescriptor_t rnnDesc,
184
+ const int hiddenSize,
185
+ const int numLayers,
186
+ cudnnDropoutDescriptor_t dropoutDesc,
187
+ cudnnRNNInputMode_t inputMode,
188
+ cudnnDirectionMode_t direction,
189
+ cudnnRNNMode_t cellMode,
190
+ cudnnRNNAlgo_t algo,
191
+ cudnnDataType_t mathPrec);
192
+
193
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
194
+ cudnnGetRNNDescriptor_v6(cudnnHandle_t handle,
195
+ cudnnRNNDescriptor_t rnnDesc,
196
+ int *hiddenSize,
197
+ int *numLayers,
198
+ cudnnDropoutDescriptor_t *dropoutDesc,
199
+ cudnnRNNInputMode_t *inputMode,
200
+ cudnnDirectionMode_t *direction,
201
+ cudnnRNNMode_t *cellMode,
202
+ cudnnRNNAlgo_t *algo,
203
+ cudnnDataType_t *mathPrec);
204
+
205
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
206
+ cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType);
207
+
208
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
209
+ cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType);
210
+
211
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
212
+ cudnnSetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t biasMode);
213
+
214
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
215
+ cudnnGetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t *biasMode);
216
+
217
+ cudnnStatus_t CUDNNWINAPI
218
+ cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc,
219
+ cudnnRNNClipMode_t clipMode,
220
+ cudnnNanPropagation_t clipNanOpt,
221
+ double lclip,
222
+ double rclip);
223
+
224
+ cudnnStatus_t CUDNNWINAPI
225
+ cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc,
226
+ cudnnRNNClipMode_t *clipMode,
227
+ cudnnNanPropagation_t *clipNanOpt,
228
+ double *lclip,
229
+ double *rclip);
230
+
231
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
232
+ cudnnRNNSetClip(cudnnHandle_t handle,
233
+ cudnnRNNDescriptor_t rnnDesc,
234
+ cudnnRNNClipMode_t clipMode,
235
+ cudnnNanPropagation_t clipNanOpt,
236
+ double lclip,
237
+ double rclip);
238
+
239
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
240
+ cudnnRNNGetClip(cudnnHandle_t handle,
241
+ cudnnRNNDescriptor_t rnnDesc,
242
+ cudnnRNNClipMode_t *clipMode,
243
+ cudnnNanPropagation_t *clipNanOpt,
244
+ double *lclip,
245
+ double *rclip);
246
+
247
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
248
+ cudnnSetRNNProjectionLayers(cudnnHandle_t handle,
249
+ cudnnRNNDescriptor_t rnnDesc,
250
+ const int recProjSize,
251
+ const int outProjSize);
252
+
253
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
254
+ cudnnGetRNNProjectionLayers(cudnnHandle_t handle,
255
+ const cudnnRNNDescriptor_t rnnDesc,
256
+ int *recProjSize,
257
+ int *outProjSize);
258
+
259
+ /* Expensive. Creates the plan for the specific settings. */
260
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
261
+ cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc,
262
+ const int minibatch,
263
+ const cudnnDataType_t dataType,
264
+ cudnnPersistentRNNPlan_t *plan);
265
+
266
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
267
+ cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan);
268
+
269
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
270
+ cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan);
271
+
272
+ cudnnStatus_t CUDNNWINAPI
273
+ cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch);
274
+
275
+ /* dataType in weight descriptors and input descriptors is used to describe storage */
276
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
277
+ cudnnGetRNNWorkspaceSize(cudnnHandle_t handle,
278
+ const cudnnRNNDescriptor_t rnnDesc,
279
+ const int seqLength,
280
+ const cudnnTensorDescriptor_t *xDesc,
281
+ size_t *sizeInBytes);
282
+
283
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
284
+ cudnnGetRNNTrainingReserveSize(cudnnHandle_t handle,
285
+ const cudnnRNNDescriptor_t rnnDesc,
286
+ const int seqLength,
287
+ const cudnnTensorDescriptor_t *xDesc,
288
+ size_t *sizeInBytes);
289
+
290
+ cudnnStatus_t CUDNNWINAPI
291
+ cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle,
292
+ cudnnRNNDescriptor_t rnnDesc,
293
+ cudnnForwardMode_t fMode,
294
+ cudnnRNNDataDescriptor_t xDesc,
295
+ size_t *workSpaceSize,
296
+ size_t *reserveSpaceSize);
297
+
298
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
299
+ cudnnGetRNNParamsSize(cudnnHandle_t handle,
300
+ const cudnnRNNDescriptor_t rnnDesc,
301
+ const cudnnTensorDescriptor_t xDesc,
302
+ size_t *sizeInBytes,
303
+ cudnnDataType_t dataType);
304
+
305
+ cudnnStatus_t CUDNNWINAPI
306
+ cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize);
307
+
308
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
309
+ cudnnGetRNNLinLayerMatrixParams(cudnnHandle_t handle,
310
+ const cudnnRNNDescriptor_t rnnDesc,
311
+ const int pseudoLayer,
312
+ const cudnnTensorDescriptor_t xDesc,
313
+ const cudnnFilterDescriptor_t wDesc,
314
+ const void *w,
315
+ const int linLayerID,
316
+ cudnnFilterDescriptor_t linLayerMatDesc,
317
+ void **linLayerMat);
318
+
319
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
320
+ cudnnGetRNNLinLayerBiasParams(cudnnHandle_t handle,
321
+ const cudnnRNNDescriptor_t rnnDesc,
322
+ const int pseudoLayer,
323
+ const cudnnTensorDescriptor_t xDesc,
324
+ const cudnnFilterDescriptor_t wDesc,
325
+ const void *w,
326
+ const int linLayerID,
327
+ cudnnFilterDescriptor_t linLayerBiasDesc,
328
+ void **linLayerBias);
329
+
330
+ cudnnStatus_t CUDNNWINAPI
331
+ cudnnGetRNNWeightParams(cudnnHandle_t handle,
332
+ cudnnRNNDescriptor_t rnnDesc,
333
+ int32_t pseudoLayer,
334
+ size_t weightSpaceSize,
335
+ const void *weightSpace,
336
+ int32_t linLayerID,
337
+ cudnnTensorDescriptor_t mDesc,
338
+ void **mAddr,
339
+ cudnnTensorDescriptor_t bDesc,
340
+ void **bAddr);
341
+
342
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
343
+ cudnnRNNForwardInference(cudnnHandle_t handle,
344
+ const cudnnRNNDescriptor_t rnnDesc,
345
+ const int seqLength,
346
+ const cudnnTensorDescriptor_t *xDesc,
347
+ const void *x,
348
+ const cudnnTensorDescriptor_t hxDesc,
349
+ const void *hx,
350
+ const cudnnTensorDescriptor_t cxDesc,
351
+ const void *cx,
352
+ const cudnnFilterDescriptor_t wDesc,
353
+ const void *w,
354
+ const cudnnTensorDescriptor_t *yDesc,
355
+ void *y,
356
+ const cudnnTensorDescriptor_t hyDesc,
357
+ void *hy,
358
+ const cudnnTensorDescriptor_t cyDesc,
359
+ void *cy,
360
+ void *workSpace,
361
+ size_t workSpaceSizeInBytes);
362
+
363
+ /* RNN EX API */
364
+
365
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
366
+ cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned paddingMode);
367
+
368
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
369
+ cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned *paddingMode);
370
+
371
+ cudnnStatus_t CUDNNWINAPI
372
+ cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc);
373
+
374
+ cudnnStatus_t CUDNNWINAPI
375
+ cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc);
376
+
377
+ cudnnStatus_t CUDNNWINAPI
378
+ cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
379
+ cudnnDataType_t dataType,
380
+ cudnnRNNDataLayout_t layout,
381
+ int maxSeqLength,
382
+ int batchSize,
383
+ int vectorSize,
384
+ const int seqLengthArray[], /* length of each sequence in the batch */
385
+ void *paddingFill); /* symbol for filling padding position in output */
386
+
387
+ cudnnStatus_t CUDNNWINAPI
388
+ cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
389
+ cudnnDataType_t *dataType,
390
+ cudnnRNNDataLayout_t *layout,
391
+ int *maxSeqLength,
392
+ int *batchSize,
393
+ int *vectorSize,
394
+ int arrayLengthRequested,
395
+ int seqLengthArray[],
396
+ void *paddingFill);
397
+
398
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
399
+ cudnnRNNForwardInferenceEx(cudnnHandle_t handle,
400
+ const cudnnRNNDescriptor_t rnnDesc,
401
+ const cudnnRNNDataDescriptor_t xDesc,
402
+ const void *x,
403
+ const cudnnTensorDescriptor_t hxDesc,
404
+ const void *hx,
405
+ const cudnnTensorDescriptor_t cxDesc,
406
+ const void *cx,
407
+ const cudnnFilterDescriptor_t wDesc,
408
+ const void *w,
409
+ const cudnnRNNDataDescriptor_t yDesc,
410
+ void *y,
411
+ const cudnnTensorDescriptor_t hyDesc,
412
+ void *hy,
413
+ const cudnnTensorDescriptor_t cyDesc,
414
+ void *cy,
415
+ const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
416
+ const void *keys, /* reserved, should pass NULL */
417
+ const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
418
+ void *cAttn, /* reserved, should pass NULL */
419
+ const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
420
+ void *iAttn, /* reserved, should pass NULL */
421
+ const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
422
+ void *queries, /* reserved, should pass NULL */
423
+ void *workSpace,
424
+ size_t workSpaceSizeInBytes);
425
+
426
+ cudnnStatus_t CUDNNWINAPI
427
+ cudnnRNNForward(cudnnHandle_t handle,
428
+ cudnnRNNDescriptor_t rnnDesc,
429
+ cudnnForwardMode_t fwdMode,
430
+ const int32_t devSeqLengths[],
431
+ cudnnRNNDataDescriptor_t xDesc,
432
+ const void *x,
433
+ cudnnRNNDataDescriptor_t yDesc,
434
+ void *y,
435
+ cudnnTensorDescriptor_t hDesc,
436
+ const void *hx,
437
+ void *hy,
438
+ cudnnTensorDescriptor_t cDesc,
439
+ const void *cx,
440
+ void *cy,
441
+ size_t weightSpaceSize,
442
+ const void *weightSpace,
443
+ size_t workSpaceSize,
444
+ void *workSpace,
445
+ size_t reserveSpaceSize,
446
+ void *reserveSpace);
447
+
448
+ /* RNN FIND API */
449
+
450
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
451
+ cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, cudnnAlgorithmDescriptor_t algoDesc);
452
+
453
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
454
+ cudnnGetRNNForwardInferenceAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
455
+
456
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
457
+ cudnnFindRNNForwardInferenceAlgorithmEx(cudnnHandle_t handle,
458
+ const cudnnRNNDescriptor_t rnnDesc,
459
+ const int seqLength,
460
+ const cudnnTensorDescriptor_t *xDesc,
461
+ const void *x,
462
+ const cudnnTensorDescriptor_t hxDesc,
463
+ const void *hx,
464
+ const cudnnTensorDescriptor_t cxDesc,
465
+ const void *cx,
466
+ const cudnnFilterDescriptor_t wDesc,
467
+ const void *w,
468
+ const cudnnTensorDescriptor_t *yDesc,
469
+ void *y,
470
+ const cudnnTensorDescriptor_t hyDesc,
471
+ void *hy,
472
+ const cudnnTensorDescriptor_t cyDesc,
473
+ void *cy,
474
+ const float findIntensity,
475
+ const int requestedAlgoCount,
476
+ int *returnedAlgoCount,
477
+ cudnnAlgorithmPerformance_t *perfResults,
478
+ void *workspace,
479
+ size_t workSpaceSizeInBytes);
480
+
481
+ /* Sequence data descriptor */
482
+
483
+ typedef enum {
484
+ CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */
485
+ CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */
486
+ CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */
487
+ CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */
488
+ } cudnnSeqDataAxis_t;
489
+
490
+ struct cudnnSeqDataStruct;
491
+ typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t;
492
+
493
+ #define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */
494
+
495
+ cudnnStatus_t CUDNNWINAPI
496
+ cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc);
497
+
498
+ cudnnStatus_t CUDNNWINAPI
499
+ cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc);
500
+
501
+ cudnnStatus_t CUDNNWINAPI
502
+ cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc,
503
+ cudnnDataType_t dataType,
504
+ int nbDims,
505
+ const int dimA[],
506
+ const cudnnSeqDataAxis_t axes[],
507
+ size_t seqLengthArraySize,
508
+ const int seqLengthArray[],
509
+ void *paddingFill);
510
+
511
+ cudnnStatus_t CUDNNWINAPI
512
+ cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc,
513
+ cudnnDataType_t *dataType,
514
+ int *nbDims,
515
+ int nbDimsRequested,
516
+ int dimA[],
517
+ cudnnSeqDataAxis_t axes[],
518
+ size_t *seqLengthArraySize,
519
+ size_t seqLengthSizeRequested,
520
+ int seqLengthArray[],
521
+ void *paddingFill);
522
+
523
+ /* Multihead Attention */
524
+
525
+ /* Legacy type for backward compatibility */
526
+ typedef unsigned cudnnAttnQueryMap_t;
527
+
528
+ /*
529
+ * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor().
530
+ * Use the bitwise OR operator to combine several settings listed below. Additional
531
+ * minor options can be added here w/o changing or introducing new API functions.
532
+ */
533
+ #define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */
534
+ #define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */
535
+ #define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */
536
+ #define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */
537
+
538
+ struct cudnnAttnStruct;
539
+ typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t;
540
+
541
+ cudnnStatus_t CUDNNWINAPI
542
+ cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc);
543
+
544
+ cudnnStatus_t CUDNNWINAPI
545
+ cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc);
546
+
547
+ cudnnStatus_t CUDNNWINAPI
548
+ cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
549
+ unsigned attnMode,
550
+ int nHeads,
551
+ double smScaler,
552
+ cudnnDataType_t dataType,
553
+ cudnnDataType_t computePrec,
554
+ cudnnMathType_t mathType,
555
+ cudnnDropoutDescriptor_t attnDropoutDesc,
556
+ cudnnDropoutDescriptor_t postDropoutDesc,
557
+ int qSize,
558
+ int kSize,
559
+ int vSize,
560
+ int qProjSize,
561
+ int kProjSize,
562
+ int vProjSize,
563
+ int oProjSize,
564
+ int qoMaxSeqLength,
565
+ int kvMaxSeqLength,
566
+ int maxBatchSize,
567
+ int maxBeamSize);
568
+
569
+ cudnnStatus_t CUDNNWINAPI
570
+ cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
571
+ unsigned *attnMode,
572
+ int *nHeads,
573
+ double *smScaler,
574
+ cudnnDataType_t *dataType,
575
+ cudnnDataType_t *computePrec,
576
+ cudnnMathType_t *mathType,
577
+ cudnnDropoutDescriptor_t *attnDropoutDesc,
578
+ cudnnDropoutDescriptor_t *postDropoutDesc,
579
+ int *qSize,
580
+ int *kSize,
581
+ int *vSize,
582
+ int *qProjSize,
583
+ int *kProjSize,
584
+ int *vProjSize,
585
+ int *oProjSize,
586
+ int *qoMaxSeqLength,
587
+ int *kvMaxSeqLength,
588
+ int *maxBatchSize,
589
+ int *maxBeamSize);
590
+
591
+ cudnnStatus_t CUDNNWINAPI
592
+ cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle,
593
+ const cudnnAttnDescriptor_t attnDesc,
594
+ size_t *weightSizeInBytes,
595
+ size_t *workSpaceSizeInBytes,
596
+ size_t *reserveSpaceSizeInBytes);
597
+
598
+ typedef enum {
599
+ CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */
600
+ CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */
601
+ CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */
602
+ CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */
603
+ CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */
604
+ CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */
605
+ CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */
606
+ CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */
607
+ } cudnnMultiHeadAttnWeightKind_t;
608
+
609
+ #define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */
610
+
611
+ cudnnStatus_t CUDNNWINAPI
612
+ cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle,
613
+ const cudnnAttnDescriptor_t attnDesc,
614
+ cudnnMultiHeadAttnWeightKind_t wKind,
615
+ size_t weightSizeInBytes,
616
+ const void *weights,
617
+ cudnnTensorDescriptor_t wDesc,
618
+ void **wAddr);
619
+
620
+ cudnnStatus_t CUDNNWINAPI
621
+ cudnnMultiHeadAttnForward(cudnnHandle_t handle,
622
+ const cudnnAttnDescriptor_t attnDesc,
623
+ int currIdx,
624
+ const int loWinIdx[],
625
+ const int hiWinIdx[],
626
+ const int devSeqLengthsQO[],
627
+ const int devSeqLengthsKV[],
628
+ const cudnnSeqDataDescriptor_t qDesc,
629
+ const void *queries,
630
+ const void *residuals,
631
+ const cudnnSeqDataDescriptor_t kDesc,
632
+ const void *keys,
633
+ const cudnnSeqDataDescriptor_t vDesc,
634
+ const void *values,
635
+ const cudnnSeqDataDescriptor_t oDesc,
636
+ void *out,
637
+ size_t weightSizeInBytes,
638
+ const void *weights,
639
+ size_t workSpaceSizeInBytes,
640
+ void *workSpace,
641
+ size_t reserveSpaceSizeInBytes,
642
+ void *reserveSpace);
643
+
644
+ /*
645
+ * \brief Cross-library version checker.
646
+ * This function is implemented differently in each sub-library. Each sublib
647
+ * checks whether its own version matches that of its dependencies.
648
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
649
+ * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
650
+ */
651
+ cudnnStatus_t CUDNNWINAPI
652
+ cudnnAdvInferVersionCheck(void);
653
+
654
+ #if defined(__cplusplus)
655
+ }
656
+ #endif
657
+
658
+ #endif /* CUDNN_ADV_INFER_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_train_v8.h ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn_adv_train : cuDNN's advanced and experimental features.
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_ADV_TRAIN_H_)
55
+ #define CUDNN_ADV_TRAIN_H_
56
+
57
+ #include <cuda_runtime.h>
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops_infer.h"
62
+ #include "cudnn_ops_train.h"
63
+ #include "cudnn_adv_infer.h"
64
+
65
+ /* These version numbers are autogenerated, do not edit manually. */
66
+ #define CUDNN_ADV_TRAIN_MAJOR 8
67
+ #define CUDNN_ADV_TRAIN_MINOR 7
68
+ #define CUDNN_ADV_TRAIN_PATCH 0
69
+
70
+ #if (CUDNN_ADV_TRAIN_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_TRAIN_MINOR != CUDNN_MINOR) || \
71
+ (CUDNN_ADV_TRAIN_PATCH != CUDNN_PATCHLEVEL)
72
+ #error Version mismatch in cuDNN ADV TRAIN!!!
73
+ #endif
74
+
75
+ #if defined(__cplusplus)
76
+ extern "C" {
77
+ #endif
78
+
79
+ typedef enum {
80
+ CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */
81
+ CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */
82
+ } cudnnWgradMode_t;
83
+
84
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
85
+ cudnnRNNForwardTraining(cudnnHandle_t handle,
86
+ const cudnnRNNDescriptor_t rnnDesc,
87
+ const int seqLength,
88
+ const cudnnTensorDescriptor_t *xDesc,
89
+ const void *x,
90
+ const cudnnTensorDescriptor_t hxDesc,
91
+ const void *hx,
92
+ const cudnnTensorDescriptor_t cxDesc,
93
+ const void *cx,
94
+ const cudnnFilterDescriptor_t wDesc,
95
+ const void *w,
96
+ const cudnnTensorDescriptor_t *yDesc,
97
+ void *y,
98
+ const cudnnTensorDescriptor_t hyDesc,
99
+ void *hy,
100
+ const cudnnTensorDescriptor_t cyDesc,
101
+ void *cy,
102
+ void *workSpace,
103
+ size_t workSpaceSizeInBytes,
104
+ void *reserveSpace,
105
+ size_t reserveSpaceSizeInBytes);
106
+
107
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
108
+ cudnnRNNBackwardData(cudnnHandle_t handle,
109
+ const cudnnRNNDescriptor_t rnnDesc,
110
+ const int seqLength,
111
+ const cudnnTensorDescriptor_t *yDesc,
112
+ const void *y,
113
+ const cudnnTensorDescriptor_t *dyDesc,
114
+ const void *dy,
115
+ const cudnnTensorDescriptor_t dhyDesc,
116
+ const void *dhy,
117
+ const cudnnTensorDescriptor_t dcyDesc,
118
+ const void *dcy,
119
+ const cudnnFilterDescriptor_t wDesc,
120
+ const void *w,
121
+ const cudnnTensorDescriptor_t hxDesc,
122
+ const void *hx,
123
+ const cudnnTensorDescriptor_t cxDesc,
124
+ const void *cx,
125
+ const cudnnTensorDescriptor_t *dxDesc,
126
+ void *dx,
127
+ const cudnnTensorDescriptor_t dhxDesc,
128
+ void *dhx,
129
+ const cudnnTensorDescriptor_t dcxDesc,
130
+ void *dcx,
131
+ void *workSpace,
132
+ size_t workSpaceSizeInBytes,
133
+ void *reserveSpace,
134
+ size_t reserveSpaceSizeInBytes);
135
+
136
+ cudnnStatus_t CUDNNWINAPI
137
+ cudnnRNNBackwardData_v8(cudnnHandle_t handle,
138
+ cudnnRNNDescriptor_t rnnDesc,
139
+ const int32_t devSeqLengths[],
140
+ cudnnRNNDataDescriptor_t yDesc,
141
+ const void *y,
142
+ const void *dy,
143
+ cudnnRNNDataDescriptor_t xDesc,
144
+ void *dx,
145
+ cudnnTensorDescriptor_t hDesc,
146
+ const void *hx,
147
+ const void *dhy,
148
+ void *dhx,
149
+ cudnnTensorDescriptor_t cDesc,
150
+ const void *cx,
151
+ const void *dcy,
152
+ void *dcx,
153
+ size_t weightSpaceSize,
154
+ const void *weightSpace,
155
+ size_t workSpaceSize,
156
+ void *workSpace,
157
+ size_t reserveSpaceSize,
158
+ void *reserveSpace);
159
+
160
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
161
+ cudnnRNNBackwardWeights(cudnnHandle_t handle,
162
+ const cudnnRNNDescriptor_t rnnDesc,
163
+ const int seqLength,
164
+ const cudnnTensorDescriptor_t *xDesc,
165
+ const void *x,
166
+ const cudnnTensorDescriptor_t hxDesc,
167
+ const void *hx,
168
+ const cudnnTensorDescriptor_t *yDesc,
169
+ const void *y,
170
+ const void *workSpace,
171
+ size_t workSpaceSizeInBytes,
172
+ const cudnnFilterDescriptor_t dwDesc,
173
+ void *dw,
174
+ const void *reserveSpace,
175
+ size_t reserveSpaceSizeInBytes);
176
+
177
+ cudnnStatus_t CUDNNWINAPI
178
+ cudnnRNNBackwardWeights_v8(cudnnHandle_t handle,
179
+ cudnnRNNDescriptor_t rnnDesc,
180
+ cudnnWgradMode_t addGrad,
181
+ const int32_t devSeqLengths[],
182
+ cudnnRNNDataDescriptor_t xDesc,
183
+ const void *x,
184
+ cudnnTensorDescriptor_t hDesc,
185
+ const void *hx,
186
+ cudnnRNNDataDescriptor_t yDesc,
187
+ const void *y,
188
+ size_t weightSpaceSize,
189
+ void *dweightSpace,
190
+ size_t workSpaceSize,
191
+ void *workSpace,
192
+ size_t reserveSpaceSize,
193
+ void *reserveSpace);
194
+
195
+ /* RNN EX API */
196
+
197
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
198
+ cudnnRNNForwardTrainingEx(cudnnHandle_t handle,
199
+ const cudnnRNNDescriptor_t rnnDesc,
200
+ const cudnnRNNDataDescriptor_t xDesc,
201
+ const void *x,
202
+ const cudnnTensorDescriptor_t hxDesc,
203
+ const void *hx,
204
+ const cudnnTensorDescriptor_t cxDesc,
205
+ const void *cx,
206
+ const cudnnFilterDescriptor_t wDesc,
207
+ const void *w,
208
+ const cudnnRNNDataDescriptor_t yDesc,
209
+ void *y,
210
+ const cudnnTensorDescriptor_t hyDesc,
211
+ void *hy,
212
+ const cudnnTensorDescriptor_t cyDesc,
213
+ void *cy,
214
+ const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
215
+ const void *keys, /* reserved, should pass NULL */
216
+ const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
217
+ void *cAttn, /* reserved, should pass NULL */
218
+ const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
219
+ void *iAttn, /* reserved, should pass NULL */
220
+ const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
221
+ void *queries, /* reserved, should pass NULL */
222
+ void *workSpace,
223
+ size_t workSpaceSizeInBytes,
224
+ void *reserveSpace,
225
+ size_t reserveSpaceSizeInBytes);
226
+
227
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
228
+ cudnnRNNBackwardDataEx(cudnnHandle_t handle,
229
+ const cudnnRNNDescriptor_t rnnDesc,
230
+ const cudnnRNNDataDescriptor_t yDesc,
231
+ const void *y,
232
+ const cudnnRNNDataDescriptor_t dyDesc,
233
+ const void *dy,
234
+ const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */
235
+ const void *dcAttn, /* reserved, should pass NULL */
236
+ const cudnnTensorDescriptor_t dhyDesc,
237
+ const void *dhy,
238
+ const cudnnTensorDescriptor_t dcyDesc,
239
+ const void *dcy,
240
+ const cudnnFilterDescriptor_t wDesc,
241
+ const void *w,
242
+ const cudnnTensorDescriptor_t hxDesc,
243
+ const void *hx,
244
+ const cudnnTensorDescriptor_t cxDesc,
245
+ const void *cx,
246
+ const cudnnRNNDataDescriptor_t dxDesc,
247
+ void *dx,
248
+ const cudnnTensorDescriptor_t dhxDesc,
249
+ void *dhx,
250
+ const cudnnTensorDescriptor_t dcxDesc,
251
+ void *dcx,
252
+ const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */
253
+ void *dkeys, /* reserved, should pass NULL */
254
+ void *workSpace,
255
+ size_t workSpaceSizeInBytes,
256
+ void *reserveSpace,
257
+ size_t reserveSpaceSizeInBytes);
258
+
259
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
260
+ cudnnRNNBackwardWeightsEx(cudnnHandle_t handle,
261
+ const cudnnRNNDescriptor_t rnnDesc,
262
+ const cudnnRNNDataDescriptor_t xDesc,
263
+ const void *x,
264
+ const cudnnTensorDescriptor_t hxDesc,
265
+ const void *hx,
266
+ const cudnnRNNDataDescriptor_t yDesc,
267
+ const void *y,
268
+ void *workSpace,
269
+ size_t workSpaceSizeInBytes,
270
+ const cudnnFilterDescriptor_t dwDesc,
271
+ void *dw,
272
+ void *reserveSpace,
273
+ size_t reserveSpaceSizeInBytes);
274
+
275
+ /* RNN FIND API */
276
+
277
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
278
+ cudnnGetRNNForwardTrainingAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
279
+
280
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
281
+ cudnnFindRNNForwardTrainingAlgorithmEx(cudnnHandle_t handle,
282
+ const cudnnRNNDescriptor_t rnnDesc,
283
+ const int seqLength,
284
+ const cudnnTensorDescriptor_t *xDesc,
285
+ const void *x,
286
+ const cudnnTensorDescriptor_t hxDesc,
287
+ const void *hx,
288
+ const cudnnTensorDescriptor_t cxDesc,
289
+ const void *cx,
290
+ const cudnnFilterDescriptor_t wDesc,
291
+ const void *w,
292
+ const cudnnTensorDescriptor_t *yDesc,
293
+ void *y,
294
+ const cudnnTensorDescriptor_t hyDesc,
295
+ void *hy,
296
+ const cudnnTensorDescriptor_t cyDesc,
297
+ void *cy,
298
+ const float findIntensity,
299
+ const int requestedAlgoCount,
300
+ int *returnedAlgoCount,
301
+ cudnnAlgorithmPerformance_t *perfResults,
302
+ void *workspace,
303
+ size_t workSpaceSizeInBytes,
304
+ void *reserveSpace,
305
+ size_t reserveSpaceSizeInBytes);
306
+
307
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
308
+ cudnnGetRNNBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
309
+
310
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
311
+ cudnnFindRNNBackwardDataAlgorithmEx(cudnnHandle_t handle,
312
+ const cudnnRNNDescriptor_t rnnDesc,
313
+ const int seqLength,
314
+ const cudnnTensorDescriptor_t *yDesc,
315
+ const void *y,
316
+ const cudnnTensorDescriptor_t *dyDesc,
317
+ const void *dy,
318
+ const cudnnTensorDescriptor_t dhyDesc,
319
+ const void *dhy,
320
+ const cudnnTensorDescriptor_t dcyDesc,
321
+ const void *dcy,
322
+ const cudnnFilterDescriptor_t wDesc,
323
+ const void *w,
324
+ const cudnnTensorDescriptor_t hxDesc,
325
+ const void *hx,
326
+ const cudnnTensorDescriptor_t cxDesc,
327
+ const void *cx,
328
+ const cudnnTensorDescriptor_t *dxDesc,
329
+ void *dx,
330
+ const cudnnTensorDescriptor_t dhxDesc,
331
+ void *dhx,
332
+ const cudnnTensorDescriptor_t dcxDesc,
333
+ void *dcx,
334
+ const float findIntensity,
335
+ const int requestedAlgoCount,
336
+ int *returnedAlgoCount,
337
+ cudnnAlgorithmPerformance_t *perfResults,
338
+ void *workspace,
339
+ size_t workSpaceSizeInBytes,
340
+ void *reserveSpace,
341
+ size_t reserveSpaceSizeInBytes);
342
+
343
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
344
+ cudnnGetRNNBackwardWeightsAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
345
+
346
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
347
+ cudnnFindRNNBackwardWeightsAlgorithmEx(cudnnHandle_t handle,
348
+ const cudnnRNNDescriptor_t rnnDesc,
349
+ const int seqLength,
350
+ const cudnnTensorDescriptor_t *xDesc,
351
+ const void *x,
352
+ const cudnnTensorDescriptor_t hxDesc,
353
+ const void *hx,
354
+ const cudnnTensorDescriptor_t *yDesc,
355
+ const void *y,
356
+ const float findIntensity,
357
+ const int requestedAlgoCount,
358
+ int *returnedAlgoCount,
359
+ cudnnAlgorithmPerformance_t *perfResults,
360
+ const void *workspace,
361
+ size_t workSpaceSizeInBytes,
362
+ const cudnnFilterDescriptor_t dwDesc,
363
+ void *dw,
364
+ const void *reserveSpace,
365
+ size_t reserveSpaceSizeInBytes);
366
+
367
+ cudnnStatus_t CUDNNWINAPI
368
+ cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle,
369
+ const cudnnAttnDescriptor_t attnDesc,
370
+ const int loWinIdx[],
371
+ const int hiWinIdx[],
372
+ const int devSeqLengthsDQDO[],
373
+ const int devSeqLengthsDKDV[],
374
+ const cudnnSeqDataDescriptor_t doDesc,
375
+ const void *dout,
376
+ const cudnnSeqDataDescriptor_t dqDesc,
377
+ void *dqueries,
378
+ const void *queries,
379
+ const cudnnSeqDataDescriptor_t dkDesc,
380
+ void *dkeys,
381
+ const void *keys,
382
+ const cudnnSeqDataDescriptor_t dvDesc,
383
+ void *dvalues,
384
+ const void *values,
385
+ size_t weightSizeInBytes,
386
+ const void *weights,
387
+ size_t workSpaceSizeInBytes,
388
+ void *workSpace,
389
+ size_t reserveSpaceSizeInBytes,
390
+ void *reserveSpace);
391
+
392
+ cudnnStatus_t CUDNNWINAPI
393
+ cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle,
394
+ const cudnnAttnDescriptor_t attnDesc,
395
+ cudnnWgradMode_t addGrad,
396
+ const cudnnSeqDataDescriptor_t qDesc,
397
+ const void *queries,
398
+ const cudnnSeqDataDescriptor_t kDesc,
399
+ const void *keys,
400
+ const cudnnSeqDataDescriptor_t vDesc,
401
+ const void *values,
402
+ const cudnnSeqDataDescriptor_t doDesc,
403
+ const void *dout,
404
+ size_t weightSizeInBytes,
405
+ const void *weights,
406
+ void *dweights,
407
+ size_t workSpaceSizeInBytes,
408
+ void *workSpace,
409
+ size_t reserveSpaceSizeInBytes,
410
+ void *reserveSpace);
411
+
412
+ /*
413
+ * CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions
414
+ */
415
+ /* Input normalization mode for loss function */
416
+ typedef enum {
417
+ CUDNN_LOSS_NORMALIZATION_NONE = 0,
418
+ CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1,
419
+ } cudnnLossNormalizationMode_t;
420
+
421
+ cudnnStatus_t CUDNNWINAPI
422
+ cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc);
423
+
424
+ cudnnStatus_t CUDNNWINAPI
425
+ cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType);
426
+
427
+ cudnnStatus_t CUDNNWINAPI
428
+ cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
429
+ cudnnDataType_t compType,
430
+ cudnnLossNormalizationMode_t normMode,
431
+ cudnnNanPropagation_t gradMode);
432
+
433
+ cudnnStatus_t CUDNNWINAPI
434
+ cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
435
+ cudnnDataType_t compType,
436
+ cudnnLossNormalizationMode_t normMode,
437
+ cudnnNanPropagation_t gradMode,
438
+ int maxLabelLength);
439
+
440
+ cudnnStatus_t CUDNNWINAPI
441
+ cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType);
442
+
443
+ cudnnStatus_t CUDNNWINAPI
444
+ cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
445
+ cudnnDataType_t *compType,
446
+ cudnnLossNormalizationMode_t *normMode,
447
+ cudnnNanPropagation_t *gradMode);
448
+
449
+ cudnnStatus_t CUDNNWINAPI
450
+ cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
451
+ cudnnDataType_t *compType,
452
+ cudnnLossNormalizationMode_t *normMode,
453
+ cudnnNanPropagation_t *gradMode,
454
+ int *maxLabelLength);
455
+
456
+ cudnnStatus_t CUDNNWINAPI
457
+ cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc);
458
+
459
+ /* return the ctc costs and gradients, given the probabilities and labels */
460
+ cudnnStatus_t CUDNNWINAPI
461
+ cudnnCTCLoss(
462
+ cudnnHandle_t handle,
463
+ const cudnnTensorDescriptor_t
464
+ probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
465
+ mini batch size, A is the alphabet size) */
466
+ const void *probs, /* probabilities after softmax, in GPU memory */
467
+ const int hostLabels[], /* labels, in CPU memory */
468
+ const int hostLabelLengths[], /* the length of each label, in CPU memory */
469
+ const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */
470
+ void *costs, /* the returned costs of CTC, in GPU memory */
471
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
472
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
473
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
474
+ cudnnCTCLossDescriptor_t ctcLossDesc,
475
+ void *workspace, /* pointer to the workspace, in GPU memory */
476
+ size_t workSpaceSizeInBytes); /* size of the workspace */
477
+
478
+ /* return the ctc costs and gradients, given the probabilities and labels */
479
+ cudnnStatus_t CUDNNWINAPI
480
+ cudnnCTCLoss_v8(
481
+ cudnnHandle_t handle,
482
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
483
+ cudnnCTCLossDescriptor_t ctcLossDesc,
484
+ const cudnnTensorDescriptor_t
485
+ probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
486
+ mini batch size, A is the alphabet size) */
487
+ const void *probs, /* probabilities after softmax, in GPU memory */
488
+ const int labels[], /* labels, in GPU memory */
489
+ const int labelLengths[], /* the length of each label, in GPU memory */
490
+ const int inputLengths[], /* the lengths of timing steps in each batch, in GPU memory */
491
+ void *costs, /* the returned costs of CTC, in GPU memory */
492
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
493
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
494
+ size_t workSpaceSizeInBytes, /* size of the workspace */
495
+ void *workspace); /* pointer to the workspace, in GPU memory */
496
+
497
+ /* return the workspace size needed for ctc */
498
+ cudnnStatus_t CUDNNWINAPI
499
+ cudnnGetCTCLossWorkspaceSize(
500
+ cudnnHandle_t handle,
501
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
502
+ timing steps, N is the mini batch size, A is the alphabet size) */
503
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
504
+ dimensions are T,N,A. To compute costs
505
+ only, set it to NULL */
506
+ const int *labels, /* labels, in CPU memory */
507
+ const int *labelLengths, /* the length of each label, in CPU memory */
508
+ const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */
509
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
510
+ cudnnCTCLossDescriptor_t ctcLossDesc,
511
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
512
+
513
+ /* return the workspace size needed for ctc */
514
+ cudnnStatus_t CUDNNWINAPI
515
+ cudnnGetCTCLossWorkspaceSize_v8(
516
+ cudnnHandle_t handle,
517
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
518
+ cudnnCTCLossDescriptor_t ctcLossDesc,
519
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
520
+ timing steps, N is the mini batch size, A is the alphabet size) */
521
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
522
+ dimensions are T,N,A. To compute costs
523
+ only, set it to NULL */
524
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
525
+
526
+ /*
527
+ * \brief Cross-library version checker.
528
+ * This function is implemented differently in each sub-library. Each sublib
529
+ * checks whether its own version matches that of its dependencies.
530
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
531
+ * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
532
+ */
533
+ cudnnStatus_t CUDNNWINAPI
534
+ cudnnAdvTrainVersionCheck(void);
535
+
536
+ #if defined(__cplusplus)
537
+ }
538
+ #endif
539
+
540
+ #endif /* CUDNN_ADV_TRAIN_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend_v8.h ADDED
@@ -0,0 +1,600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDNN_BACKEND_H_
51
+ #define _CUDNN_BACKEND_H_
52
+
53
+ /*
54
+ * The content in this header file is under development to be included in cudnn.h in the future
55
+ * Production code should have all include of this header file remove.
56
+ */
57
+
58
+ #include "cudnn_ops_infer.h"
59
+ #include "cudnn_cnn_infer.h"
60
+
61
+ /* NOTE: definition in extern "C" to be copied later to public header */
62
+ #if defined(__cplusplus)
63
+ extern "C" {
64
+ #endif
65
+
66
+ typedef void *cudnnBackendDescriptor_t;
67
+
68
+ typedef struct cudnnFractionStruct {
69
+ int64_t numerator;
70
+ int64_t denominator;
71
+ } cudnnFraction_t;
72
+
73
+ typedef enum {
74
+ CUDNN_POINTWISE_ADD = 0,
75
+ CUDNN_POINTWISE_ADD_SQUARE = 5,
76
+ CUDNN_POINTWISE_DIV = 6,
77
+ CUDNN_POINTWISE_MAX = 3,
78
+ CUDNN_POINTWISE_MIN = 2,
79
+ CUDNN_POINTWISE_MOD = 7,
80
+ CUDNN_POINTWISE_MUL = 1,
81
+ CUDNN_POINTWISE_POW = 8,
82
+ CUDNN_POINTWISE_SUB = 9,
83
+
84
+ CUDNN_POINTWISE_ABS = 10,
85
+ CUDNN_POINTWISE_CEIL = 11,
86
+ CUDNN_POINTWISE_COS = 12,
87
+ CUDNN_POINTWISE_EXP = 13,
88
+ CUDNN_POINTWISE_FLOOR = 14,
89
+ CUDNN_POINTWISE_LOG = 15,
90
+ CUDNN_POINTWISE_NEG = 16,
91
+ CUDNN_POINTWISE_RSQRT = 17,
92
+ CUDNN_POINTWISE_SIN = 18,
93
+ CUDNN_POINTWISE_SQRT = 4,
94
+ CUDNN_POINTWISE_TAN = 19,
95
+ CUDNN_POINTWISE_ERF = 20,
96
+ CUDNN_POINTWISE_IDENTITY = 21,
97
+
98
+ CUDNN_POINTWISE_RELU_FWD = 100,
99
+ CUDNN_POINTWISE_TANH_FWD = 101,
100
+ CUDNN_POINTWISE_SIGMOID_FWD = 102,
101
+ CUDNN_POINTWISE_ELU_FWD = 103,
102
+ CUDNN_POINTWISE_GELU_FWD = 104,
103
+ CUDNN_POINTWISE_SOFTPLUS_FWD = 105,
104
+ CUDNN_POINTWISE_SWISH_FWD = 106,
105
+ CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107,
106
+
107
+ CUDNN_POINTWISE_RELU_BWD = 200,
108
+ CUDNN_POINTWISE_TANH_BWD = 201,
109
+ CUDNN_POINTWISE_SIGMOID_BWD = 202,
110
+ CUDNN_POINTWISE_ELU_BWD = 203,
111
+ CUDNN_POINTWISE_GELU_BWD = 204,
112
+ CUDNN_POINTWISE_SOFTPLUS_BWD = 205,
113
+ CUDNN_POINTWISE_SWISH_BWD = 206,
114
+ CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207,
115
+
116
+ CUDNN_POINTWISE_CMP_EQ = 300,
117
+ CUDNN_POINTWISE_CMP_NEQ = 301,
118
+ CUDNN_POINTWISE_CMP_GT = 302,
119
+ CUDNN_POINTWISE_CMP_GE = 303,
120
+ CUDNN_POINTWISE_CMP_LT = 304,
121
+ CUDNN_POINTWISE_CMP_LE = 305,
122
+
123
+ CUDNN_POINTWISE_LOGICAL_AND = 400,
124
+ CUDNN_POINTWISE_LOGICAL_OR = 401,
125
+ CUDNN_POINTWISE_LOGICAL_NOT = 402,
126
+
127
+ CUDNN_POINTWISE_GEN_INDEX = 501,
128
+
129
+ CUDNN_POINTWISE_BINARY_SELECT = 601,
130
+ } cudnnPointwiseMode_t;
131
+
132
+ typedef enum {
133
+ CUDNN_RESAMPLE_NEAREST = 0,
134
+ CUDNN_RESAMPLE_BILINEAR = 1,
135
+ CUDNN_RESAMPLE_AVGPOOL = 2,
136
+ CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2,
137
+ CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4,
138
+ CUDNN_RESAMPLE_MAXPOOL = 3,
139
+ } cudnnResampleMode_t;
140
+
141
+ typedef enum {
142
+ CUDNN_SIGNAL_SET = 0,
143
+ CUDNN_SIGNAL_WAIT = 1,
144
+ } cudnnSignalMode_t;
145
+
146
+ typedef enum {
147
+ CUDNN_GENSTATS_SUM_SQSUM = 0,
148
+ } cudnnGenStatsMode_t;
149
+
150
+ typedef enum {
151
+ CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0,
152
+ CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1,
153
+ } cudnnBnFinalizeStatsMode_t;
154
+
155
+ typedef enum {
156
+ CUDNN_RNG_DISTRIBUTION_BERNOULLI,
157
+ CUDNN_RNG_DISTRIBUTION_UNIFORM,
158
+ CUDNN_RNG_DISTRIBUTION_NORMAL,
159
+ } cudnnRngDistribution_t;
160
+
161
+ typedef enum {
162
+ CUDNN_ATTR_POINTWISE_MODE = 0,
163
+ CUDNN_ATTR_POINTWISE_MATH_PREC = 1,
164
+ CUDNN_ATTR_POINTWISE_NAN_PROPAGATION = 2,
165
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3,
166
+ CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4,
167
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5,
168
+ CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6,
169
+ CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7,
170
+ CUDNN_ATTR_POINTWISE_SWISH_BETA = 8,
171
+ CUDNN_ATTR_POINTWISE_AXIS = 9,
172
+
173
+ CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100,
174
+ CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101,
175
+ CUDNN_ATTR_CONVOLUTION_DILATIONS = 102,
176
+ CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103,
177
+ CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104,
178
+ CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105,
179
+ CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106,
180
+
181
+ CUDNN_ATTR_ENGINEHEUR_MODE = 200,
182
+ CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201,
183
+ CUDNN_ATTR_ENGINEHEUR_RESULTS = 202,
184
+
185
+ CUDNN_ATTR_ENGINECFG_ENGINE = 300,
186
+ CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301,
187
+ CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302,
188
+
189
+ CUDNN_ATTR_EXECUTION_PLAN_HANDLE = 400,
190
+ CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401,
191
+ CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402,
192
+ CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403,
193
+ CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404,
194
+ CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405,
195
+
196
+ CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500,
197
+ CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501,
198
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502,
199
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503,
200
+
201
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600,
202
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601,
203
+
204
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700,
205
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701,
206
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702,
207
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703,
208
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704,
209
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705,
210
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706,
211
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707,
212
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708,
213
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709,
214
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710,
215
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711,
216
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712,
217
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713,
218
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714,
219
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715,
220
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716,
221
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717,
222
+
223
+ CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750,
224
+ CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751,
225
+ CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752,
226
+ CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753,
227
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754,
228
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755,
229
+ CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756,
230
+ CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757,
231
+ CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758,
232
+
233
+ CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770,
234
+ CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771,
235
+ CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772,
236
+ CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773,
237
+ CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774,
238
+
239
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780,
240
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781,
241
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782,
242
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783,
243
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784,
244
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785,
245
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786,
246
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787,
247
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788,
248
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789,
249
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790,
250
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791,
251
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792,
252
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793,
253
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794,
254
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795,
255
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796,
256
+
257
+ CUDNN_ATTR_OPERATIONGRAPH_HANDLE = 800,
258
+ CUDNN_ATTR_OPERATIONGRAPH_OPS = 801,
259
+ CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802,
260
+
261
+ CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900,
262
+ CUDNN_ATTR_TENSOR_DATA_TYPE = 901,
263
+ CUDNN_ATTR_TENSOR_DIMENSIONS = 902,
264
+ CUDNN_ATTR_TENSOR_STRIDES = 903,
265
+ CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904,
266
+ CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905,
267
+ CUDNN_ATTR_TENSOR_UNIQUE_ID = 906,
268
+ CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907,
269
+ CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908,
270
+ CUDNN_ATTR_TENSOR_REORDERING_MODE = 909,
271
+
272
+ CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000,
273
+ CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001,
274
+ CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002,
275
+ CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003,
276
+
277
+ CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100,
278
+ CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101,
279
+
280
+ CUDNN_ATTR_KNOB_INFO_TYPE = 1200,
281
+ CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201,
282
+ CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202,
283
+ CUDNN_ATTR_KNOB_INFO_STRIDE = 1203,
284
+
285
+ CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300,
286
+ CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301,
287
+ CUDNN_ATTR_ENGINE_KNOB_INFO = 1302,
288
+ CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303,
289
+ CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304,
290
+ CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305,
291
+
292
+ CUDNN_ATTR_MATMUL_COMP_TYPE = 1500,
293
+
294
+ CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520,
295
+ CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521,
296
+ CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522,
297
+ CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523,
298
+ CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT = 1524,
299
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525,
300
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526,
301
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527,
302
+
303
+ CUDNN_ATTR_REDUCTION_OPERATOR = 1600,
304
+ CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601,
305
+
306
+ CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610,
307
+ CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611,
308
+ CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612,
309
+
310
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620,
311
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621,
312
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622,
313
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623,
314
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624,
315
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625,
316
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626,
317
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627,
318
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628,
319
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629,
320
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630,
321
+
322
+ CUDNN_ATTR_RESAMPLE_MODE = 1700,
323
+ CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701,
324
+ CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702,
325
+ CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703,
326
+ CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704,
327
+ CUDNN_ATTR_RESAMPLE_STRIDES = 1705,
328
+ CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706,
329
+ CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707,
330
+ CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708,
331
+
332
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710,
333
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711,
334
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712,
335
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA = 1713,
336
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA = 1714,
337
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716,
338
+
339
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720,
340
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721,
341
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722,
342
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA = 1723,
343
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA = 1724,
344
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725,
345
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726,
346
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727,
347
+
348
+ CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800,
349
+ CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801,
350
+ CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802,
351
+ CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803,
352
+
353
+ CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900,
354
+ CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901,
355
+ CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902,
356
+ CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903,
357
+ CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904,
358
+
359
+ CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000,
360
+ CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001,
361
+ CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002,
362
+ CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003,
363
+ CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004,
364
+ CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005,
365
+ CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006,
366
+ CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007,
367
+ CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008,
368
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009,
369
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010,
370
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011,
371
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012,
372
+ CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013,
373
+ CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014,
374
+
375
+ CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100,
376
+ CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101,
377
+ CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102,
378
+ CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103,
379
+ CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104,
380
+ CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105,
381
+ CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106,
382
+ CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107,
383
+ CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108,
384
+ CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109,
385
+ CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110,
386
+
387
+ CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200,
388
+ CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201,
389
+
390
+ CUDNN_ATTR_RNG_DISTRIBUTION = 2300,
391
+ CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301,
392
+ CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302,
393
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303,
394
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304,
395
+ CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305,
396
+
397
+ CUDNN_ATTR_OPERATION_RNG_YDESC = 2310,
398
+ CUDNN_ATTR_OPERATION_RNG_SEED = 2311,
399
+ CUDNN_ATTR_OPERATION_RNG_DESC = 2312,
400
+
401
+ } cudnnBackendAttributeName_t;
402
+
403
+ typedef enum {
404
+ CUDNN_TYPE_HANDLE = 0,
405
+ CUDNN_TYPE_DATA_TYPE,
406
+ CUDNN_TYPE_BOOLEAN,
407
+ CUDNN_TYPE_INT64,
408
+ CUDNN_TYPE_FLOAT,
409
+ CUDNN_TYPE_DOUBLE,
410
+ CUDNN_TYPE_VOID_PTR,
411
+ CUDNN_TYPE_CONVOLUTION_MODE,
412
+ CUDNN_TYPE_HEUR_MODE,
413
+ CUDNN_TYPE_KNOB_TYPE,
414
+ CUDNN_TYPE_NAN_PROPOGATION,
415
+ CUDNN_TYPE_NUMERICAL_NOTE,
416
+ CUDNN_TYPE_LAYOUT_TYPE,
417
+ CUDNN_TYPE_ATTRIB_NAME,
418
+ CUDNN_TYPE_POINTWISE_MODE,
419
+ CUDNN_TYPE_BACKEND_DESCRIPTOR,
420
+ CUDNN_TYPE_GENSTATS_MODE,
421
+ CUDNN_TYPE_BN_FINALIZE_STATS_MODE,
422
+ CUDNN_TYPE_REDUCTION_OPERATOR_TYPE,
423
+ CUDNN_TYPE_BEHAVIOR_NOTE,
424
+ CUDNN_TYPE_TENSOR_REORDERING_MODE,
425
+ CUDNN_TYPE_RESAMPLE_MODE,
426
+ CUDNN_TYPE_PADDING_MODE,
427
+ CUDNN_TYPE_INT32,
428
+ CUDNN_TYPE_CHAR,
429
+ CUDNN_TYPE_SIGNAL_MODE,
430
+ CUDNN_TYPE_FRACTION,
431
+ CUDNN_TYPE_NORM_MODE,
432
+ CUDNN_TYPE_NORM_FWD_PHASE,
433
+ CUDNN_TYPE_RNG_DISTRIBUTION
434
+ } cudnnBackendAttributeType_t;
435
+
436
+ typedef enum {
437
+ CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0,
438
+ CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR,
439
+ CUDNN_BACKEND_ENGINE_DESCRIPTOR,
440
+ CUDNN_BACKEND_ENGINECFG_DESCRIPTOR,
441
+ CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR,
442
+ CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR,
443
+ CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR,
444
+ CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR,
445
+ CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR,
446
+ CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR,
447
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR,
448
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR,
449
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR,
450
+ CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR,
451
+ CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR,
452
+ CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR,
453
+ CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR,
454
+ CUDNN_BACKEND_TENSOR_DESCRIPTOR,
455
+ CUDNN_BACKEND_MATMUL_DESCRIPTOR,
456
+ CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR,
457
+ CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR,
458
+ CUDNN_BACKEND_REDUCTION_DESCRIPTOR,
459
+ CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR,
460
+ CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR,
461
+ CUDNN_BACKEND_RESAMPLE_DESCRIPTOR,
462
+ CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR,
463
+ CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR,
464
+ CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR,
465
+ CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR,
466
+ CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR,
467
+ CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR,
468
+ CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR,
469
+ CUDNN_BACKEND_RNG_DESCRIPTOR,
470
+ CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR
471
+ } cudnnBackendDescriptorType_t;
472
+
473
+ typedef enum {
474
+ CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0,
475
+ CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS,
476
+ CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION,
477
+ CUDNN_NUMERICAL_NOTE_FFT,
478
+ CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC,
479
+ CUDNN_NUMERICAL_NOTE_WINOGRAD,
480
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4,
481
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6,
482
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13,
483
+ CUDNN_NUMERICAL_NOTE_TYPE_COUNT,
484
+ } cudnnBackendNumericalNote_t;
485
+
486
+ typedef enum {
487
+ CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0,
488
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1,
489
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2,
490
+ CUDNN_BEHAVIOR_NOTE_TYPE_COUNT,
491
+ } cudnnBackendBehaviorNote_t;
492
+
493
+ typedef enum {
494
+ CUDNN_KNOB_TYPE_SPLIT_K = 0,
495
+ CUDNN_KNOB_TYPE_SWIZZLE = 1,
496
+ CUDNN_KNOB_TYPE_TILE_SIZE = 2,
497
+ CUDNN_KNOB_TYPE_USE_TEX = 3,
498
+ CUDNN_KNOB_TYPE_EDGE = 4,
499
+ CUDNN_KNOB_TYPE_KBLOCK = 5,
500
+ CUDNN_KNOB_TYPE_LDGA = 6,
501
+ CUDNN_KNOB_TYPE_LDGB = 7,
502
+ CUDNN_KNOB_TYPE_CHUNK_K = 8,
503
+ CUDNN_KNOB_TYPE_SPLIT_H = 9,
504
+ CUDNN_KNOB_TYPE_WINO_TILE = 10,
505
+ CUDNN_KNOB_TYPE_MULTIPLY = 11,
506
+ CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12,
507
+ CUDNN_KNOB_TYPE_TILEK = 13,
508
+ CUDNN_KNOB_TYPE_STAGES = 14,
509
+ CUDNN_KNOB_TYPE_REDUCTION_MODE = 15,
510
+ CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE = 16,
511
+ CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17,
512
+ CUDNN_KNOB_TYPE_IDX_MODE = 18,
513
+ CUDNN_KNOB_TYPE_SLICED = 19,
514
+ CUDNN_KNOB_TYPE_SPLIT_RS = 20,
515
+ CUDNN_KNOB_TYPE_SINGLEBUFFER = 21,
516
+ CUDNN_KNOB_TYPE_LDGC = 22,
517
+ CUDNN_KNOB_TYPE_SPECFILT = 23,
518
+ CUDNN_KNOB_TYPE_KERNEL_CFG = 24,
519
+ CUDNN_KNOB_TYPE_WORKSPACE = 25,
520
+ CUDNN_KNOB_TYPE_TILE_CGA = 26,
521
+ CUDNN_KNOB_TYPE_TILE_CGA_M = 27,
522
+ CUDNN_KNOB_TYPE_TILE_CGA_N = 28,
523
+
524
+ CUDNN_KNOB_TYPE_COUNTS,
525
+ } cudnnBackendKnobType_t;
526
+
527
+ typedef enum {
528
+ CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0,
529
+ CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1,
530
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2,
531
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3,
532
+ CUDNN_LAYOUT_TYPE_COUNT = 4,
533
+ } cudnnBackendLayoutType_t;
534
+
535
+ typedef enum {
536
+ CUDNN_HEUR_MODE_INSTANT = 0,
537
+ CUDNN_HEUR_MODE_B = 1,
538
+ CUDNN_HEUR_MODE_FALLBACK = 2,
539
+ CUDNN_HEUR_MODE_A = 3,
540
+ CUDNN_HEUR_MODES_COUNT = 4,
541
+ } cudnnBackendHeurMode_t;
542
+
543
+ typedef enum {
544
+ CUDNN_TENSOR_REORDERING_NONE = 0,
545
+ CUDNN_TENSOR_REORDERING_INT8x32 = 1,
546
+ } cudnnBackendTensorReordering_t;
547
+
548
+ typedef enum {
549
+ CUDNN_ZERO_PAD = 0,
550
+ CUDNN_NEG_INF_PAD = 1,
551
+ CUDNN_EDGE_VAL_PAD = 2,
552
+ } cudnnPaddingMode_t;
553
+
554
+ typedef enum {
555
+ CUDNN_LAYER_NORM = 0,
556
+ CUDNN_INSTANCE_NORM = 1,
557
+ CUDNN_BATCH_NORM = 2,
558
+ CUDNN_GROUP_NORM = 3,
559
+ } cudnnBackendNormMode_t;
560
+
561
+ typedef enum {
562
+ CUDNN_NORM_FWD_INFERENCE = 0,
563
+ CUDNN_NORM_FWD_TRAINING = 1,
564
+ } cudnnBackendNormFwdPhase_t;
565
+
566
+ cudnnStatus_t CUDNNWINAPI
567
+ cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor);
568
+
569
+ cudnnStatus_t CUDNNWINAPI
570
+ cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor);
571
+
572
+ cudnnStatus_t CUDNNWINAPI
573
+ cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor);
574
+
575
+ cudnnStatus_t CUDNNWINAPI
576
+ cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor);
577
+
578
+ cudnnStatus_t CUDNNWINAPI
579
+ cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor,
580
+ cudnnBackendAttributeName_t attributeName,
581
+ cudnnBackendAttributeType_t attributeType,
582
+ int64_t elementCount,
583
+ const void *arrayOfElements);
584
+
585
+ cudnnStatus_t CUDNNWINAPI
586
+ cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor,
587
+ cudnnBackendAttributeName_t attributeName,
588
+ cudnnBackendAttributeType_t attributeType,
589
+ int64_t requestedElementCount,
590
+ int64_t *elementCount,
591
+ void *arrayOfElements);
592
+
593
+ cudnnStatus_t CUDNNWINAPI
594
+ cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack);
595
+
596
+ #if defined(__cplusplus)
597
+ }
598
+ #endif
599
+
600
+ #endif /* _CUDNN_BACKEND_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_train.h ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_cnn_train : cuDNN's basic definitions and inference CNN functions.
52
+ */
53
+
54
+ #pragma once
55
+ #include <cuda_runtime.h>
56
+ #include <stdint.h>
57
+
58
+ #include "cudnn_version.h"
59
+ #include "cudnn_ops_infer.h"
60
+ #include "cudnn_ops_train.h"
61
+ #include "cudnn_cnn_infer.h"
62
+
63
+ /* These version numbers are autogenerated, do not edit manually. */
64
+ #define CUDNN_CNN_TRAIN_MAJOR 8
65
+ #define CUDNN_CNN_TRAIN_MINOR 7
66
+ #define CUDNN_CNN_TRAIN_PATCH 0
67
+
68
+ #if (CUDNN_CNN_TRAIN_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_TRAIN_MINOR != CUDNN_MINOR) || \
69
+ (CUDNN_CNN_TRAIN_PATCH != CUDNN_PATCHLEVEL)
70
+ #error Version mismatch in cuDNN CNN INFER!!!
71
+ #endif
72
+
73
+ #if defined(__cplusplus)
74
+ extern "C" {
75
+ #endif
76
+
77
+ /* helper function to provide the convolution backward filter algo that fit best the requirement */
78
+
79
+ typedef struct cudnnConvolutionBwdFilterAlgoPerfStruct {
80
+ cudnnConvolutionBwdFilterAlgo_t algo;
81
+ cudnnStatus_t status;
82
+ float time;
83
+ size_t memory;
84
+ cudnnDeterminism_t determinism;
85
+ cudnnMathType_t mathType;
86
+ int reserved[3];
87
+ } cudnnConvolutionBwdFilterAlgoPerf_t;
88
+
89
+ cudnnStatus_t CUDNNWINAPI
90
+ cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count);
91
+
92
+ cudnnStatus_t CUDNNWINAPI
93
+ cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle,
94
+ const cudnnTensorDescriptor_t xDesc,
95
+ const cudnnTensorDescriptor_t dyDesc,
96
+ const cudnnConvolutionDescriptor_t convDesc,
97
+ const cudnnFilterDescriptor_t dwDesc,
98
+ const int requestedAlgoCount,
99
+ int *returnedAlgoCount,
100
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
101
+
102
+ cudnnStatus_t CUDNNWINAPI
103
+ cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle,
104
+ const cudnnTensorDescriptor_t xDesc,
105
+ const void *x,
106
+ const cudnnTensorDescriptor_t dyDesc,
107
+ const void *y,
108
+ const cudnnConvolutionDescriptor_t convDesc,
109
+ const cudnnFilterDescriptor_t dwDesc,
110
+ void *dw,
111
+ const int requestedAlgoCount,
112
+ int *returnedAlgoCount,
113
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults,
114
+ void *workSpace,
115
+ size_t workSpaceSizeInBytes);
116
+
117
+ cudnnStatus_t CUDNNWINAPI
118
+ cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle,
119
+ const cudnnTensorDescriptor_t srcDesc,
120
+ const cudnnTensorDescriptor_t diffDesc,
121
+ const cudnnConvolutionDescriptor_t convDesc,
122
+ const cudnnFilterDescriptor_t gradDesc,
123
+ const int requestedAlgoCount,
124
+ int *returnedAlgoCount,
125
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
126
+
127
+ /*
128
+ * convolution algorithm (which requires potentially some workspace)
129
+ */
130
+
131
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
132
+ cudnnStatus_t CUDNNWINAPI
133
+ cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle,
134
+ const cudnnTensorDescriptor_t xDesc,
135
+ const cudnnTensorDescriptor_t dyDesc,
136
+ const cudnnConvolutionDescriptor_t convDesc,
137
+ const cudnnFilterDescriptor_t gradDesc,
138
+ cudnnConvolutionBwdFilterAlgo_t algo,
139
+ size_t *sizeInBytes);
140
+
141
+ cudnnStatus_t CUDNNWINAPI
142
+ cudnnConvolutionBackwardFilter(cudnnHandle_t handle,
143
+ const void *alpha,
144
+ const cudnnTensorDescriptor_t xDesc,
145
+ const void *x,
146
+ const cudnnTensorDescriptor_t dyDesc,
147
+ const void *dy,
148
+ const cudnnConvolutionDescriptor_t convDesc,
149
+ cudnnConvolutionBwdFilterAlgo_t algo,
150
+ void *workSpace,
151
+ size_t workSpaceSizeInBytes,
152
+ const void *beta,
153
+ const cudnnFilterDescriptor_t dwDesc,
154
+ void *dw);
155
+
156
+ /* Function to compute the bias gradient for batch convolution */
157
+ cudnnStatus_t CUDNNWINAPI
158
+ cudnnConvolutionBackwardBias(cudnnHandle_t handle,
159
+ const void *alpha,
160
+ const cudnnTensorDescriptor_t dyDesc,
161
+ const void *dy,
162
+ const void *beta,
163
+ const cudnnTensorDescriptor_t dbDesc,
164
+ void *db);
165
+
166
+ cudnnStatus_t CUDNNWINAPI
167
+ cudnnCreateFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops);
168
+
169
+ cudnnStatus_t CUDNNWINAPI
170
+ cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack);
171
+
172
+ cudnnStatus_t CUDNNWINAPI
173
+ cudnnSetFusedOpsConstParamPackAttribute(cudnnFusedOpsConstParamPack_t constPack,
174
+ cudnnFusedOpsConstParamLabel_t paramLabel,
175
+ const void *param);
176
+
177
+ cudnnStatus_t CUDNNWINAPI
178
+ cudnnGetFusedOpsConstParamPackAttribute(const cudnnFusedOpsConstParamPack_t constPack,
179
+ cudnnFusedOpsConstParamLabel_t paramLabel,
180
+ void *param,
181
+ int *isNULL);
182
+
183
+ cudnnStatus_t CUDNNWINAPI
184
+ cudnnCreateFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops);
185
+
186
+ cudnnStatus_t CUDNNWINAPI
187
+ cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack);
188
+
189
+ cudnnStatus_t CUDNNWINAPI
190
+ cudnnSetFusedOpsVariantParamPackAttribute(cudnnFusedOpsVariantParamPack_t varPack,
191
+ cudnnFusedOpsVariantParamLabel_t paramLabel,
192
+ void *ptr);
193
+
194
+ cudnnStatus_t CUDNNWINAPI
195
+ cudnnGetFusedOpsVariantParamPackAttribute(const cudnnFusedOpsVariantParamPack_t varPack,
196
+ cudnnFusedOpsVariantParamLabel_t paramLabel,
197
+ void *ptr);
198
+
199
+ cudnnStatus_t CUDNNWINAPI
200
+ cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, cudnnFusedOps_t ops);
201
+
202
+ cudnnStatus_t CUDNNWINAPI
203
+ cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan);
204
+
205
+ cudnnStatus_t CUDNNWINAPI
206
+ cudnnMakeFusedOpsPlan(cudnnHandle_t handle,
207
+ cudnnFusedOpsPlan_t plan,
208
+ const cudnnFusedOpsConstParamPack_t constPack,
209
+ size_t *workspaceSizeInBytes);
210
+
211
+ cudnnStatus_t CUDNNWINAPI
212
+ cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, cudnnFusedOpsVariantParamPack_t varPack);
213
+
214
+ cudnnStatus_t CUDNNWINAPI
215
+ cudnnCnnTrainVersionCheck(void);
216
+
217
+ #if defined(__cplusplus)
218
+ }
219
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_train_v8.h ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_cnn_train : cuDNN's basic definitions and inference CNN functions.
52
+ */
53
+
54
+ #pragma once
55
+ #include <cuda_runtime.h>
56
+ #include <stdint.h>
57
+
58
+ #include "cudnn_version.h"
59
+ #include "cudnn_ops_infer.h"
60
+ #include "cudnn_ops_train.h"
61
+ #include "cudnn_cnn_infer.h"
62
+
63
+ /* These version numbers are autogenerated, do not edit manually. */
64
+ #define CUDNN_CNN_TRAIN_MAJOR 8
65
+ #define CUDNN_CNN_TRAIN_MINOR 7
66
+ #define CUDNN_CNN_TRAIN_PATCH 0
67
+
68
+ #if (CUDNN_CNN_TRAIN_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_TRAIN_MINOR != CUDNN_MINOR) || \
69
+ (CUDNN_CNN_TRAIN_PATCH != CUDNN_PATCHLEVEL)
70
+ #error Version mismatch in cuDNN CNN INFER!!!
71
+ #endif
72
+
73
+ #if defined(__cplusplus)
74
+ extern "C" {
75
+ #endif
76
+
77
+ /* helper function to provide the convolution backward filter algo that fit best the requirement */
78
+
79
+ typedef struct cudnnConvolutionBwdFilterAlgoPerfStruct {
80
+ cudnnConvolutionBwdFilterAlgo_t algo;
81
+ cudnnStatus_t status;
82
+ float time;
83
+ size_t memory;
84
+ cudnnDeterminism_t determinism;
85
+ cudnnMathType_t mathType;
86
+ int reserved[3];
87
+ } cudnnConvolutionBwdFilterAlgoPerf_t;
88
+
89
+ cudnnStatus_t CUDNNWINAPI
90
+ cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count);
91
+
92
+ cudnnStatus_t CUDNNWINAPI
93
+ cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle,
94
+ const cudnnTensorDescriptor_t xDesc,
95
+ const cudnnTensorDescriptor_t dyDesc,
96
+ const cudnnConvolutionDescriptor_t convDesc,
97
+ const cudnnFilterDescriptor_t dwDesc,
98
+ const int requestedAlgoCount,
99
+ int *returnedAlgoCount,
100
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
101
+
102
+ cudnnStatus_t CUDNNWINAPI
103
+ cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle,
104
+ const cudnnTensorDescriptor_t xDesc,
105
+ const void *x,
106
+ const cudnnTensorDescriptor_t dyDesc,
107
+ const void *y,
108
+ const cudnnConvolutionDescriptor_t convDesc,
109
+ const cudnnFilterDescriptor_t dwDesc,
110
+ void *dw,
111
+ const int requestedAlgoCount,
112
+ int *returnedAlgoCount,
113
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults,
114
+ void *workSpace,
115
+ size_t workSpaceSizeInBytes);
116
+
117
+ cudnnStatus_t CUDNNWINAPI
118
+ cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle,
119
+ const cudnnTensorDescriptor_t srcDesc,
120
+ const cudnnTensorDescriptor_t diffDesc,
121
+ const cudnnConvolutionDescriptor_t convDesc,
122
+ const cudnnFilterDescriptor_t gradDesc,
123
+ const int requestedAlgoCount,
124
+ int *returnedAlgoCount,
125
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
126
+
127
+ /*
128
+ * convolution algorithm (which requires potentially some workspace)
129
+ */
130
+
131
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
132
+ cudnnStatus_t CUDNNWINAPI
133
+ cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle,
134
+ const cudnnTensorDescriptor_t xDesc,
135
+ const cudnnTensorDescriptor_t dyDesc,
136
+ const cudnnConvolutionDescriptor_t convDesc,
137
+ const cudnnFilterDescriptor_t gradDesc,
138
+ cudnnConvolutionBwdFilterAlgo_t algo,
139
+ size_t *sizeInBytes);
140
+
141
+ cudnnStatus_t CUDNNWINAPI
142
+ cudnnConvolutionBackwardFilter(cudnnHandle_t handle,
143
+ const void *alpha,
144
+ const cudnnTensorDescriptor_t xDesc,
145
+ const void *x,
146
+ const cudnnTensorDescriptor_t dyDesc,
147
+ const void *dy,
148
+ const cudnnConvolutionDescriptor_t convDesc,
149
+ cudnnConvolutionBwdFilterAlgo_t algo,
150
+ void *workSpace,
151
+ size_t workSpaceSizeInBytes,
152
+ const void *beta,
153
+ const cudnnFilterDescriptor_t dwDesc,
154
+ void *dw);
155
+
156
+ /* Function to compute the bias gradient for batch convolution */
157
+ cudnnStatus_t CUDNNWINAPI
158
+ cudnnConvolutionBackwardBias(cudnnHandle_t handle,
159
+ const void *alpha,
160
+ const cudnnTensorDescriptor_t dyDesc,
161
+ const void *dy,
162
+ const void *beta,
163
+ const cudnnTensorDescriptor_t dbDesc,
164
+ void *db);
165
+
166
+ cudnnStatus_t CUDNNWINAPI
167
+ cudnnCreateFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops);
168
+
169
+ cudnnStatus_t CUDNNWINAPI
170
+ cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack);
171
+
172
+ cudnnStatus_t CUDNNWINAPI
173
+ cudnnSetFusedOpsConstParamPackAttribute(cudnnFusedOpsConstParamPack_t constPack,
174
+ cudnnFusedOpsConstParamLabel_t paramLabel,
175
+ const void *param);
176
+
177
+ cudnnStatus_t CUDNNWINAPI
178
+ cudnnGetFusedOpsConstParamPackAttribute(const cudnnFusedOpsConstParamPack_t constPack,
179
+ cudnnFusedOpsConstParamLabel_t paramLabel,
180
+ void *param,
181
+ int *isNULL);
182
+
183
+ cudnnStatus_t CUDNNWINAPI
184
+ cudnnCreateFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops);
185
+
186
+ cudnnStatus_t CUDNNWINAPI
187
+ cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack);
188
+
189
+ cudnnStatus_t CUDNNWINAPI
190
+ cudnnSetFusedOpsVariantParamPackAttribute(cudnnFusedOpsVariantParamPack_t varPack,
191
+ cudnnFusedOpsVariantParamLabel_t paramLabel,
192
+ void *ptr);
193
+
194
+ cudnnStatus_t CUDNNWINAPI
195
+ cudnnGetFusedOpsVariantParamPackAttribute(const cudnnFusedOpsVariantParamPack_t varPack,
196
+ cudnnFusedOpsVariantParamLabel_t paramLabel,
197
+ void *ptr);
198
+
199
+ cudnnStatus_t CUDNNWINAPI
200
+ cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, cudnnFusedOps_t ops);
201
+
202
+ cudnnStatus_t CUDNNWINAPI
203
+ cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan);
204
+
205
+ cudnnStatus_t CUDNNWINAPI
206
+ cudnnMakeFusedOpsPlan(cudnnHandle_t handle,
207
+ cudnnFusedOpsPlan_t plan,
208
+ const cudnnFusedOpsConstParamPack_t constPack,
209
+ size_t *workspaceSizeInBytes);
210
+
211
+ cudnnStatus_t CUDNNWINAPI
212
+ cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, cudnnFusedOpsVariantParamPack_t varPack);
213
+
214
+ cudnnStatus_t CUDNNWINAPI
215
+ cudnnCnnTrainVersionCheck(void);
216
+
217
+ #if defined(__cplusplus)
218
+ }
219
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_train_v8.h ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_ops_train : cuDNN's basic training operations and algorithms.
52
+ */
53
+
54
+ #if !defined(CUDNN_OPS_TRAIN_H_)
55
+ #define CUDNN_OPS_TRAIN_H_
56
+
57
+ #include <cuda_runtime.h>
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops_infer.h"
62
+
63
+ /* These version numbers are autogenerated, do not edit manually. */
64
+ #define CUDNN_OPS_TRAIN_MAJOR 8
65
+ #define CUDNN_OPS_TRAIN_MINOR 7
66
+ #define CUDNN_OPS_TRAIN_PATCH 0
67
+
68
+ #if (CUDNN_OPS_TRAIN_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_TRAIN_MINOR != CUDNN_MINOR) || \
69
+ (CUDNN_OPS_TRAIN_PATCH != CUDNN_PATCHLEVEL)
70
+ #error Version mismatch in cuDNN OPS TRAIN!!!
71
+ #endif
72
+
73
+ #if defined(__cplusplus)
74
+ extern "C" {
75
+ #endif
76
+
77
+ /* Function to perform backward softmax */
78
+ cudnnStatus_t CUDNNWINAPI
79
+ cudnnSoftmaxBackward(cudnnHandle_t handle,
80
+ cudnnSoftmaxAlgorithm_t algo,
81
+ cudnnSoftmaxMode_t mode,
82
+ const void *alpha,
83
+ const cudnnTensorDescriptor_t yDesc,
84
+ const void *y,
85
+ const cudnnTensorDescriptor_t dyDesc,
86
+ const void *dy,
87
+ const void *beta,
88
+ const cudnnTensorDescriptor_t dxDesc,
89
+ void *dx);
90
+
91
+ /* Function to perform backward pooling */
92
+ cudnnStatus_t CUDNNWINAPI
93
+ cudnnPoolingBackward(cudnnHandle_t handle,
94
+ const cudnnPoolingDescriptor_t poolingDesc,
95
+ const void *alpha,
96
+ const cudnnTensorDescriptor_t yDesc,
97
+ const void *y,
98
+ const cudnnTensorDescriptor_t dyDesc,
99
+ const void *dy,
100
+ const cudnnTensorDescriptor_t xDesc,
101
+ const void *x,
102
+ const void *beta,
103
+ const cudnnTensorDescriptor_t dxDesc,
104
+ void *dx);
105
+
106
+ /* Function to perform backward activation */
107
+ cudnnStatus_t CUDNNWINAPI
108
+ cudnnActivationBackward(cudnnHandle_t handle,
109
+ cudnnActivationDescriptor_t activationDesc,
110
+ const void *alpha,
111
+ const cudnnTensorDescriptor_t yDesc,
112
+ const void *y,
113
+ const cudnnTensorDescriptor_t dyDesc,
114
+ const void *dy,
115
+ const cudnnTensorDescriptor_t xDesc,
116
+ const void *x,
117
+ const void *beta,
118
+ const cudnnTensorDescriptor_t dxDesc,
119
+ void *dx);
120
+
121
+ /* LRN cross-channel backward computation. Double parameters cast to tensor data type */
122
+ cudnnStatus_t CUDNNWINAPI
123
+ cudnnLRNCrossChannelBackward(cudnnHandle_t handle,
124
+ cudnnLRNDescriptor_t normDesc,
125
+ cudnnLRNMode_t lrnMode,
126
+ const void *alpha,
127
+ const cudnnTensorDescriptor_t yDesc,
128
+ const void *y,
129
+ const cudnnTensorDescriptor_t dyDesc,
130
+ const void *dy,
131
+ const cudnnTensorDescriptor_t xDesc,
132
+ const void *x,
133
+ const void *beta,
134
+ const cudnnTensorDescriptor_t dxDesc,
135
+ void *dx);
136
+
137
+ cudnnStatus_t CUDNNWINAPI
138
+ cudnnDivisiveNormalizationBackward(cudnnHandle_t handle,
139
+ cudnnLRNDescriptor_t normDesc,
140
+ cudnnDivNormMode_t mode,
141
+ const void *alpha,
142
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */
143
+ const void *x,
144
+ const void *means, /* if NULL, means are assumed to be zero */
145
+ const void *dy,
146
+ void *temp,
147
+ void *temp2,
148
+ const void *beta,
149
+ const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
150
+ void *dx, /* output x differential */
151
+ void *dMeans); /* output means differential, can be NULL */
152
+
153
+ cudnnStatus_t CUDNNWINAPI
154
+ cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle,
155
+ cudnnBatchNormMode_t mode,
156
+ cudnnBatchNormOps_t bnOps,
157
+ const cudnnTensorDescriptor_t xDesc,
158
+ const cudnnTensorDescriptor_t zDesc,
159
+ const cudnnTensorDescriptor_t yDesc,
160
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
161
+ const cudnnActivationDescriptor_t activationDesc,
162
+ size_t *sizeInBytes);
163
+
164
+ cudnnStatus_t CUDNNWINAPI
165
+ cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle,
166
+ cudnnBatchNormMode_t mode,
167
+ cudnnBatchNormOps_t bnOps,
168
+ const cudnnTensorDescriptor_t xDesc,
169
+ const cudnnTensorDescriptor_t yDesc,
170
+ const cudnnTensorDescriptor_t dyDesc,
171
+ const cudnnTensorDescriptor_t dzDesc,
172
+ const cudnnTensorDescriptor_t dxDesc,
173
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
174
+ const cudnnActivationDescriptor_t activationDesc,
175
+ size_t *sizeInBytes);
176
+
177
+ cudnnStatus_t CUDNNWINAPI
178
+ cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle,
179
+ cudnnBatchNormMode_t mode,
180
+ cudnnBatchNormOps_t bnOps,
181
+ const cudnnActivationDescriptor_t activationDesc,
182
+ const cudnnTensorDescriptor_t xDesc,
183
+ size_t *sizeInBytes);
184
+
185
+ /* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */
186
+ cudnnStatus_t CUDNNWINAPI
187
+ cudnnBatchNormalizationForwardTraining(
188
+ cudnnHandle_t handle,
189
+ cudnnBatchNormMode_t mode,
190
+
191
+ const void *alpha, /* alpha[0] = result blend factor */
192
+ const void *beta, /* beta[0] = dest layer blend factor */
193
+
194
+ const cudnnTensorDescriptor_t xDesc,
195
+ const void *x, /* NxCxHxW */
196
+ const cudnnTensorDescriptor_t yDesc,
197
+ void *y, /* NxCxHxW */
198
+
199
+ /* Shared desc for the next 6 tensors in the argument list.
200
+ Data type to be set as follows:
201
+ type = (typeOf(x) == double) ? double : float
202
+ Dimensions for this descriptor depend on normalization mode
203
+ - Spatial Normalization : tensors are expected to have dims 1xCx1x1
204
+ (normalization is performed across NxHxW)
205
+ - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW
206
+ (normalization is performed across N) */
207
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
208
+
209
+ /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
210
+ const void *bnScale,
211
+ const void *bnBias,
212
+
213
+ /* MUST use factor=1 in the very first call of a complete training cycle.
214
+ Use a factor=1/(1+n) at N-th call to the function to get
215
+ Cumulative Moving Average (CMA) behavior
216
+ CMA[n] = (x[1]+...+x[n])/n
217
+ Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
218
+ ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
219
+ CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
220
+ double exponentialAverageFactor,
221
+
222
+ /* Used in Training phase only.
223
+ runningMean = newMean*factor + runningMean*(1-factor) */
224
+ void *resultRunningMean,
225
+ /* Output in training mode, input in inference. Is the moving average
226
+ of variance[x] (factor is applied in the same way as for runningMean) */
227
+ void *resultRunningVariance,
228
+
229
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
230
+ double epsilon,
231
+
232
+ /* Optionally save intermediate results from the forward pass here
233
+ - can be reused to speed up backward pass. NULL if unused */
234
+ void *resultSaveMean,
235
+ void *resultSaveInvVariance);
236
+
237
+ /* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */
238
+ cudnnStatus_t CUDNNWINAPI
239
+ cudnnBatchNormalizationForwardTrainingEx(
240
+ cudnnHandle_t handle,
241
+ cudnnBatchNormMode_t mode,
242
+ cudnnBatchNormOps_t bnOps,
243
+
244
+ const void *alpha, /* alpha[0] = result blend factor */
245
+ const void *beta, /* beta[0] = dest layer blend factor */
246
+
247
+ const cudnnTensorDescriptor_t xDesc,
248
+ const void *xData,
249
+ const cudnnTensorDescriptor_t zDesc,
250
+ const void *zData,
251
+ const cudnnTensorDescriptor_t yDesc,
252
+ void *yData,
253
+
254
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
255
+ const void *bnScale,
256
+ const void *bnBias,
257
+
258
+ double exponentialAverageFactor,
259
+ void *resultRunningMean,
260
+ void *resultRunningVariance,
261
+
262
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
263
+ double epsilon,
264
+
265
+ /* Optionally save intermediate results from the forward pass here
266
+ - can be reused to speed up backward pass. NULL if unused */
267
+ void *resultSaveMean,
268
+ void *resultSaveInvVariance,
269
+
270
+ cudnnActivationDescriptor_t activationDesc,
271
+ void *workspace,
272
+ size_t workSpaceSizeInBytes,
273
+ void *reserveSpace,
274
+ size_t reserveSpaceSizeInBytes);
275
+
276
+ /* Performs backward pass of Batch Normalization layer. Returns x gradient,
277
+ * bnScale gradient and bnBias gradient */
278
+ cudnnStatus_t CUDNNWINAPI
279
+ cudnnBatchNormalizationBackward(cudnnHandle_t handle,
280
+ cudnnBatchNormMode_t mode,
281
+ const void *alphaDataDiff,
282
+ const void *betaDataDiff,
283
+ const void *alphaParamDiff,
284
+ const void *betaParamDiff,
285
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
286
+ const void *x,
287
+ const cudnnTensorDescriptor_t dyDesc,
288
+ const void *dy,
289
+ const cudnnTensorDescriptor_t dxDesc,
290
+ void *dx,
291
+ /* Shared tensor desc for the 4 tensors below */
292
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
293
+ const void *bnScale, /* bnBias doesn't affect backpropagation */
294
+ /* scale and bias diff are not backpropagated below this layer */
295
+ void *dBnScaleResult,
296
+ void *dBnBiasResult,
297
+ /* Same epsilon as forward pass */
298
+ double epsilon,
299
+
300
+ /* Optionally cached intermediate results from
301
+ forward pass */
302
+ const void *savedMean,
303
+ const void *savedInvVariance);
304
+
305
+ cudnnStatus_t CUDNNWINAPI
306
+ cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle,
307
+ cudnnBatchNormMode_t mode,
308
+ cudnnBatchNormOps_t bnOps,
309
+
310
+ const void *alphaDataDiff,
311
+ const void *betaDataDiff,
312
+ const void *alphaParamDiff,
313
+ const void *betaParamDiff,
314
+ const cudnnTensorDescriptor_t xDesc,
315
+ const void *xData,
316
+ const cudnnTensorDescriptor_t yDesc,
317
+ const void *yData,
318
+ const cudnnTensorDescriptor_t dyDesc,
319
+ const void *dyData,
320
+ const cudnnTensorDescriptor_t dzDesc,
321
+ void *dzData,
322
+ const cudnnTensorDescriptor_t dxDesc,
323
+ void *dxData,
324
+
325
+ /* Shared tensor desc for the 4 tensors below */
326
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
327
+ const void *bnScaleData,
328
+ const void *bnBiasData, /* needed if there is activation */
329
+ void *dBnScaleData,
330
+ void *dBnBiasData,
331
+ double epsilon, /* Same epsilon as forward pass */
332
+
333
+ /* Optionally cached intermediate results from
334
+ forward pass */
335
+ const void *savedMean,
336
+ const void *savedInvVariance,
337
+ cudnnActivationDescriptor_t activationDesc,
338
+ void *workSpace,
339
+ size_t workSpaceSizeInBytes,
340
+ void *reserveSpace,
341
+ size_t reserveSpaceSizeInBytes);
342
+
343
+ cudnnStatus_t CUDNNWINAPI
344
+ cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle,
345
+ cudnnNormMode_t mode,
346
+ cudnnNormOps_t normOps,
347
+ cudnnNormAlgo_t algo,
348
+ const cudnnTensorDescriptor_t xDesc,
349
+ const cudnnTensorDescriptor_t zDesc,
350
+ const cudnnTensorDescriptor_t yDesc,
351
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
352
+ const cudnnActivationDescriptor_t activationDesc,
353
+ const cudnnTensorDescriptor_t normMeanVarDesc,
354
+ size_t *sizeInBytes,
355
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
356
+
357
+ cudnnStatus_t CUDNNWINAPI
358
+ cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle,
359
+ cudnnNormMode_t mode,
360
+ cudnnNormOps_t normOps,
361
+ cudnnNormAlgo_t algo,
362
+ const cudnnTensorDescriptor_t xDesc,
363
+ const cudnnTensorDescriptor_t yDesc,
364
+ const cudnnTensorDescriptor_t dyDesc,
365
+ const cudnnTensorDescriptor_t dzDesc,
366
+ const cudnnTensorDescriptor_t dxDesc,
367
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
368
+ const cudnnActivationDescriptor_t activationDesc,
369
+ const cudnnTensorDescriptor_t normMeanVarDesc,
370
+ size_t *sizeInBytes,
371
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
372
+
373
+ cudnnStatus_t CUDNNWINAPI
374
+ cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle,
375
+ cudnnNormMode_t mode,
376
+ cudnnNormOps_t normOps,
377
+ cudnnNormAlgo_t algo,
378
+ const cudnnActivationDescriptor_t activationDesc,
379
+ const cudnnTensorDescriptor_t xDesc,
380
+ size_t *sizeInBytes,
381
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
382
+
383
+ /* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */
384
+ cudnnStatus_t CUDNNWINAPI
385
+ cudnnNormalizationForwardTraining(cudnnHandle_t handle,
386
+ cudnnNormMode_t mode,
387
+ cudnnNormOps_t normOps,
388
+ cudnnNormAlgo_t algo,
389
+ const void *alpha, /* alpha[0] = result blend factor */
390
+ const void *beta, /* beta[0] = dest layer blend factor */
391
+ const cudnnTensorDescriptor_t xDesc,
392
+ const void *xData,
393
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
394
+ const void *normScale,
395
+ const void *normBias,
396
+ double exponentialAverageFactor,
397
+ const cudnnTensorDescriptor_t normMeanVarDesc,
398
+ void *resultRunningMean,
399
+ void *resultRunningVariance,
400
+ /* Has to be >= 0. Should be the same in forward and backward functions. */
401
+ double epsilon,
402
+ /* Optionally save intermediate results from the forward pass here
403
+ - can be reused to speed up backward pass. NULL if unused */
404
+ void *resultSaveMean,
405
+ void *resultSaveInvVariance,
406
+ cudnnActivationDescriptor_t activationDesc,
407
+ const cudnnTensorDescriptor_t zDesc,
408
+ const void *zData,
409
+ const cudnnTensorDescriptor_t yDesc,
410
+ void *yData,
411
+ void *workspace,
412
+ size_t workSpaceSizeInBytes,
413
+ void *reserveSpace,
414
+ size_t reserveSpaceSizeInBytes,
415
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
416
+
417
+ cudnnStatus_t CUDNNWINAPI
418
+ cudnnNormalizationBackward(cudnnHandle_t handle,
419
+ cudnnNormMode_t mode,
420
+ cudnnNormOps_t normOps,
421
+ cudnnNormAlgo_t algo,
422
+ const void *alphaDataDiff,
423
+ const void *betaDataDiff,
424
+ const void *alphaParamDiff,
425
+ const void *betaParamDiff,
426
+ const cudnnTensorDescriptor_t xDesc,
427
+ const void *xData,
428
+ const cudnnTensorDescriptor_t yDesc,
429
+ const void *yData,
430
+ const cudnnTensorDescriptor_t dyDesc,
431
+ const void *dyData,
432
+ const cudnnTensorDescriptor_t dzDesc,
433
+ void *dzData,
434
+ const cudnnTensorDescriptor_t dxDesc,
435
+ void *dxData,
436
+ /* Shared tensor desc for the 4 tensors below */
437
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
438
+ const void *normScaleData,
439
+ const void *normBiasData, /* needed if there is activation */
440
+ void *dNormScaleData,
441
+ void *dNormBiasData,
442
+ double epsilon, /* Same epsilon as forward pass */
443
+ const cudnnTensorDescriptor_t normMeanVarDesc,
444
+ /* Optionally cached intermediate results from
445
+ forward pass */
446
+ const void *savedMean,
447
+ const void *savedInvVariance,
448
+ cudnnActivationDescriptor_t activationDesc,
449
+ void *workSpace,
450
+ size_t workSpaceSizeInBytes,
451
+ void *reserveSpace,
452
+ size_t reserveSpaceSizeInBytes,
453
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
454
+
455
+ cudnnStatus_t CUDNNWINAPI
456
+ cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle,
457
+ const cudnnSpatialTransformerDescriptor_t stDesc,
458
+ const void *dgrid,
459
+ void *dtheta);
460
+
461
+ cudnnStatus_t CUDNNWINAPI
462
+ cudnnSpatialTfSamplerBackward(cudnnHandle_t handle,
463
+ cudnnSpatialTransformerDescriptor_t stDesc,
464
+ const void *alpha,
465
+ const cudnnTensorDescriptor_t xDesc,
466
+ const void *x,
467
+ const void *beta,
468
+ const cudnnTensorDescriptor_t dxDesc,
469
+ void *dx,
470
+ const void *alphaDgrid,
471
+ const cudnnTensorDescriptor_t dyDesc,
472
+ const void *dy,
473
+ const void *grid,
474
+ const void *betaDgrid,
475
+ void *dgrid);
476
+
477
+ cudnnStatus_t CUDNNWINAPI
478
+ cudnnDropoutBackward(cudnnHandle_t handle,
479
+ const cudnnDropoutDescriptor_t dropoutDesc,
480
+ const cudnnTensorDescriptor_t dydesc,
481
+ const void *dy,
482
+ const cudnnTensorDescriptor_t dxdesc,
483
+ void *dx,
484
+ void *reserveSpace,
485
+ size_t reserveSpaceSizeInBytes);
486
+
487
+ /*
488
+ * \brief Cross-library version checker.
489
+ * This function is implemented differently in each sub-library. Each sublib
490
+ * checks whether its own version matches that of its dependencies.
491
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
492
+ * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
493
+ */
494
+ cudnnStatus_t CUDNNWINAPI
495
+ cudnnOpsTrainVersionCheck(void);
496
+
497
+ #if defined(__cplusplus)
498
+ }
499
+ #endif
500
+
501
+ #endif /* CUDNN_OPS_TRAIN_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_v8.h ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn : Neural Networks Library
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_H_)
55
+ #define CUDNN_H_
56
+
57
+ #include <cuda_runtime.h>
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops_infer.h"
62
+ #include "cudnn_ops_train.h"
63
+ #include "cudnn_adv_infer.h"
64
+ #include "cudnn_adv_train.h"
65
+ #include "cudnn_cnn_infer.h"
66
+ #include "cudnn_cnn_train.h"
67
+
68
+ #include "cudnn_backend.h"
69
+
70
+ #if defined(__cplusplus)
71
+ extern "C" {
72
+ #endif
73
+
74
+ #if defined(__cplusplus)
75
+ }
76
+ #endif
77
+
78
+ #endif /* CUDNN_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version_v8.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /**
51
+ * \file: The master cuDNN version file.
52
+ */
53
+
54
+ #ifndef CUDNN_VERSION_H_
55
+ #define CUDNN_VERSION_H_
56
+
57
+ #define CUDNN_MAJOR 8
58
+ #define CUDNN_MINOR 7
59
+ #define CUDNN_PATCHLEVEL 0
60
+
61
+ #define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
62
+
63
+ /* cannot use constexpr here since this is a C-only file */
64
+ /* Below is the max SM version this cuDNN library is aware of and supports natively */
65
+
66
+ #define CUDNN_MAX_SM_MAJOR_NUMBER 9
67
+ #define CUDNN_MAX_SM_MINOR_NUMBER 0
68
+ #define CUDNN_MAX_DEVICE_VERSION (CUDNN_MAX_SM_MAJOR_NUMBER * 100) + (CUDNN_MAX_SM_MINOR_NUMBER * 10)
69
+
70
+ #endif /* CUDNN_VERSION_H */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_kernel.h ADDED
@@ -0,0 +1,1665 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_KERNEL_H_)
52
+ #define CURAND_KERNEL_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #if !defined(QUALIFIERS)
61
+ #define QUALIFIERS static __forceinline__ __device__
62
+ #endif
63
+
64
+
65
+ #ifdef __CUDACC_RTC__
66
+ #define CURAND_DETAIL_USE_CUDA_STL
67
+ #endif
68
+
69
+ #if __cplusplus >= 201103L
70
+ # ifdef CURAND_DETAIL_USE_CUDA_STL
71
+ # define CURAND_STD cuda::std
72
+ # include <cuda/std/type_traits>
73
+ # else
74
+ # define CURAND_STD std
75
+ # include <type_traits>
76
+ # endif // CURAND_DETAIL_USE_CUDA_STL
77
+ #else
78
+ // To support C++03 compilation
79
+ # define CURAND_STD curand_detail
80
+ namespace curand_detail {
81
+ template<bool B, class T = void>
82
+ struct enable_if {};
83
+
84
+ template<class T>
85
+ struct enable_if<true, T> { typedef T type; };
86
+
87
+ template<class T, class U>
88
+ struct is_same { static const bool value = false; };
89
+
90
+ template<class T>
91
+ struct is_same<T, T> { static const bool value = true; };
92
+ } // namespace curand_detail
93
+ #endif // __cplusplus >= 201103L
94
+
95
+ #ifndef __CUDACC_RTC__
96
+ #include <math.h>
97
+ #endif // __CUDACC_RTC__
98
+
99
+ #include "curand.h"
100
+ #include "curand_discrete.h"
101
+ #include "curand_precalc.h"
102
+ #include "curand_mrg32k3a.h"
103
+ #include "curand_mtgp32_kernel.h"
104
+ #include "curand_philox4x32_x.h"
105
+ #include "curand_globals.h"
106
+
107
+ /* Test RNG */
108
+ /* This generator uses the formula:
109
+ x_n = x_(n-1) + 1 mod 2^32
110
+ x_0 = (unsigned int)seed * 3
111
+ Subsequences are spaced 31337 steps apart.
112
+ */
113
+ struct curandStateTest {
114
+ unsigned int v;
115
+ };
116
+
117
+ /** \cond UNHIDE_TYPEDEFS */
118
+ typedef struct curandStateTest curandStateTest_t;
119
+ /** \endcond */
120
+
121
+ /* XORSHIFT FAMILY RNGs */
122
+ /* These generators are a family proposed by Marsaglia. They keep state
123
+ in 32 bit chunks, then use repeated shift and xor operations to scramble
124
+ the bits. The following generators are a combination of a simple Weyl
125
+ generator with an N variable XORSHIFT generator.
126
+ */
127
+
128
+ /* XORSHIFT RNG */
129
+ /* This generator uses the xorwow formula of
130
+ www.jstatsoft.org/v08/i14/paper page 5
131
+ Has period 2^192 - 2^32.
132
+ */
133
+ /**
134
+ * CURAND XORWOW state
135
+ */
136
+ struct curandStateXORWOW;
137
+
138
+ /*
139
+ * Implementation details not in reference documentation */
140
+ struct curandStateXORWOW {
141
+ unsigned int d, v[5];
142
+ int boxmuller_flag;
143
+ int boxmuller_flag_double;
144
+ float boxmuller_extra;
145
+ double boxmuller_extra_double;
146
+ };
147
+
148
+ /*
149
+ * CURAND XORWOW state
150
+ */
151
+ /** \cond UNHIDE_TYPEDEFS */
152
+ typedef struct curandStateXORWOW curandStateXORWOW_t;
153
+
154
+ #define EXTRA_FLAG_NORMAL 0x00000001
155
+ #define EXTRA_FLAG_LOG_NORMAL 0x00000002
156
+ /** \endcond */
157
+
158
+ /* Combined Multiple Recursive Generators */
159
+ /* These generators are a family proposed by L'Ecuyer. They keep state
160
+ in sets of doubles, then use repeated modular arithmetic multiply operations
161
+ to scramble the bits in each set, and combine the result.
162
+ */
163
+
164
+ /* MRG32k3a RNG */
165
+ /* This generator uses the MRG32k3A formula of
166
+ http://www.iro.umontreal.ca/~lecuyer/myftp/streams00/c++/streams4.pdf
167
+ Has period 2^191.
168
+ */
169
+
170
+ /* moduli for the recursions */
171
+ /** \cond UNHIDE_DEFINES */
172
+ #define MRG32K3A_MOD1 4294967087.
173
+ #define MRG32K3A_MOD2 4294944443.
174
+
175
+ /* Constants used in generation */
176
+
177
+ #define MRG32K3A_A12 1403580.
178
+ #define MRG32K3A_A13N 810728.
179
+ #define MRG32K3A_A21 527612.
180
+ #define MRG32K3A_A23N 1370589.
181
+ #define MRG32K3A_NORM (2.3283065498378288e-10)
182
+ //
183
+ // #define MRG32K3A_BITS_NORM ((double)((POW32_DOUBLE-1.0)/MOD1))
184
+ // above constant, used verbatim, rounds differently on some host systems.
185
+ #define MRG32K3A_BITS_NORM 1.000000048662
186
+
187
+ /** \endcond */
188
+
189
+
190
+
191
+
192
+ /**
193
+ * CURAND MRG32K3A state
194
+ */
195
+ struct curandStateMRG32k3a;
196
+
197
+ /* Implementation details not in reference documentation */
198
+ struct curandStateMRG32k3a {
199
+ unsigned int s1[3];
200
+ unsigned int s2[3];
201
+ int boxmuller_flag;
202
+ int boxmuller_flag_double;
203
+ float boxmuller_extra;
204
+ double boxmuller_extra_double;
205
+ };
206
+
207
+ /*
208
+ * CURAND MRG32K3A state
209
+ */
210
+ /** \cond UNHIDE_TYPEDEFS */
211
+ typedef struct curandStateMRG32k3a curandStateMRG32k3a_t;
212
+ /** \endcond */
213
+
214
+ /* SOBOL QRNG */
215
+ /**
216
+ * CURAND Sobol32 state
217
+ */
218
+ struct curandStateSobol32;
219
+
220
+ /* Implementation details not in reference documentation */
221
+ struct curandStateSobol32 {
222
+ unsigned int i, x, c;
223
+ unsigned int direction_vectors[32];
224
+ };
225
+
226
+ /*
227
+ * CURAND Sobol32 state
228
+ */
229
+ /** \cond UNHIDE_TYPEDEFS */
230
+ typedef struct curandStateSobol32 curandStateSobol32_t;
231
+ /** \endcond */
232
+
233
+ /**
234
+ * CURAND Scrambled Sobol32 state
235
+ */
236
+ struct curandStateScrambledSobol32;
237
+
238
+ /* Implementation details not in reference documentation */
239
+ struct curandStateScrambledSobol32 {
240
+ unsigned int i, x, c;
241
+ unsigned int direction_vectors[32];
242
+ };
243
+
244
+ /*
245
+ * CURAND Scrambled Sobol32 state
246
+ */
247
+ /** \cond UNHIDE_TYPEDEFS */
248
+ typedef struct curandStateScrambledSobol32 curandStateScrambledSobol32_t;
249
+ /** \endcond */
250
+
251
+ /**
252
+ * CURAND Sobol64 state
253
+ */
254
+ struct curandStateSobol64;
255
+
256
+ /* Implementation details not in reference documentation */
257
+ struct curandStateSobol64 {
258
+ unsigned long long i, x, c;
259
+ unsigned long long direction_vectors[64];
260
+ };
261
+
262
+ /*
263
+ * CURAND Sobol64 state
264
+ */
265
+ /** \cond UNHIDE_TYPEDEFS */
266
+ typedef struct curandStateSobol64 curandStateSobol64_t;
267
+ /** \endcond */
268
+
269
+ /**
270
+ * CURAND Scrambled Sobol64 state
271
+ */
272
+ struct curandStateScrambledSobol64;
273
+
274
+ /* Implementation details not in reference documentation */
275
+ struct curandStateScrambledSobol64 {
276
+ unsigned long long i, x, c;
277
+ unsigned long long direction_vectors[64];
278
+ };
279
+
280
+ /*
281
+ * CURAND Scrambled Sobol64 state
282
+ */
283
+ /** \cond UNHIDE_TYPEDEFS */
284
+ typedef struct curandStateScrambledSobol64 curandStateScrambledSobol64_t;
285
+ /** \endcond */
286
+
287
+ /*
288
+ * Default RNG
289
+ */
290
+ /** \cond UNHIDE_TYPEDEFS */
291
+ typedef struct curandStateXORWOW curandState_t;
292
+ typedef struct curandStateXORWOW curandState;
293
+ /** \endcond */
294
+
295
+ /****************************************************************************/
296
+ /* Utility functions needed by RNGs */
297
+ /****************************************************************************/
298
+ /** \cond UNHIDE_UTILITIES */
299
+ /*
300
+ multiply vector by matrix, store in result
301
+ matrix is n x n, measured in 32 bit units
302
+ matrix is stored in row major order
303
+ vector and result cannot be same pointer
304
+ */
305
+ template<int N>
306
+ QUALIFIERS void __curand_matvec_inplace(unsigned int *vector, unsigned int *matrix)
307
+ {
308
+ unsigned int result[N] = { 0 };
309
+ for(int i = 0; i < N; i++) {
310
+ #ifdef __CUDA_ARCH__
311
+ #pragma unroll 16
312
+ #endif
313
+ for(int j = 0; j < 32; j++) {
314
+ if(vector[i] & (1 << j)) {
315
+ for(int k = 0; k < N; k++) {
316
+ result[k] ^= matrix[N * (i * 32 + j) + k];
317
+ }
318
+ }
319
+ }
320
+ }
321
+ for(int i = 0; i < N; i++) {
322
+ vector[i] = result[i];
323
+ }
324
+ }
325
+
326
+ QUALIFIERS void __curand_matvec(unsigned int *vector, unsigned int *matrix,
327
+ unsigned int *result, int n)
328
+ {
329
+ for(int i = 0; i < n; i++) {
330
+ result[i] = 0;
331
+ }
332
+ for(int i = 0; i < n; i++) {
333
+ for(int j = 0; j < 32; j++) {
334
+ if(vector[i] & (1 << j)) {
335
+ for(int k = 0; k < n; k++) {
336
+ result[k] ^= matrix[n * (i * 32 + j) + k];
337
+ }
338
+ }
339
+ }
340
+ }
341
+ }
342
+
343
+ /* generate identity matrix */
344
+ QUALIFIERS void __curand_matidentity(unsigned int *matrix, int n)
345
+ {
346
+ int r;
347
+ for(int i = 0; i < n * 32; i++) {
348
+ for(int j = 0; j < n; j++) {
349
+ r = i & 31;
350
+ if(i / 32 == j) {
351
+ matrix[i * n + j] = (1 << r);
352
+ } else {
353
+ matrix[i * n + j] = 0;
354
+ }
355
+ }
356
+ }
357
+ }
358
+
359
+ /* multiply matrixA by matrixB, store back in matrixA
360
+ matrixA and matrixB must not be same matrix */
361
+ QUALIFIERS void __curand_matmat(unsigned int *matrixA, unsigned int *matrixB, int n)
362
+ {
363
+ unsigned int result[MAX_XOR_N];
364
+ for(int i = 0; i < n * 32; i++) {
365
+ __curand_matvec(matrixA + i * n, matrixB, result, n);
366
+ for(int j = 0; j < n; j++) {
367
+ matrixA[i * n + j] = result[j];
368
+ }
369
+ }
370
+ }
371
+
372
+ /* copy vectorA to vector */
373
+ QUALIFIERS void __curand_veccopy(unsigned int *vector, unsigned int *vectorA, int n)
374
+ {
375
+ for(int i = 0; i < n; i++) {
376
+ vector[i] = vectorA[i];
377
+ }
378
+ }
379
+
380
+ /* copy matrixA to matrix */
381
+ QUALIFIERS void __curand_matcopy(unsigned int *matrix, unsigned int *matrixA, int n)
382
+ {
383
+ for(int i = 0; i < n * n * 32; i++) {
384
+ matrix[i] = matrixA[i];
385
+ }
386
+ }
387
+
388
+ /* compute matrixA to power p, store result in matrix */
389
+ QUALIFIERS void __curand_matpow(unsigned int *matrix, unsigned int *matrixA,
390
+ unsigned long long p, int n)
391
+ {
392
+ unsigned int matrixR[MAX_XOR_N * MAX_XOR_N * 32];
393
+ unsigned int matrixS[MAX_XOR_N * MAX_XOR_N * 32];
394
+ __curand_matidentity(matrix, n);
395
+ __curand_matcopy(matrixR, matrixA, n);
396
+ while(p) {
397
+ if(p & 1) {
398
+ __curand_matmat(matrix, matrixR, n);
399
+ }
400
+ __curand_matcopy(matrixS, matrixR, n);
401
+ __curand_matmat(matrixR, matrixS, n);
402
+ p >>= 1;
403
+ }
404
+ }
405
+
406
+ /****************************************************************************/
407
+ /* Utility functions needed by MRG32k3a RNG */
408
+ /* Matrix operations modulo some integer less than 2**32, done in */
409
+ /* double precision floating point, with care not to overflow 53 bits */
410
+ /****************************************************************************/
411
+
412
+ /* return i mod m. */
413
+ /* assumes i and m are integers represented accurately in doubles */
414
+
415
+ QUALIFIERS double curand_MRGmod(double i, double m)
416
+ {
417
+ double quo;
418
+ double rem;
419
+ quo = floor(i/m);
420
+ rem = i - (quo*m);
421
+ if (rem < 0.0) rem += m;
422
+ return rem;
423
+ }
424
+
425
+ /* Multiplication modulo m. Inputs i and j less than 2**32 */
426
+ /* Ensure intermediate results do not exceed 2**53 */
427
+
428
+ QUALIFIERS double curand_MRGmodMul(double i, double j, double m)
429
+ {
430
+ double tempHi;
431
+ double tempLo;
432
+
433
+ tempHi = floor(i/131072.0);
434
+ tempLo = i - (tempHi*131072.0);
435
+ tempLo = curand_MRGmod( curand_MRGmod( (tempHi * j), m) * 131072.0 + curand_MRGmod(tempLo * j, m),m);
436
+
437
+ if (tempLo < 0.0) tempLo += m;
438
+ return tempLo;
439
+ }
440
+
441
+ /* multiply 3 by 3 matrices of doubles, modulo m */
442
+
443
+ QUALIFIERS void curand_MRGmatMul3x3(unsigned int i1[][3],unsigned int i2[][3],unsigned int o[][3],double m)
444
+ {
445
+ int i,j;
446
+ double temp[3][3];
447
+ for (i=0; i<3; i++){
448
+ for (j=0; j<3; j++){
449
+ temp[i][j] = ( curand_MRGmodMul(i1[i][0], i2[0][j], m) +
450
+ curand_MRGmodMul(i1[i][1], i2[1][j], m) +
451
+ curand_MRGmodMul(i1[i][2], i2[2][j], m));
452
+ temp[i][j] = curand_MRGmod( temp[i][j], m );
453
+ }
454
+ }
455
+ for (i=0; i<3; i++){
456
+ for (j=0; j<3; j++){
457
+ o[i][j] = (unsigned int)temp[i][j];
458
+ }
459
+ }
460
+ }
461
+
462
+ /* multiply 3 by 3 matrix times 3 by 1 vector of doubles, modulo m */
463
+
464
+ QUALIFIERS void curand_MRGmatVecMul3x3( unsigned int i[][3], unsigned int v[], double m)
465
+ {
466
+ int k;
467
+ double t[3];
468
+ for (k = 0; k < 3; k++) {
469
+ t[k] = ( curand_MRGmodMul(i[k][0], v[0], m) +
470
+ curand_MRGmodMul(i[k][1], v[1], m) +
471
+ curand_MRGmodMul(i[k][2], v[2], m) );
472
+ t[k] = curand_MRGmod( t[k], m );
473
+ }
474
+ for (k = 0; k < 3; k++) {
475
+ v[k] = (unsigned int)t[k];
476
+ }
477
+
478
+ }
479
+
480
+ /* raise a 3 by 3 matrix of doubles to a 64 bit integer power pow, modulo m */
481
+ /* input is index zero of an array of 3 by 3 matrices m, */
482
+ /* each m = m[0]**(2**index) */
483
+
484
+ QUALIFIERS void curand_MRGmatPow3x3( unsigned int in[][3][3], unsigned int o[][3], double m, unsigned long long pow )
485
+ {
486
+ int i,j;
487
+ for ( i = 0; i < 3; i++ ) {
488
+ for ( j = 0; j < 3; j++ ) {
489
+ o[i][j] = 0;
490
+ if ( i == j ) o[i][j] = 1;
491
+ }
492
+ }
493
+ i = 0;
494
+ curand_MRGmatVecMul3x3(o,o[0],m);
495
+ while (pow) {
496
+ if ( pow & 1ll ) {
497
+ curand_MRGmatMul3x3(in[i], o, o, m);
498
+ }
499
+ i++;
500
+ pow >>= 1;
501
+ }
502
+ }
503
+
504
+ /* raise a 3 by 3 matrix of doubles to the power */
505
+ /* 2 to the power (pow modulo 191), modulo m */
506
+
507
+ QUALIFIERS void curnand_MRGmatPow2Pow3x3( double in[][3], double o[][3], double m, unsigned long pow )
508
+ {
509
+ unsigned int temp[3][3];
510
+ int i,j;
511
+ pow = pow % 191;
512
+ for ( i = 0; i < 3; i++ ) {
513
+ for ( j = 0; j < 3; j++ ) {
514
+ temp[i][j] = (unsigned int)in[i][j];
515
+ }
516
+ }
517
+ while (pow) {
518
+ curand_MRGmatMul3x3(temp, temp, temp, m);
519
+ pow--;
520
+ }
521
+ for ( i = 0; i < 3; i++ ) {
522
+ for ( j = 0; j < 3; j++ ) {
523
+ o[i][j] = temp[i][j];
524
+ }
525
+ }
526
+ }
527
+
528
+ /** \endcond */
529
+
530
+ /****************************************************************************/
531
+ /* Kernel implementations of RNGs */
532
+ /****************************************************************************/
533
+
534
+ /* Test RNG */
535
+
536
+ QUALIFIERS void curand_init(unsigned long long seed,
537
+ unsigned long long subsequence,
538
+ unsigned long long offset,
539
+ curandStateTest_t *state)
540
+ {
541
+ state->v = (unsigned int)(seed * 3) + (unsigned int)(subsequence * 31337) + \
542
+ (unsigned int)offset;
543
+ }
544
+
545
+
546
+ QUALIFIERS unsigned int curand(curandStateTest_t *state)
547
+ {
548
+ unsigned int r = state->v++;
549
+ return r;
550
+ }
551
+
552
+ QUALIFIERS void skipahead(unsigned long long n, curandStateTest_t *state)
553
+ {
554
+ state->v += (unsigned int)n;
555
+ }
556
+
557
+ /* XORWOW RNG */
558
+
559
+ template <typename T, int n>
560
+ QUALIFIERS void __curand_generate_skipahead_matrix_xor(unsigned int matrix[])
561
+ {
562
+ T state;
563
+ // Generate matrix that advances one step
564
+ // matrix has n * n * 32 32-bit elements
565
+ // solve for matrix by stepping single bit states
566
+ for(int i = 0; i < 32 * n; i++) {
567
+ state.d = 0;
568
+ for(int j = 0; j < n; j++) {
569
+ state.v[j] = 0;
570
+ }
571
+ state.v[i / 32] = (1 << (i & 31));
572
+ curand(&state);
573
+ for(int j = 0; j < n; j++) {
574
+ matrix[i * n + j] = state.v[j];
575
+ }
576
+ }
577
+ }
578
+
579
+ template <typename T, int n>
580
+ QUALIFIERS void _skipahead_scratch(unsigned long long x, T *state, unsigned int *scratch)
581
+ {
582
+ // unsigned int matrix[n * n * 32];
583
+ unsigned int *matrix = scratch;
584
+ // unsigned int matrixA[n * n * 32];
585
+ unsigned int *matrixA = scratch + (n * n * 32);
586
+ // unsigned int vector[n];
587
+ unsigned int *vector = scratch + (n * n * 32) + (n * n * 32);
588
+ // unsigned int result[n];
589
+ unsigned int *result = scratch + (n * n * 32) + (n * n * 32) + n;
590
+ unsigned long long p = x;
591
+ for(int i = 0; i < n; i++) {
592
+ vector[i] = state->v[i];
593
+ }
594
+ int matrix_num = 0;
595
+ while(p && (matrix_num < PRECALC_NUM_MATRICES - 1)) {
596
+ for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
597
+ #ifdef __CUDA_ARCH__
598
+ __curand_matvec(vector, precalc_xorwow_offset_matrix[matrix_num], result, n);
599
+ #else
600
+ __curand_matvec(vector, precalc_xorwow_offset_matrix_host[matrix_num], result, n);
601
+ #endif
602
+ __curand_veccopy(vector, result, n);
603
+ }
604
+ p >>= PRECALC_BLOCK_SIZE;
605
+ matrix_num++;
606
+ }
607
+ if(p) {
608
+ #ifdef __CUDA_ARCH__
609
+ __curand_matcopy(matrix, precalc_xorwow_offset_matrix[PRECALC_NUM_MATRICES - 1], n);
610
+ __curand_matcopy(matrixA, precalc_xorwow_offset_matrix[PRECALC_NUM_MATRICES - 1], n);
611
+ #else
612
+ __curand_matcopy(matrix, precalc_xorwow_offset_matrix_host[PRECALC_NUM_MATRICES - 1], n);
613
+ __curand_matcopy(matrixA, precalc_xorwow_offset_matrix_host[PRECALC_NUM_MATRICES - 1], n);
614
+ #endif
615
+ }
616
+ while(p) {
617
+ for(unsigned int t = 0; t < (p & SKIPAHEAD_MASK); t++) {
618
+ __curand_matvec(vector, matrixA, result, n);
619
+ __curand_veccopy(vector, result, n);
620
+ }
621
+ p >>= SKIPAHEAD_BLOCKSIZE;
622
+ if(p) {
623
+ for(int i = 0; i < SKIPAHEAD_BLOCKSIZE; i++) {
624
+ __curand_matmat(matrix, matrixA, n);
625
+ __curand_matcopy(matrixA, matrix, n);
626
+ }
627
+ }
628
+ }
629
+ for(int i = 0; i < n; i++) {
630
+ state->v[i] = vector[i];
631
+ }
632
+ state->d += 362437 * (unsigned int)x;
633
+ }
634
+
635
+ template <typename T, int n>
636
+ QUALIFIERS void _skipahead_sequence_scratch(unsigned long long x, T *state, unsigned int *scratch)
637
+ {
638
+ // unsigned int matrix[n * n * 32];
639
+ unsigned int *matrix = scratch;
640
+ // unsigned int matrixA[n * n * 32];
641
+ unsigned int *matrixA = scratch + (n * n * 32);
642
+ // unsigned int vector[n];
643
+ unsigned int *vector = scratch + (n * n * 32) + (n * n * 32);
644
+ // unsigned int result[n];
645
+ unsigned int *result = scratch + (n * n * 32) + (n * n * 32) + n;
646
+ unsigned long long p = x;
647
+ for(int i = 0; i < n; i++) {
648
+ vector[i] = state->v[i];
649
+ }
650
+ int matrix_num = 0;
651
+ while(p && matrix_num < PRECALC_NUM_MATRICES - 1) {
652
+ for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
653
+ #ifdef __CUDA_ARCH__
654
+ __curand_matvec(vector, precalc_xorwow_matrix[matrix_num], result, n);
655
+ #else
656
+ __curand_matvec(vector, precalc_xorwow_matrix_host[matrix_num], result, n);
657
+ #endif
658
+ __curand_veccopy(vector, result, n);
659
+ }
660
+ p >>= PRECALC_BLOCK_SIZE;
661
+ matrix_num++;
662
+ }
663
+ if(p) {
664
+ #ifdef __CUDA_ARCH__
665
+ __curand_matcopy(matrix, precalc_xorwow_matrix[PRECALC_NUM_MATRICES - 1], n);
666
+ __curand_matcopy(matrixA, precalc_xorwow_matrix[PRECALC_NUM_MATRICES - 1], n);
667
+ #else
668
+ __curand_matcopy(matrix, precalc_xorwow_matrix_host[PRECALC_NUM_MATRICES - 1], n);
669
+ __curand_matcopy(matrixA, precalc_xorwow_matrix_host[PRECALC_NUM_MATRICES - 1], n);
670
+ #endif
671
+ }
672
+ while(p) {
673
+ for(unsigned int t = 0; t < (p & SKIPAHEAD_MASK); t++) {
674
+ __curand_matvec(vector, matrixA, result, n);
675
+ __curand_veccopy(vector, result, n);
676
+ }
677
+ p >>= SKIPAHEAD_BLOCKSIZE;
678
+ if(p) {
679
+ for(int i = 0; i < SKIPAHEAD_BLOCKSIZE; i++) {
680
+ __curand_matmat(matrix, matrixA, n);
681
+ __curand_matcopy(matrixA, matrix, n);
682
+ }
683
+ }
684
+ }
685
+ for(int i = 0; i < n; i++) {
686
+ state->v[i] = vector[i];
687
+ }
688
+ /* No update of state->d needed, guaranteed to be a multiple of 2^32 */
689
+ }
690
+
691
+ template <typename T, int N>
692
+ QUALIFIERS void _skipahead_inplace(const unsigned long long x, T *state)
693
+ {
694
+ unsigned long long p = x;
695
+ int matrix_num = 0;
696
+ while(p) {
697
+ for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
698
+ #ifdef __CUDA_ARCH__
699
+ __curand_matvec_inplace<N>(state->v, precalc_xorwow_offset_matrix[matrix_num]);
700
+ #else
701
+ __curand_matvec_inplace<N>(state->v, precalc_xorwow_offset_matrix_host[matrix_num]);
702
+ #endif
703
+ }
704
+ p >>= PRECALC_BLOCK_SIZE;
705
+ matrix_num++;
706
+ }
707
+ state->d += 362437 * (unsigned int)x;
708
+ }
709
+
710
+ template <typename T, int N>
711
+ QUALIFIERS void _skipahead_sequence_inplace(unsigned long long x, T *state)
712
+ {
713
+ int matrix_num = 0;
714
+ while(x) {
715
+ for(unsigned int t = 0; t < (x & PRECALC_BLOCK_MASK); t++) {
716
+ #ifdef __CUDA_ARCH__
717
+ __curand_matvec_inplace<N>(state->v, precalc_xorwow_matrix[matrix_num]);
718
+ #else
719
+ __curand_matvec_inplace<N>(state->v, precalc_xorwow_matrix_host[matrix_num]);
720
+ #endif
721
+ }
722
+ x >>= PRECALC_BLOCK_SIZE;
723
+ matrix_num++;
724
+ }
725
+ /* No update of state->d needed, guaranteed to be a multiple of 2^32 */
726
+ }
727
+
728
+ /**
729
+ * \brief Update XORWOW state to skip \p n elements.
730
+ *
731
+ * Update the XORWOW state in \p state to skip ahead \p n elements.
732
+ *
733
+ * All values of \p n are valid. Large values require more computation and so
734
+ * will take more time to complete.
735
+ *
736
+ * \param n - Number of elements to skip
737
+ * \param state - Pointer to state to update
738
+ */
739
+ QUALIFIERS void skipahead(unsigned long long n, curandStateXORWOW_t *state)
740
+ {
741
+ _skipahead_inplace<curandStateXORWOW_t, 5>(n, state);
742
+ }
743
+
744
+ /**
745
+ * \brief Update XORWOW state to skip ahead \p n subsequences.
746
+ *
747
+ * Update the XORWOW state in \p state to skip ahead \p n subsequences. Each
748
+ * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
749
+ * \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly * n elements.
750
+ *
751
+ * All values of \p n are valid. Large values require more computation and so
752
+ * will take more time to complete.
753
+ *
754
+ * \param n - Number of subsequences to skip
755
+ * \param state - Pointer to state to update
756
+ */
757
+ QUALIFIERS void skipahead_sequence(unsigned long long n, curandStateXORWOW_t *state)
758
+ {
759
+ _skipahead_sequence_inplace<curandStateXORWOW_t, 5>(n, state);
760
+ }
761
+
762
+ QUALIFIERS void _curand_init_scratch(unsigned long long seed,
763
+ unsigned long long subsequence,
764
+ unsigned long long offset,
765
+ curandStateXORWOW_t *state,
766
+ unsigned int *scratch)
767
+ {
768
+ // Break up seed, apply salt
769
+ // Constants are arbitrary nonzero values
770
+ unsigned int s0 = ((unsigned int)seed) ^ 0xaad26b49UL;
771
+ unsigned int s1 = (unsigned int)(seed >> 32) ^ 0xf7dcefddUL;
772
+ // Simple multiplication to mix up bits
773
+ // Constants are arbitrary odd values
774
+ unsigned int t0 = 1099087573UL * s0;
775
+ unsigned int t1 = 2591861531UL * s1;
776
+ state->d = 6615241 + t1 + t0;
777
+ state->v[0] = 123456789UL + t0;
778
+ state->v[1] = 362436069UL ^ t0;
779
+ state->v[2] = 521288629UL + t1;
780
+ state->v[3] = 88675123UL ^ t1;
781
+ state->v[4] = 5783321UL + t0;
782
+ _skipahead_sequence_scratch<curandStateXORWOW_t, 5>(subsequence, state, scratch);
783
+ _skipahead_scratch<curandStateXORWOW_t, 5>(offset, state, scratch);
784
+ state->boxmuller_flag = 0;
785
+ state->boxmuller_flag_double = 0;
786
+ state->boxmuller_extra = 0.f;
787
+ state->boxmuller_extra_double = 0.;
788
+ }
789
+
790
+ QUALIFIERS void _curand_init_inplace(unsigned long long seed,
791
+ unsigned long long subsequence,
792
+ unsigned long long offset,
793
+ curandStateXORWOW_t *state)
794
+ {
795
+ // Break up seed, apply salt
796
+ // Constants are arbitrary nonzero values
797
+ unsigned int s0 = ((unsigned int)seed) ^ 0xaad26b49UL;
798
+ unsigned int s1 = (unsigned int)(seed >> 32) ^ 0xf7dcefddUL;
799
+ // Simple multiplication to mix up bits
800
+ // Constants are arbitrary odd values
801
+ unsigned int t0 = 1099087573UL * s0;
802
+ unsigned int t1 = 2591861531UL * s1;
803
+ state->d = 6615241 + t1 + t0;
804
+ state->v[0] = 123456789UL + t0;
805
+ state->v[1] = 362436069UL ^ t0;
806
+ state->v[2] = 521288629UL + t1;
807
+ state->v[3] = 88675123UL ^ t1;
808
+ state->v[4] = 5783321UL + t0;
809
+ _skipahead_sequence_inplace<curandStateXORWOW_t, 5>(subsequence, state);
810
+ _skipahead_inplace<curandStateXORWOW_t, 5>(offset, state);
811
+ state->boxmuller_flag = 0;
812
+ state->boxmuller_flag_double = 0;
813
+ state->boxmuller_extra = 0.f;
814
+ state->boxmuller_extra_double = 0.;
815
+ }
816
+
817
+ /**
818
+ * \brief Initialize XORWOW state.
819
+ *
820
+ * Initialize XORWOW state in \p state with the given \p seed, \p subsequence,
821
+ * and \p offset.
822
+ *
823
+ * All input values of \p seed, \p subsequence, and \p offset are legal. Large
824
+ * values for \p subsequence and \p offset require more computation and so will
825
+ * take more time to complete.
826
+ *
827
+ * A value of 0 for \p seed sets the state to the values of the original
828
+ * published version of the \p xorwow algorithm.
829
+ *
830
+ * \param seed - Arbitrary bits to use as a seed
831
+ * \param subsequence - Subsequence to start at
832
+ * \param offset - Absolute offset into sequence
833
+ * \param state - Pointer to state to initialize
834
+ */
835
+ QUALIFIERS void curand_init(unsigned long long seed,
836
+ unsigned long long subsequence,
837
+ unsigned long long offset,
838
+ curandStateXORWOW_t *state)
839
+ {
840
+ _curand_init_inplace(seed, subsequence, offset, state);
841
+ }
842
+
843
+ /**
844
+ * \brief Return 32-bits of pseudorandomness from an XORWOW generator.
845
+ *
846
+ * Return 32-bits of pseudorandomness from the XORWOW generator in \p state,
847
+ * increment position of generator by one.
848
+ *
849
+ * \param state - Pointer to state to update
850
+ *
851
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
852
+ */
853
+ QUALIFIERS unsigned int curand(curandStateXORWOW_t *state)
854
+ {
855
+ unsigned int t;
856
+ t = (state->v[0] ^ (state->v[0] >> 2));
857
+ state->v[0] = state->v[1];
858
+ state->v[1] = state->v[2];
859
+ state->v[2] = state->v[3];
860
+ state->v[3] = state->v[4];
861
+ state->v[4] = (state->v[4] ^ (state->v[4] <<4)) ^ (t ^ (t << 1));
862
+ state->d += 362437;
863
+ return state->v[4] + state->d;
864
+ }
865
+
866
+
867
+ /**
868
+ * \brief Return 32-bits of pseudorandomness from an Philox4_32_10 generator.
869
+ *
870
+ * Return 32-bits of pseudorandomness from the Philox4_32_10 generator in \p state,
871
+ * increment position of generator by one.
872
+ *
873
+ * \param state - Pointer to state to update
874
+ *
875
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
876
+ */
877
+
878
+ QUALIFIERS unsigned int curand(curandStatePhilox4_32_10_t *state)
879
+ {
880
+ // Maintain the invariant: output[STATE] is always "good" and
881
+ // is the next value to be returned by curand.
882
+ unsigned int ret;
883
+ switch(state->STATE++){
884
+ default:
885
+ ret = state->output.x;
886
+ break;
887
+ case 1:
888
+ ret = state->output.y;
889
+ break;
890
+ case 2:
891
+ ret = state->output.z;
892
+ break;
893
+ case 3:
894
+ ret = state->output.w;
895
+ break;
896
+ }
897
+ if(state->STATE == 4){
898
+ Philox_State_Incr(state);
899
+ state->output = curand_Philox4x32_10(state->ctr,state->key);
900
+ state->STATE = 0;
901
+ }
902
+ return ret;
903
+ }
904
+
905
+ /**
906
+ * \brief Return tuple of 4 32-bit pseudorandoms from a Philox4_32_10 generator.
907
+ *
908
+ * Return 128 bits of pseudorandomness from the Philox4_32_10 generator in \p state,
909
+ * increment position of generator by four.
910
+ *
911
+ * \param state - Pointer to state to update
912
+ *
913
+ * \return 128-bits of pseudorandomness as a uint4, all bits valid to use.
914
+ */
915
+
916
+ QUALIFIERS uint4 curand4(curandStatePhilox4_32_10_t *state)
917
+ {
918
+ uint4 r;
919
+
920
+ uint4 tmp = state->output;
921
+ Philox_State_Incr(state);
922
+ state->output= curand_Philox4x32_10(state->ctr,state->key);
923
+ switch(state->STATE){
924
+ case 0:
925
+ return tmp;
926
+ case 1:
927
+ r.x = tmp.y;
928
+ r.y = tmp.z;
929
+ r.z = tmp.w;
930
+ r.w = state->output.x;
931
+ break;
932
+ case 2:
933
+ r.x = tmp.z;
934
+ r.y = tmp.w;
935
+ r.z = state->output.x;
936
+ r.w = state->output.y;
937
+ break;
938
+ case 3:
939
+ r.x = tmp.w;
940
+ r.y = state->output.x;
941
+ r.z = state->output.y;
942
+ r.w = state->output.z;
943
+ break;
944
+ default:
945
+ // NOT possible but needed to avoid compiler warnings
946
+ return tmp;
947
+ }
948
+ return r;
949
+ }
950
+
951
+ /**
952
+ * \brief Update Philox4_32_10 state to skip \p n elements.
953
+ *
954
+ * Update the Philox4_32_10 state in \p state to skip ahead \p n elements.
955
+ *
956
+ * All values of \p n are valid.
957
+ *
958
+ * \param n - Number of elements to skip
959
+ * \param state - Pointer to state to update
960
+ */
961
+ QUALIFIERS void skipahead(unsigned long long n, curandStatePhilox4_32_10_t *state)
962
+ {
963
+ state->STATE += (n & 3);
964
+ n /= 4;
965
+ if( state->STATE > 3 ){
966
+ n += 1;
967
+ state->STATE -= 4;
968
+ }
969
+ Philox_State_Incr(state, n);
970
+ state->output = curand_Philox4x32_10(state->ctr,state->key);
971
+ }
972
+
973
+ /**
974
+ * \brief Update Philox4_32_10 state to skip ahead \p n subsequences.
975
+ *
976
+ * Update the Philox4_32_10 state in \p state to skip ahead \p n subsequences. Each
977
+ * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
978
+ * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly * n elements.
979
+ *
980
+ * All values of \p n are valid.
981
+ *
982
+ * \param n - Number of subsequences to skip
983
+ * \param state - Pointer to state to update
984
+ */
985
+ QUALIFIERS void skipahead_sequence(unsigned long long n, curandStatePhilox4_32_10_t *state)
986
+ {
987
+ Philox_State_Incr_hi(state, n);
988
+ state->output = curand_Philox4x32_10(state->ctr,state->key);
989
+ }
990
+
991
+ /**
992
+ * \brief Initialize Philox4_32_10 state.
993
+ *
994
+ * Initialize Philox4_32_10 state in \p state with the given \p seed, p\ subsequence,
995
+ * and \p offset.
996
+ *
997
+ * All input values for \p seed, \p subseqence and \p offset are legal. Each of the
998
+ * \xmlonly<ph outputclass="xmlonly">2<sup>64</sup></ph>\endxmlonly possible
999
+ * values of seed selects an independent sequence of length
1000
+ * \xmlonly<ph outputclass="xmlonly">2<sup>130</sup></ph>\endxmlonly.
1001
+ * The first
1002
+ * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup> * subsequence + offset</ph>\endxmlonly.
1003
+ * values of the sequence are skipped.
1004
+ * I.e., subsequences are of length
1005
+ * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly.
1006
+ *
1007
+ * \param seed - Arbitrary bits to use as a seed
1008
+ * \param subsequence - Subsequence to start at
1009
+ * \param offset - Absolute offset into subsequence
1010
+ * \param state - Pointer to state to initialize
1011
+ */
1012
+ QUALIFIERS void curand_init(unsigned long long seed,
1013
+ unsigned long long subsequence,
1014
+ unsigned long long offset,
1015
+ curandStatePhilox4_32_10_t *state)
1016
+ {
1017
+ state->ctr = make_uint4(0, 0, 0, 0);
1018
+ state->key.x = (unsigned int)seed;
1019
+ state->key.y = (unsigned int)(seed>>32);
1020
+ state->STATE = 0;
1021
+ state->boxmuller_flag = 0;
1022
+ state->boxmuller_flag_double = 0;
1023
+ state->boxmuller_extra = 0.f;
1024
+ state->boxmuller_extra_double = 0.;
1025
+ skipahead_sequence(subsequence, state);
1026
+ skipahead(offset, state);
1027
+ }
1028
+
1029
+
1030
+ /* MRG32k3a RNG */
1031
+
1032
+ /* Base generator for MRG32k3a */
1033
+ #if __CUDA_ARCH__ > 600
1034
+ QUALIFIERS unsigned long long __curand_umad(unsigned int a, unsigned int b, unsigned long long c)
1035
+ {
1036
+ unsigned long long r;
1037
+ asm("mad.wide.u32 %0, %1, %2, %3;"
1038
+ : "=l"(r) : "r"(a), "r"(b), "l"(c));
1039
+ return r;
1040
+ }
1041
+ QUALIFIERS unsigned long long __curand_umul(unsigned int a, unsigned int b)
1042
+ {
1043
+ unsigned long long r;
1044
+ asm("mul.wide.u32 %0, %1, %2;"
1045
+ : "=l"(r) : "r"(a), "r"(b));
1046
+ return r;
1047
+ }
1048
+
1049
+ QUALIFIERS double curand_MRG32k3a (curandStateMRG32k3a_t *state)
1050
+ {
1051
+ const unsigned int m1 = 4294967087u;
1052
+ const unsigned int m2 = 4294944443u;
1053
+ const unsigned int m1c = 209u;
1054
+ const unsigned int m2c = 22853u;
1055
+ const unsigned int a12 = 1403580u;
1056
+ const unsigned int a13n = 810728u;
1057
+ const unsigned int a21 = 527612u;
1058
+ const unsigned int a23n = 1370589u;
1059
+
1060
+ unsigned long long p1, p2;
1061
+ const unsigned long long p3 = __curand_umul(a13n, m1 - state->s1[0]);
1062
+ p1 = __curand_umad(a12, state->s1[1], p3);
1063
+
1064
+ // Putting addition inside and changing umul to umad
1065
+ // slowed this function down on GV100
1066
+ p1 = __curand_umul(p1 >> 32, m1c) + (p1 & 0xffffffff);
1067
+ if (p1 >= m1) p1 -= m1;
1068
+
1069
+ state->s1[0] = state->s1[1]; state->s1[1] = state->s1[2]; state->s1[2] = p1;
1070
+ const unsigned long long p4 = __curand_umul(a23n, m2 - state->s2[0]);
1071
+ p2 = __curand_umad(a21, state->s2[2], p4);
1072
+
1073
+ // Putting addition inside and changing umul to umad
1074
+ // slowed this function down on GV100
1075
+ p2 = __curand_umul(p2 >> 32, m2c) + (p2 & 0xffffffff);
1076
+ p2 = __curand_umul(p2 >> 32, m2c) + (p2 & 0xffffffff);
1077
+ if (p2 >= m2) p2 -= m2;
1078
+
1079
+ state->s2[0] = state->s2[1]; state->s2[1] = state->s2[2]; state->s2[2] = p2;
1080
+
1081
+ const unsigned int p5 = (unsigned int)p1 - (unsigned int)p2;
1082
+ if(p1 <= p2) return p5 + m1;
1083
+ return p5;
1084
+ }
1085
+ #elif __CUDA_ARCH__ > 0
1086
+ /* nj's implementation */
1087
+ QUALIFIERS double curand_MRG32k3a (curandStateMRG32k3a_t *state)
1088
+ {
1089
+ const double m1 = 4294967087.;
1090
+ const double m2 = 4294944443.;
1091
+ const double a12 = 1403580.;
1092
+ const double a13n = 810728.;
1093
+ const double a21 = 527612.;
1094
+ const double a23n = 1370589.;
1095
+
1096
+ const double rh1 = 2.3283065498378290e-010; /* (1.0 / m1)__hi */
1097
+ const double rl1 = -1.7354913086174288e-026; /* (1.0 / m1)__lo */
1098
+ const double rh2 = 2.3283188252407387e-010; /* (1.0 / m2)__hi */
1099
+ const double rl2 = 2.4081018096503646e-026; /* (1.0 / m2)__lo */
1100
+
1101
+ double q, p1, p2;
1102
+ p1 = a12 * state->s1[1] - a13n * state->s1[0];
1103
+ q = trunc (fma (p1, rh1, p1 * rl1));
1104
+ p1 -= q * m1;
1105
+ if (p1 < 0.0) p1 += m1;
1106
+ state->s1[0] = state->s1[1]; state->s1[1] = state->s1[2]; state->s1[2] = (unsigned int)p1;
1107
+ p2 = a21 * state->s2[2] - a23n * state->s2[0];
1108
+ q = trunc (fma (p2, rh2, p2 * rl2));
1109
+ p2 -= q * m2;
1110
+ if (p2 < 0.0) p2 += m2;
1111
+ state->s2[0] = state->s2[1]; state->s2[1] = state->s2[2]; state->s2[2] = (unsigned int)p2;
1112
+ if (p1 <= p2) return (p1 - p2 + m1);
1113
+ else return (p1 - p2);
1114
+ }
1115
+ /* end nj's implementation */
1116
+ #else
1117
+ QUALIFIERS double curand_MRG32k3a(curandStateMRG32k3a_t *state)
1118
+ {
1119
+ double p1,p2,r;
1120
+ p1 = (MRG32K3A_A12 * state->s1[1]) - (MRG32K3A_A13N * state->s1[0]);
1121
+ p1 = curand_MRGmod(p1, MRG32K3A_MOD1);
1122
+ if (p1 < 0.0) p1 += MRG32K3A_MOD1;
1123
+ state->s1[0] = state->s1[1];
1124
+ state->s1[1] = state->s1[2];
1125
+ state->s1[2] = (unsigned int)p1;
1126
+ p2 = (MRG32K3A_A21 * state->s2[2]) - (MRG32K3A_A23N * state->s2[0]);
1127
+ p2 = curand_MRGmod(p2, MRG32K3A_MOD2);
1128
+ if (p2 < 0) p2 += MRG32K3A_MOD2;
1129
+ state->s2[0] = state->s2[1];
1130
+ state->s2[1] = state->s2[2];
1131
+ state->s2[2] = (unsigned int)p2;
1132
+ r = p1 - p2;
1133
+ if (r <= 0) r += MRG32K3A_MOD1;
1134
+ return r;
1135
+ }
1136
+ #endif
1137
+
1138
+
1139
+ /**
1140
+ * \brief Return 32-bits of pseudorandomness from an MRG32k3a generator.
1141
+ *
1142
+ * Return 32-bits of pseudorandomness from the MRG32k3a generator in \p state,
1143
+ * increment position of generator by one.
1144
+ *
1145
+ * \param state - Pointer to state to update
1146
+ *
1147
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
1148
+ */
1149
+ QUALIFIERS unsigned int curand(curandStateMRG32k3a_t *state)
1150
+ {
1151
+ double dRet;
1152
+ dRet = (double)curand_MRG32k3a(state)*(double)MRG32K3A_BITS_NORM;
1153
+ return (unsigned int)dRet;
1154
+ }
1155
+
1156
+
1157
+
1158
+ /**
1159
+ * \brief Update MRG32k3a state to skip \p n elements.
1160
+ *
1161
+ * Update the MRG32k3a state in \p state to skip ahead \p n elements.
1162
+ *
1163
+ * All values of \p n are valid. Large values require more computation and so
1164
+ * will take more time to complete.
1165
+ *
1166
+ * \param n - Number of elements to skip
1167
+ * \param state - Pointer to state to update
1168
+ */
1169
+ QUALIFIERS void skipahead(unsigned long long n, curandStateMRG32k3a_t *state)
1170
+ {
1171
+ unsigned int t[3][3];
1172
+ #ifdef __CUDA_ARCH__
1173
+ curand_MRGmatPow3x3( mrg32k3aM1, t, MRG32K3A_MOD1, n);
1174
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1175
+ curand_MRGmatPow3x3(mrg32k3aM2, t, MRG32K3A_MOD2, n);
1176
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1177
+ #else
1178
+ curand_MRGmatPow3x3( mrg32k3aM1Host, t, MRG32K3A_MOD1, n);
1179
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1180
+ curand_MRGmatPow3x3(mrg32k3aM2Host, t, MRG32K3A_MOD2, n);
1181
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1182
+ #endif
1183
+ }
1184
+
1185
+ /**
1186
+ * \brief Update MRG32k3a state to skip ahead \p n subsequences.
1187
+ *
1188
+ * Update the MRG32k3a state in \p state to skip ahead \p n subsequences. Each
1189
+ * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly
1190
+ *
1191
+ * \xmlonly<ph outputclass="xmlonly">2<sup>76</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
1192
+ * \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly * n elements.
1193
+ *
1194
+ * Valid values of \p n are 0 to \xmlonly<ph outputclass="xmlonly">2<sup>51</sup></ph>\endxmlonly. Note \p n will be masked to 51 bits
1195
+ *
1196
+ * \param n - Number of subsequences to skip
1197
+ * \param state - Pointer to state to update
1198
+ */
1199
+ QUALIFIERS void skipahead_subsequence(unsigned long long n, curandStateMRG32k3a_t *state)
1200
+ {
1201
+ unsigned int t[3][3];
1202
+ #ifdef __CUDA_ARCH__
1203
+ curand_MRGmatPow3x3( mrg32k3aM1SubSeq, t, MRG32K3A_MOD1, n);
1204
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1205
+ curand_MRGmatPow3x3( mrg32k3aM2SubSeq, t, MRG32K3A_MOD2, n);
1206
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1207
+ #else
1208
+ curand_MRGmatPow3x3( mrg32k3aM1SubSeqHost, t, MRG32K3A_MOD1, n);
1209
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1210
+ curand_MRGmatPow3x3( mrg32k3aM2SubSeqHost, t, MRG32K3A_MOD2, n);
1211
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1212
+ #endif
1213
+ }
1214
+
1215
+ /**
1216
+ * \brief Update MRG32k3a state to skip ahead \p n sequences.
1217
+ *
1218
+ * Update the MRG32k3a state in \p state to skip ahead \p n sequences. Each
1219
+ * sequence is \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
1220
+ * \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly * n elements.
1221
+ *
1222
+ * All values of \p n are valid. Large values require more computation and so
1223
+ * will take more time to complete.
1224
+ *
1225
+ * \param n - Number of sequences to skip
1226
+ * \param state - Pointer to state to update
1227
+ */
1228
+ QUALIFIERS void skipahead_sequence(unsigned long long n, curandStateMRG32k3a_t *state)
1229
+ {
1230
+ unsigned int t[3][3];
1231
+ #ifdef __CUDA_ARCH__
1232
+ curand_MRGmatPow3x3( mrg32k3aM1Seq, t, MRG32K3A_MOD1, n);
1233
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1234
+ curand_MRGmatPow3x3( mrg32k3aM2Seq, t, MRG32K3A_MOD2, n);
1235
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1236
+ #else
1237
+ curand_MRGmatPow3x3( mrg32k3aM1SeqHost, t, MRG32K3A_MOD1, n);
1238
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1239
+ curand_MRGmatPow3x3( mrg32k3aM2SeqHost, t, MRG32K3A_MOD2, n);
1240
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1241
+ #endif
1242
+ }
1243
+
1244
+
1245
+ /**
1246
+ * \brief Initialize MRG32k3a state.
1247
+ *
1248
+ * Initialize MRG32k3a state in \p state with the given \p seed, \p subsequence,
1249
+ * and \p offset.
1250
+ *
1251
+ * All input values of \p seed, \p subsequence, and \p offset are legal.
1252
+ * \p subsequence will be truncated to 51 bits to avoid running into the next sequence
1253
+ *
1254
+ * A value of 0 for \p seed sets the state to the values of the original
1255
+ * published version of the \p MRG32k3a algorithm.
1256
+ *
1257
+ * \param seed - Arbitrary bits to use as a seed
1258
+ * \param subsequence - Subsequence to start at
1259
+ * \param offset - Absolute offset into sequence
1260
+ * \param state - Pointer to state to initialize
1261
+ */
1262
+ QUALIFIERS void curand_init(unsigned long long seed,
1263
+ unsigned long long subsequence,
1264
+ unsigned long long offset,
1265
+ curandStateMRG32k3a_t *state)
1266
+ {
1267
+ int i;
1268
+ for ( i=0; i<3; i++ ) {
1269
+ state->s1[i] = 12345u;
1270
+ state->s2[i] = 12345u;
1271
+ }
1272
+ if (seed != 0ull) {
1273
+ unsigned int x1 = ((unsigned int)seed) ^ 0x55555555UL;
1274
+ unsigned int x2 = (unsigned int)((seed >> 32) ^ 0xAAAAAAAAUL);
1275
+ state->s1[0] = (unsigned int)curand_MRGmodMul(x1, state->s1[0], MRG32K3A_MOD1);
1276
+ state->s1[1] = (unsigned int)curand_MRGmodMul(x2, state->s1[1], MRG32K3A_MOD1);
1277
+ state->s1[2] = (unsigned int)curand_MRGmodMul(x1, state->s1[2], MRG32K3A_MOD1);
1278
+ state->s2[0] = (unsigned int)curand_MRGmodMul(x2, state->s2[0], MRG32K3A_MOD2);
1279
+ state->s2[1] = (unsigned int)curand_MRGmodMul(x1, state->s2[1], MRG32K3A_MOD2);
1280
+ state->s2[2] = (unsigned int)curand_MRGmodMul(x2, state->s2[2], MRG32K3A_MOD2);
1281
+ }
1282
+ skipahead_subsequence( subsequence, state );
1283
+ skipahead( offset, state );
1284
+ state->boxmuller_flag = 0;
1285
+ state->boxmuller_flag_double = 0;
1286
+ state->boxmuller_extra = 0.f;
1287
+ state->boxmuller_extra_double = 0.;
1288
+ }
1289
+
1290
+ /**
1291
+ * \brief Update Sobol32 state to skip \p n elements.
1292
+ *
1293
+ * Update the Sobol32 state in \p state to skip ahead \p n elements.
1294
+ *
1295
+ * All values of \p n are valid.
1296
+ *
1297
+ * \param n - Number of elements to skip
1298
+ * \param state - Pointer to state to update
1299
+ */
1300
+ template <typename T>
1301
+ QUALIFIERS
1302
+ typename CURAND_STD::enable_if<CURAND_STD::is_same<curandStateSobol32_t*, T>::value || CURAND_STD::is_same<curandStateScrambledSobol32_t*, T>::value>::type
1303
+ skipahead(unsigned int n, T state)
1304
+ {
1305
+ unsigned int i_gray;
1306
+ state->x = state->c;
1307
+ state->i += n;
1308
+ /* Convert state->i to gray code */
1309
+ i_gray = state->i ^ (state->i >> 1);
1310
+ for(unsigned int k = 0; k < 32; k++) {
1311
+ if(i_gray & (1 << k)) {
1312
+ state->x ^= state->direction_vectors[k];
1313
+ }
1314
+ }
1315
+ return;
1316
+ }
1317
+
1318
+ /**
1319
+ * \brief Update Sobol64 state to skip \p n elements.
1320
+ *
1321
+ * Update the Sobol64 state in \p state to skip ahead \p n elements.
1322
+ *
1323
+ * All values of \p n are valid.
1324
+ *
1325
+ * \param n - Number of elements to skip
1326
+ * \param state - Pointer to state to update
1327
+ */
1328
+ template <typename T>
1329
+ QUALIFIERS
1330
+ typename CURAND_STD::enable_if<CURAND_STD::is_same<curandStateSobol64_t*, T>::value || CURAND_STD::is_same<curandStateScrambledSobol64_t*, T>::value>::type
1331
+ skipahead(unsigned long long n, T state)
1332
+ {
1333
+ unsigned long long i_gray;
1334
+ state->x = state->c;
1335
+ state->i += n;
1336
+ /* Convert state->i to gray code */
1337
+ i_gray = state->i ^ (state->i >> 1);
1338
+ for(unsigned k = 0; k < 64; k++) {
1339
+ if(i_gray & (1ULL << k)) {
1340
+ state->x ^= state->direction_vectors[k];
1341
+ }
1342
+ }
1343
+ return;
1344
+ }
1345
+
1346
+ /**
1347
+ * \brief Initialize Sobol32 state.
1348
+ *
1349
+ * Initialize Sobol32 state in \p state with the given \p direction \p vectors and
1350
+ * \p offset.
1351
+ *
1352
+ * The direction vector is a device pointer to an array of 32 unsigned ints.
1353
+ * All input values of \p offset are legal.
1354
+ *
1355
+ * \param direction_vectors - Pointer to array of 32 unsigned ints representing the
1356
+ * direction vectors for the desired dimension
1357
+ * \param offset - Absolute offset into sequence
1358
+ * \param state - Pointer to state to initialize
1359
+ */
1360
+ QUALIFIERS void curand_init(curandDirectionVectors32_t direction_vectors,
1361
+ unsigned int offset,
1362
+ curandStateSobol32_t *state)
1363
+ {
1364
+ state->i = 0;
1365
+ state->c = 0;
1366
+ for(int i = 0; i < 32; i++) {
1367
+ state->direction_vectors[i] = direction_vectors[i];
1368
+ }
1369
+ state->x = 0;
1370
+ skipahead<curandStateSobol32_t *>(offset, state);
1371
+ }
1372
+ /**
1373
+ * \brief Initialize Scrambled Sobol32 state.
1374
+ *
1375
+ * Initialize Sobol32 state in \p state with the given \p direction \p vectors and
1376
+ * \p offset.
1377
+ *
1378
+ * The direction vector is a device pointer to an array of 32 unsigned ints.
1379
+ * All input values of \p offset are legal.
1380
+ *
1381
+ * \param direction_vectors - Pointer to array of 32 unsigned ints representing the
1382
+ direction vectors for the desired dimension
1383
+ * \param scramble_c Scramble constant
1384
+ * \param offset - Absolute offset into sequence
1385
+ * \param state - Pointer to state to initialize
1386
+ */
1387
+ QUALIFIERS void curand_init(curandDirectionVectors32_t direction_vectors,
1388
+ unsigned int scramble_c,
1389
+ unsigned int offset,
1390
+ curandStateScrambledSobol32_t *state)
1391
+ {
1392
+ state->i = 0;
1393
+ state->c = scramble_c;
1394
+ for(int i = 0; i < 32; i++) {
1395
+ state->direction_vectors[i] = direction_vectors[i];
1396
+ }
1397
+ state->x = state->c;
1398
+ skipahead<curandStateScrambledSobol32_t *>(offset, state);
1399
+ }
1400
+
1401
+ QUALIFIERS int __curand_find_trailing_zero(unsigned int x)
1402
+ {
1403
+ #if __CUDA_ARCH__ > 0
1404
+ int y = __ffs(~x);
1405
+ if(y)
1406
+ return y - 1;
1407
+ return 31;
1408
+ #else
1409
+ int i = 1;
1410
+ while(x & 1) {
1411
+ i++;
1412
+ x >>= 1;
1413
+ }
1414
+ i = i - 1;
1415
+ return i == 32 ? 31 : i;
1416
+ #endif
1417
+ }
1418
+
1419
+ QUALIFIERS int __curand_find_trailing_zero(unsigned long long x)
1420
+ {
1421
+ #if __CUDA_ARCH__ > 0
1422
+ int y = __ffsll(~x);
1423
+ if(y)
1424
+ return y - 1;
1425
+ return 63;
1426
+ #else
1427
+ int i = 1;
1428
+ while(x & 1) {
1429
+ i++;
1430
+ x >>= 1;
1431
+ }
1432
+ i = i - 1;
1433
+ return i == 64 ? 63 : i;
1434
+ #endif
1435
+ }
1436
+
1437
+ /**
1438
+ * \brief Initialize Sobol64 state.
1439
+ *
1440
+ * Initialize Sobol64 state in \p state with the given \p direction \p vectors and
1441
+ * \p offset.
1442
+ *
1443
+ * The direction vector is a device pointer to an array of 64 unsigned long longs.
1444
+ * All input values of \p offset are legal.
1445
+ *
1446
+ * \param direction_vectors - Pointer to array of 64 unsigned long longs representing the
1447
+ direction vectors for the desired dimension
1448
+ * \param offset - Absolute offset into sequence
1449
+ * \param state - Pointer to state to initialize
1450
+ */
1451
+ QUALIFIERS void curand_init(curandDirectionVectors64_t direction_vectors,
1452
+ unsigned long long offset,
1453
+ curandStateSobol64_t *state)
1454
+ {
1455
+ state->i = 0;
1456
+ state->c = 0;
1457
+ for(int i = 0; i < 64; i++) {
1458
+ state->direction_vectors[i] = direction_vectors[i];
1459
+ }
1460
+ state->x = 0;
1461
+ skipahead<curandStateSobol64_t *>(offset, state);
1462
+ }
1463
+
1464
+ /**
1465
+ * \brief Initialize Scrambled Sobol64 state.
1466
+ *
1467
+ * Initialize Sobol64 state in \p state with the given \p direction \p vectors and
1468
+ * \p offset.
1469
+ *
1470
+ * The direction vector is a device pointer to an array of 64 unsigned long longs.
1471
+ * All input values of \p offset are legal.
1472
+ *
1473
+ * \param direction_vectors - Pointer to array of 64 unsigned long longs representing the
1474
+ direction vectors for the desired dimension
1475
+ * \param scramble_c Scramble constant
1476
+ * \param offset - Absolute offset into sequence
1477
+ * \param state - Pointer to state to initialize
1478
+ */
1479
+ QUALIFIERS void curand_init(curandDirectionVectors64_t direction_vectors,
1480
+ unsigned long long scramble_c,
1481
+ unsigned long long offset,
1482
+ curandStateScrambledSobol64_t *state)
1483
+ {
1484
+ state->i = 0;
1485
+ state->c = scramble_c;
1486
+ for(int i = 0; i < 64; i++) {
1487
+ state->direction_vectors[i] = direction_vectors[i];
1488
+ }
1489
+ state->x = state->c;
1490
+ skipahead<curandStateScrambledSobol64_t *>(offset, state);
1491
+ }
1492
+
1493
+ /**
1494
+ * \brief Return 32-bits of quasirandomness from a Sobol32 generator.
1495
+ *
1496
+ * Return 32-bits of quasirandomness from the Sobol32 generator in \p state,
1497
+ * increment position of generator by one.
1498
+ *
1499
+ * \param state - Pointer to state to update
1500
+ *
1501
+ * \return 32-bits of quasirandomness as an unsigned int, all bits valid to use.
1502
+ */
1503
+
1504
+ QUALIFIERS unsigned int curand(curandStateSobol32_t * state)
1505
+ {
1506
+ /* Moving from i to i+1 element in gray code is flipping one bit,
1507
+ the trailing zero bit of i
1508
+ */
1509
+ unsigned int res = state->x;
1510
+ state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
1511
+ state->i ++;
1512
+ return res;
1513
+ }
1514
+
1515
+ /**
1516
+ * \brief Return 32-bits of quasirandomness from a scrambled Sobol32 generator.
1517
+ *
1518
+ * Return 32-bits of quasirandomness from the scrambled Sobol32 generator in \p state,
1519
+ * increment position of generator by one.
1520
+ *
1521
+ * \param state - Pointer to state to update
1522
+ *
1523
+ * \return 32-bits of quasirandomness as an unsigned int, all bits valid to use.
1524
+ */
1525
+
1526
+ QUALIFIERS unsigned int curand(curandStateScrambledSobol32_t * state)
1527
+ {
1528
+ /* Moving from i to i+1 element in gray code is flipping one bit,
1529
+ the trailing zero bit of i
1530
+ */
1531
+ unsigned int res = state->x;
1532
+ state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
1533
+ state->i ++;
1534
+ return res;
1535
+ }
1536
+
1537
+ /**
1538
+ * \brief Return 64-bits of quasirandomness from a Sobol64 generator.
1539
+ *
1540
+ * Return 64-bits of quasirandomness from the Sobol64 generator in \p state,
1541
+ * increment position of generator by one.
1542
+ *
1543
+ * \param state - Pointer to state to update
1544
+ *
1545
+ * \return 64-bits of quasirandomness as an unsigned long long, all bits valid to use.
1546
+ */
1547
+
1548
+ QUALIFIERS unsigned long long curand(curandStateSobol64_t * state)
1549
+ {
1550
+ /* Moving from i to i+1 element in gray code is flipping one bit,
1551
+ the trailing zero bit of i
1552
+ */
1553
+ unsigned long long res = state->x;
1554
+ state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
1555
+ state->i ++;
1556
+ return res;
1557
+ }
1558
+
1559
+ /**
1560
+ * \brief Return 64-bits of quasirandomness from a scrambled Sobol64 generator.
1561
+ *
1562
+ * Return 64-bits of quasirandomness from the scrambled Sobol32 generator in \p state,
1563
+ * increment position of generator by one.
1564
+ *
1565
+ * \param state - Pointer to state to update
1566
+ *
1567
+ * \return 64-bits of quasirandomness as an unsigned long long, all bits valid to use.
1568
+ */
1569
+
1570
+ QUALIFIERS unsigned long long curand(curandStateScrambledSobol64_t * state)
1571
+ {
1572
+ /* Moving from i to i+1 element in gray code is flipping one bit,
1573
+ the trailing zero bit of i
1574
+ */
1575
+ unsigned long long res = state->x;
1576
+ state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
1577
+ state->i ++;
1578
+ return res;
1579
+ }
1580
+
1581
+ #include "curand_uniform.h"
1582
+ #include "curand_normal.h"
1583
+ #include "curand_lognormal.h"
1584
+ #include "curand_poisson.h"
1585
+ #include "curand_discrete2.h"
1586
+
1587
+ __device__ static inline unsigned int *__get_precalculated_matrix(int n)
1588
+ {
1589
+ if(n == 0) {
1590
+ return precalc_xorwow_matrix[n];
1591
+ }
1592
+ if(n == 2) {
1593
+ return precalc_xorwow_offset_matrix[n];
1594
+ }
1595
+ return precalc_xorwow_matrix[n];
1596
+ }
1597
+
1598
+ #ifndef __CUDACC_RTC__
1599
+ __host__ static inline unsigned int *__get_precalculated_matrix_host(int n)
1600
+ {
1601
+ if(n == 1) {
1602
+ return precalc_xorwow_matrix_host[n];
1603
+ }
1604
+ if(n == 3) {
1605
+ return precalc_xorwow_offset_matrix_host[n];
1606
+ }
1607
+ return precalc_xorwow_matrix_host[n];
1608
+ }
1609
+ #endif // #ifndef __CUDACC_RTC__
1610
+
1611
+ __device__ static inline unsigned int *__get_mrg32k3a_matrix(int n)
1612
+ {
1613
+ if(n == 0) {
1614
+ return mrg32k3aM1[n][0];
1615
+ }
1616
+ if(n == 2) {
1617
+ return mrg32k3aM2[n][0];
1618
+ }
1619
+ if(n == 4) {
1620
+ return mrg32k3aM1SubSeq[n][0];
1621
+ }
1622
+ if(n == 6) {
1623
+ return mrg32k3aM2SubSeq[n][0];
1624
+ }
1625
+ if(n == 8) {
1626
+ return mrg32k3aM1Seq[n][0];
1627
+ }
1628
+ if(n == 10) {
1629
+ return mrg32k3aM2Seq[n][0];
1630
+ }
1631
+ return mrg32k3aM1[n][0];
1632
+ }
1633
+
1634
+ #ifndef __CUDACC_RTC__
1635
+ __host__ static inline unsigned int *__get_mrg32k3a_matrix_host(int n)
1636
+ {
1637
+ if(n == 1) {
1638
+ return mrg32k3aM1Host[n][0];
1639
+ }
1640
+ if(n == 3) {
1641
+ return mrg32k3aM2Host[n][0];
1642
+ }
1643
+ if(n == 5) {
1644
+ return mrg32k3aM1SubSeqHost[n][0];
1645
+ }
1646
+ if(n == 7) {
1647
+ return mrg32k3aM2SubSeqHost[n][0];
1648
+ }
1649
+ if(n == 9) {
1650
+ return mrg32k3aM1SeqHost[n][0];
1651
+ }
1652
+ if(n == 11) {
1653
+ return mrg32k3aM2SeqHost[n][0];
1654
+ }
1655
+ return mrg32k3aM1Host[n][0];
1656
+ }
1657
+
1658
+ __host__ static inline double *__get__cr_lgamma_table_host(void) {
1659
+ return __cr_lgamma_table;
1660
+ }
1661
+ #endif // #ifndef __CUDACC_RTC__
1662
+
1663
+ /** @} */
1664
+
1665
+ #endif // !defined(CURAND_KERNEL_H_)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_host.h ADDED
@@ -0,0 +1,516 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * curand_mtgp32_host.h
52
+ *
53
+ *
54
+ * MTGP32-11213
55
+ *
56
+ * Mersenne Twister RNG for the GPU
57
+ *
58
+ * The period of generated integers is 2<sup>11213</sup>-1.
59
+ *
60
+ * This code generates 32-bit unsigned integers, and
61
+ * single precision floating point numbers uniformly distributed
62
+ * in the range [1, 2). (float r; 1.0 <= r < 2.0)
63
+ */
64
+
65
+ /*
66
+ * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
67
+ * University. All rights reserved.
68
+ * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
69
+ * University and University of Tokyo. All rights reserved.
70
+ *
71
+ * Redistribution and use in source and binary forms, with or without
72
+ * modification, are permitted provided that the following conditions are
73
+ * met:
74
+ *
75
+ * * Redistributions of source code must retain the above copyright
76
+ * notice, this list of conditions and the following disclaimer.
77
+ * * Redistributions in binary form must reproduce the above
78
+ * copyright notice, this list of conditions and the following
79
+ * disclaimer in the documentation and/or other materials provided
80
+ * with the distribution.
81
+ * * Neither the name of the Hiroshima University nor the names of
82
+ * its contributors may be used to endorse or promote products
83
+ * derived from this software without specific prior written
84
+ * permission.
85
+ *
86
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
87
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
88
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
89
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
90
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
91
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
92
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
93
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
94
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
95
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
96
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97
+ */
98
+ #if !defined CURAND_MTGP32_HOST_H
99
+ #define CURAND_MTGP32_HOST_H
100
+
101
+ #if !defined(QUALIFIERS)
102
+ #define QUALIFIERS static inline __device__
103
+ #endif
104
+
105
+ #include <cuda.h>
106
+ #include <stdlib.h>
107
+ #include <memory.h>
108
+ #include <string.h>
109
+ #include "curand.h"
110
+ #include "curand_mtgp32.h"
111
+ #include "curand_mtgp32dc_p_11213.h"
112
+
113
+
114
+ /**
115
+ * \addtogroup DEVICE Device API
116
+ *
117
+ * @{
118
+ */
119
+
120
+ static const unsigned int non_zero = 0x4d544750;
121
+
122
/*
 * First mixing function used during initialization by
 * mtgp32_init_by_array() and mtgp32_init_by_str().
 * @param[in] x 32-bit integer
 * @return 32-bit integer
 */
static __forceinline__ unsigned int ini_func1(unsigned int x) {
    const unsigned int folded = x ^ (x >> 27);
    return folded * 1664525u;
}
131
+
132
/*
 * Second mixing function used during initialization by
 * mtgp32_init_by_array() and mtgp32_init_by_str().
 * @param[in] x 32-bit integer
 * @return 32-bit integer
 */
static __forceinline__ unsigned int ini_func2(unsigned int x) {
    const unsigned int folded = x ^ (x >> 27);
    return folded * 1566083941u;
}
141
+
142
/*
 * Seed an MTGP32 internal state vector from a single 32-bit seed.
 *
 * Called from host-side setup code; the device generator uses a
 * different structure and allocation method. \b para should be one of
 * the elements in the parameter table (mtgp32-param-ref.c).
 *
 * @param[out] state MTGP internal status vector (para->mexp/32 + 1 words).
 * @param[in]  para  parameter structure.
 * @param[in]  seed  a 32-bit integer used as the seed.
 */
static __forceinline__ __host__
void mtgp32_init_state(unsigned int state[],
                       const mtgp32_params_fast_t *para, unsigned int seed) {
    const int word_count = para->mexp / 32 + 1;
    const unsigned int hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);

    /* Fold the parameter-derived value down to one byte and pre-fill the
       whole state with it, so distinct parameter sets diverge immediately. */
    unsigned int fill = hidden_seed;
    fill += fill >> 16;
    fill += fill >> 8;
    memset(state, fill & 0xff, sizeof(unsigned int) * word_count);

    state[0] = seed;
    state[1] = hidden_seed;
    /* Mix each word from its predecessor; XORed over the pre-filled bytes. */
    for (int idx = 1; idx < word_count; idx++) {
        state[idx] ^= (1812433253) * (state[idx - 1] ^ (state[idx - 1] >> 30)) + idx;
    }
}
173
+
174
+ /*
175
+ * This function initializes the internal state array
176
+ * with a 32-bit integer array. \b para should be one of the elements in
177
+ * the parameter table (mtgp32-param-ref.c).
178
+ *
179
+ * @param[out] mtgp32 MTGP structure.
180
+ * @param[in] para parameter structure
181
+ * @param[in] array a 32-bit integer array used as a seed.
182
+ * @param[in] length length of the array.
183
+ * @return CURAND_STATUS_SUCCESS
184
+ */
185
static __forceinline__ __host__
int mtgp32_init_by_array(unsigned int state[],
                         const mtgp32_params_fast_t *para,
                         unsigned int *array, int length) {
    int i, j, count;
    unsigned int r;
    int lag;
    int mid;
    int size = para->mexp / 32 + 1;  /* number of 32-bit state words */
    unsigned int hidden_seed;
    unsigned int tmp;

    /* Mixing lag chosen from the state size (larger states use a longer
       lag); mid is the second tap point. */
    if (size >= 623) {
        lag = 11;
    } else if (size >= 68) {
        lag = 7;
    } else if (size >= 39) {
        lag = 5;
    } else {
        lag = 3;
    }
    mid = (size - lag) / 2;

    /* Fold a parameter-set-specific value to one byte and pre-fill the
       whole state, so different parameter sets start from different bases. */
    hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
    tmp = hidden_seed;
    tmp += tmp >> 16;
    tmp += tmp >> 8;
    memset(state, tmp & 0xff, sizeof(unsigned int) * size);
    state[0] = hidden_seed;

    /* Run at least one full pass over the state, or one step per seed
       word, whichever is larger. */
    if (length + 1 > size) {
        count = length + 1;
    } else {
        count = size;
    }
    /* Bootstrap step mixing the three tap points into word 0. */
    r = ini_func1(state[0] ^ state[mid] ^ state[size - 1]);
    state[mid] += r;
    r += length;
    state[(mid + lag) % size] += r;
    state[0] = r;
    i = 1;
    count--;
    /* Phase 1: fold each caller-supplied seed word into the state. */
    for (i = 1, j = 0; (j < count) && (j < length); j++) {
        r = ini_func1(state[i] ^ state[(i + mid) % size]
                      ^ state[(i + size - 1) % size]);
        state[(i + mid) % size] += r;
        r += array[j] + i;
        state[(i + mid + lag) % size] += r;
        state[i] = r;
        i = (i + 1) % size;
    }
    /* Phase 2: keep stirring (no seed input) until `count` steps are done. */
    for (; j < count; j++) {
        r = ini_func1(state[i] ^ state[(i + mid) % size]
                      ^ state[(i + size - 1) % size]);
        state[(i + mid) % size] += r;
        r += i;
        state[(i + mid + lag) % size] += r;
        state[i] = r;
        i = (i + 1) % size;
    }
    /* Phase 3: one final full pass with the second mixing function,
       using XOR so every word is disturbed. */
    for (j = 0; j < size; j++) {
        r = ini_func2(state[i] + state[(i + mid) % size]
                      + state[(i + size - 1) % size]);
        state[(i + mid) % size] ^= r;
        r -= i;
        state[(i + mid + lag) % size] ^= r;
        state[i] = r;
        i = (i + 1) % size;
    }
    /* Guard against the degenerate all-zero state: force a non-zero word. */
    if (state[size - 1] == 0) {
        state[size - 1] = non_zero;
    }
    return 0;
}
259
+
260
+ /*
261
+ * This function initializes the internal state array
262
+ * with a character array. \b para should be one of the elements in
263
+ * the parameter table (mtgp32-param-ref.c).
264
+ * This is the same algorithm with mtgp32_init_by_array(), but hope to
265
+ * be more useful.
266
+ *
267
+ * @param[out] mtgp32 MTGP structure.
268
+ * @param[in] para parameter structure
269
+ * @param[in] array a character array used as a seed. (terminated by zero.)
270
+ * @return memory allocation result. if 0 then O.K.
271
+ */
272
static __forceinline__ __host__
int mtgp32_init_by_str(unsigned int state[],
                       const mtgp32_params_fast_t *para, unsigned char *array) {
    int i, j, count;
    unsigned int r;
    int lag;
    int mid;
    int size = para->mexp / 32 + 1;  /* number of 32-bit state words */
    /* Seed length is taken from the NUL-terminated string; otherwise this
       routine is identical to mtgp32_init_by_array() with byte-sized
       seed words. */
    int length = (unsigned int)strlen((char *)array);
    unsigned int hidden_seed;
    unsigned int tmp;

    /* Mixing lag chosen from the state size; mid is the second tap point. */
    if (size >= 623) {
        lag = 11;
    } else if (size >= 68) {
        lag = 7;
    } else if (size >= 39) {
        lag = 5;
    } else {
        lag = 3;
    }
    mid = (size - lag) / 2;

    /* Fold a parameter-set-specific value to one byte and pre-fill the
       whole state. */
    hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
    tmp = hidden_seed;
    tmp += tmp >> 16;
    tmp += tmp >> 8;
    memset(state, tmp & 0xff, sizeof(unsigned int) * size);
    state[0] = hidden_seed;

    /* Run at least one full pass over the state, or one step per seed
       byte, whichever is larger. */
    if (length + 1 > size) {
        count = length + 1;
    } else {
        count = size;
    }
    /* Bootstrap step mixing the three tap points into word 0. */
    r = ini_func1(state[0] ^ state[mid] ^ state[size - 1]);
    state[mid] += r;
    r += length;
    state[(mid + lag) % size] += r;
    state[0] = r;
    i = 1;
    count--;
    /* Phase 1: fold each seed byte into the state. */
    for (i = 1, j = 0; (j < count) && (j < length); j++) {
        r = ini_func1(state[i] ^ state[(i + mid) % size]
                      ^ state[(i + size - 1) % size]);
        state[(i + mid) % size] += r;
        r += array[j] + i;
        state[(i + mid + lag) % size] += r;
        state[i] = r;
        i = (i + 1) % size;
    }
    /* Phase 2: keep stirring (no seed input) until `count` steps are done. */
    for (; j < count; j++) {
        r = ini_func1(state[i] ^ state[(i + mid) % size]
                      ^ state[(i + size - 1) % size]);
        state[(i + mid) % size] += r;
        r += i;
        state[(i + mid + lag) % size] += r;
        state[i] = r;
        i = (i + 1) % size;
    }
    /* Phase 3: one final full pass with the second mixing function. */
    for (j = 0; j < size; j++) {
        r = ini_func2(state[i] + state[(i + mid) % size]
                      + state[(i + size - 1) % size]);
        state[(i + mid) % size] ^= r;
        r -= i;
        state[(i + mid + lag) % size] ^= r;
        state[i] = r;
        i = (i + 1) % size;
    }
    /* Guard against the degenerate all-zero state: force a non-zero word. */
    if (state[size - 1] == 0) {
        state[size - 1] = non_zero;
    }
    return 0;
}
346
+
347
/*
 * Re-pack `block_num` MTGP32 parameter sets into flat per-field arrays in
 * host memory and copy each array to the device-side kernel-parameter
 * structure *p.
 *
 * BUG FIX: the original freed the scratch buffers both inside the
 * allocation-failure branch AND in the unconditional cleanup at the end of
 * the function. The pointers were never reset to NULL after the first
 * free, so any buffer that had been successfully allocated before the
 * failing malloc was freed twice (undefined behavior). Cleanup now
 * happens in exactly one place; free(NULL) is a defined no-op, so the
 * NULL checks are unnecessary as well.
 *
 * \param params    - host array of block_num parameter sets
 * \param p         - device-memory destination structure
 * \param block_num - number of parameter sets to pack
 *
 * \return
 * - CURAND_STATUS_ALLOCATION_FAILED if host memory could not be allocated
 * - CURAND_STATUS_INITIALIZATION_FAILED if a copy to device memory failed
 * - CURAND_STATUS_SUCCESS otherwise
 */
template<typename ParamsType>
static __forceinline__ __host__
curandStatus_t curandMakeMTGP32ConstantsImpl(const mtgp32_params_fast_t params[], ParamsType * p, const int block_num)
{
    const int size1 = sizeof(unsigned int) * block_num;            /* one word per block */
    const int size2 = sizeof(unsigned int) * block_num * TBL_SIZE; /* one table per block */
    unsigned int *h_pos_tbl           = (unsigned int *)malloc(size1);
    unsigned int *h_sh1_tbl           = (unsigned int *)malloc(size1);
    unsigned int *h_sh2_tbl           = (unsigned int *)malloc(size1);
    unsigned int *h_param_tbl         = (unsigned int *)malloc(size2);
    unsigned int *h_temper_tbl        = (unsigned int *)malloc(size2);
    unsigned int *h_single_temper_tbl = (unsigned int *)malloc(size2);
    unsigned int *h_mask              = (unsigned int *)malloc(sizeof(unsigned int));
    curandStatus_t status = CURAND_STATUS_SUCCESS;

    if (h_pos_tbl == NULL
            || h_sh1_tbl == NULL
            || h_sh2_tbl == NULL
            || h_param_tbl == NULL
            || h_temper_tbl == NULL
            || h_single_temper_tbl == NULL
            || h_mask == NULL) {
        /* Do NOT free here: the single cleanup below releases whatever
           was allocated (freeing here too caused a double free). */
        status = CURAND_STATUS_ALLOCATION_FAILED;
    } else {
        /* Flatten the array-of-structs into per-field arrays. */
        h_mask[0] = params[0].mask;
        for (int i = 0; i < block_num; i++) {
            h_pos_tbl[i] = params[i].pos;
            h_sh1_tbl[i] = params[i].sh1;
            h_sh2_tbl[i] = params[i].sh2;
            for (int j = 0; j < TBL_SIZE; j++) {
                h_param_tbl[i * TBL_SIZE + j]         = params[i].tbl[j];
                h_temper_tbl[i * TBL_SIZE + j]        = params[i].tmp_tbl[j];
                h_single_temper_tbl[i * TBL_SIZE + j] = params[i].flt_tmp_tbl[j];
            }
        }
        /* Copy each flattened array to device memory; stop at first failure. */
        if (cudaMemcpy( p->pos_tbl,
                        h_pos_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
        {
            status = CURAND_STATUS_INITIALIZATION_FAILED;
        } else
        if (cudaMemcpy( p->sh1_tbl,
                        h_sh1_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
        {
            status = CURAND_STATUS_INITIALIZATION_FAILED;
        } else
        if (cudaMemcpy( p->sh2_tbl,
                        h_sh2_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
        {
            status = CURAND_STATUS_INITIALIZATION_FAILED;
        } else
        if (cudaMemcpy( p->param_tbl,
                        h_param_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
        {
            status = CURAND_STATUS_INITIALIZATION_FAILED;
        } else
        if (cudaMemcpy( p->temper_tbl,
                        h_temper_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
        {
            status = CURAND_STATUS_INITIALIZATION_FAILED;
        } else
        if (cudaMemcpy( p->single_temper_tbl,
                        h_single_temper_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
        {
            status = CURAND_STATUS_INITIALIZATION_FAILED;
        } else
        if (cudaMemcpy( p->mask,
                        h_mask, sizeof(unsigned int), cudaMemcpyHostToDevice) != cudaSuccess)
        {
            status = CURAND_STATUS_INITIALIZATION_FAILED;
        }
    }
    /* Single cleanup path; free(NULL) is a no-op. */
    free(h_pos_tbl);
    free(h_sh1_tbl);
    free(h_sh2_tbl);
    free(h_param_tbl);
    free(h_temper_tbl);
    free(h_single_temper_tbl);
    free(h_mask);
    return status;
}
442
+
443
+ /**
444
+ * \brief Set up constant parameters for the mtgp32 generator
445
+ *
446
+ * This host-side helper function re-organizes CURAND_NUM_MTGP32_PARAMS sets of
447
+ * generator parameters for use by kernel functions and copies the
448
+ * result to the specified location in device memory.
449
+ *
450
+ * \param params - Pointer to an array of type mtgp32_params_fast_t in host memory
451
+ * \param p - pointer to a structure of type mtgp32_kernel_params_t in device memory.
452
+ *
453
+ * \return
454
+ * - CURAND_STATUS_ALLOCATION_FAILED if host memory could not be allocated
455
+ * - CURAND_STATUS_INITIALIZATION_FAILED if the copy to device memory failed
456
+ * - CURAND_STATUS_SUCCESS otherwise
457
+ */
458
static __forceinline__ __host__
curandStatus_t curandMakeMTGP32Constants(const mtgp32_params_fast_t params[], mtgp32_kernel_params_t * p)
{
    /* Thin wrapper over the templated implementation with the fixed
       block count CURAND_NUM_MTGP32_PARAMS. */
    return curandMakeMTGP32ConstantsImpl(params, p, CURAND_NUM_MTGP32_PARAMS);
}
463
+
464
+ /**
465
+ * \brief Set up initial states for the mtgp32 generator
466
+ *
467
+ * This host-side helper function initializes a number of states (one parameter set per state) for
468
+ * an mtgp32 generator. To accomplish this it allocates a state array in host memory,
469
+ * initializes that array, and copies the result to device memory.
470
+ *
471
+ * \param s - pointer to an array of states in device memory
472
+ * \param params - Pointer to an array of type mtgp32_params_fast_t in host memory
473
+ * \param k - pointer to a structure of type mtgp32_kernel_params_t in device memory
474
+ * \param n - number of parameter sets/states to initialize
475
+ * \param seed - seed value
476
+ *
477
+ * \return
478
+ * - CURAND_STATUS_ALLOCATION_FAILED if host memory state could not be allocated
479
+ * - CURAND_STATUS_INITIALIZATION_FAILED if the copy to device memory failed
480
+ * - CURAND_STATUS_SUCCESS otherwise
481
+ */
482
static __forceinline__ __host__
curandStatus_t CURANDAPI curandMakeMTGP32KernelState(curandStateMtgp32_t *s,
                                                     mtgp32_params_fast_t params[],
                                                     mtgp32_kernel_params_t *k,
                                                     int n,
                                                     unsigned long long seed)
{
    /* Stage the n states in host memory, then copy them to the device
       in one transfer. */
    curandStateMtgp32_t *staging =
        (curandStateMtgp32_t *) malloc(sizeof(curandStateMtgp32_t) * n);
    if (staging == NULL) {
        return CURAND_STATUS_ALLOCATION_FAILED;
    }

    curandStatus_t status = CURAND_STATUS_SUCCESS;
    /* Fold the 64-bit seed down to 32 bits. */
    seed = seed ^ (seed >> 32);
    for (int i = 0; i < n; i++) {
        /* Each state gets its own parameter set and a distinct seed. */
        mtgp32_init_state(&(staging[i].s[0]), &params[i], (unsigned int)seed + i + 1);
        staging[i].offset = 0;
        staging[i].pIdx = i;
        staging[i].k = k;
    }
    if (cudaMemcpy(s, staging,
                   sizeof(curandStateMtgp32_t) * n,
                   cudaMemcpyHostToDevice) != cudaSuccess) {
        status = CURAND_STATUS_INITIALIZATION_FAILED;
    }
    free(staging);
    return status;
}
511
+
512
+ /** @} */
513
+
514
+ #endif
515
+
516
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_poisson.h ADDED
@@ -0,0 +1,751 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_POISSON_H_)
52
+ #define CURAND_POISSON_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include "curand_mrg32k3a.h"
65
+ #include "curand_mtgp32_kernel.h"
66
+ #include "curand_philox4x32_x.h"
67
+
68
+ #define CR_CUDART_PI 3.1415926535897931e+0
69
+ #define CR_CUDART_TWO_TO_52 4503599627370496.0
70
+
71
+
72
/* 1/sqrt(a): on device, the approximate flush-to-zero PTX instruction;
   on host, computed via sqrtf. */
QUALIFIERS float __cr_rsqrt(float a)
{
#ifdef __CUDA_ARCH__
    asm ("rsqrt.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
#else
    a = 1.0f / sqrtf (a);
#endif
    return a;
}
81
+
82
/* exp(a): on device, scale by log2(e) and use the approximate base-2
   exponential PTX instruction; on host, computed via expf. */
QUALIFIERS float __cr_exp (float a)
{
#ifdef __CUDA_ARCH__
    a = a * 1.4426950408889634074;  /* log2(e) */
    asm ("ex2.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
#else
    a = expf (a);
#endif
    return a;
}
92
+
93
/* ln(a): on device, the approximate base-2 logarithm PTX instruction
   rescaled by ln(2); on host, computed via logf. */
QUALIFIERS float __cr_log (float a)
{
#ifdef __CUDA_ARCH__
    asm ("lg2.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
    a = a * 0.69314718055994530942;  /* ln(2) */
#else
    a = logf (a);
#endif
    return a;
}
103
+
104
/* 1/a: on device, the approximate flush-to-zero reciprocal PTX
   instruction; on host, a plain division. */
QUALIFIERS float __cr_rcp (float a)
{
#ifdef __CUDA_ARCH__
    asm ("rcp.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
#else
    a = 1.0f / a;
#endif
    return a;
}
113
+
114
+ /* Computes regularized gamma function: gammainc(a,x)/gamma(a) */
115
QUALIFIERS float __cr_pgammainc (float a, float x)
{
    /* Sigmoid-style approximation: 1 / (1 + exp(alpha*(a-x) - beta))^2,
       where alpha and beta depend only on a. */
    float t, alpha, beta;

    /* First level parametrization constants */
    float ma1 = 1.43248035075540910f,
          ma2 = 0.12400979329415655f,
          ma3 = 0.00025361074907033f,
          mb1 = 0.21096734870196546f,
          mb2 = 1.97381164089999420f,
          mb3 = 0.94201734077887530f;

    /* Second level parametrization constants (depends only on a) */
    /* NOTE(review): rsqrt of (a - ma2)/(a - mb2) assumes a large enough
       to keep both positive — confirm at call sites. */
    alpha = __cr_rsqrt (a - ma2);
    alpha = ma1 * alpha + ma3;
    beta = __cr_rsqrt (a - mb2);
    beta = mb1 * beta + mb3;

    /* Final approximation (depends on a and x) */

    t = a - x;
    t = alpha * t - beta;
    t = 1.0f + __cr_exp (t);
    t = t * t;
    t = __cr_rcp (t);

    /* Negative a,x or a,x=NAN requires special handling */
    //t = !(x > 0 && a >= 0) ? 0.0 : t;

    return t;
}
147
+
148
+ /* Computes inverse of pgammainc */
149
QUALIFIERS float __cr_pgammaincinv (float a, float y)
{
    /* Inverse of __cr_pgammainc: solves the sigmoid approximation
       y = 1/(1+exp(alpha*(a-x)-beta))^2 for x, using the same alpha/beta
       parametrization. */
    float t, alpha, beta;

    /* First level parametrization constants */

    float ma1 = 1.43248035075540910f,
          ma2 = 0.12400979329415655f,
          ma3 = 0.00025361074907033f,
          mb1 = 0.21096734870196546f,
          mb2 = 1.97381164089999420f,
          mb3 = 0.94201734077887530f;

    /* Second level parametrization constants (depends only on a) */

    alpha = __cr_rsqrt (a - ma2);
    alpha = ma1 * alpha + ma3;
    beta = __cr_rsqrt (a - mb2);
    beta = mb1 * beta + mb3;

    /* Final approximation (depends on a and y) */

    t = __cr_rsqrt (y) - 1.0f;
    t = __cr_log (t);
    t = beta + t;
    t = - t * __cr_rcp (alpha) + a;
    /* Negative a,x or a,x=NAN requires special handling */
    //t = !(y > 0 && a >= 0) ? 0.0 : t;
    return t;
}
179
+
180
/* lgamma(k) for k = 1..9: __cr_lgamma_table[k-1] == lgamma(k); matches
   the host-side switch in __cr_lgamma_integer. */
#if defined(__CUDACC_RDC__) && (__cplusplus >= 201703L) && defined(__cpp_inline_variables)
/* With relocatable device code and C++17, declare as an inline variable
   so translation units share one definition. */
inline __constant__ double __cr_lgamma_table [] = {
#else
static __constant__ double __cr_lgamma_table [] = {
#endif
    0.000000000000000000e-1,  /* lgamma(1) */
    0.000000000000000000e-1,  /* lgamma(2) */
    6.931471805599453094e-1,  /* lgamma(3) = ln 2 */
    1.791759469228055001e0,
    3.178053830347945620e0,
    4.787491742782045994e0,
    6.579251212010100995e0,
    8.525161361065414300e0,
    1.060460290274525023e1    /* lgamma(9) */
};
195
+
196
+
197
/* lgamma(a) for integer a: table lookup for a <= 8, Stirling series
   otherwise. NOTE(review): a < 1 would index the table out of range —
   callers appear to pass positive counts; confirm. */
QUALIFIERS double __cr_lgamma_integer(int a)
{
    double s;
    double t;
    double fa = fabs((float)a);
    double sum;

    if (a > 8) {
        /* Stirling approximation; coefficients from Hart et al, "Computer
         * Approximations", Wiley 1968. Approximation 5404.
         */
        s = 1.0 / fa;
        t = s * s;
        sum = -0.1633436431e-2;
        sum = sum * t + 0.83645878922e-3;
        sum = sum * t - 0.5951896861197e-3;
        sum = sum * t + 0.793650576493454e-3;
        sum = sum * t - 0.277777777735865004e-2;
        sum = sum * t + 0.833333333333331018375e-1;
        sum = sum * s + 0.918938533204672;   /* 0.5*ln(2*pi) */
        s = 0.5 * log (fa);
        t = fa - 0.5;
        s = s * t;
        t = s - fa;
        s = s + sum;
        t = t + s;
        return t;
    } else {
#ifdef __CUDA_ARCH__
        /* Device: constant-memory table, lgamma(a) at index a-1. */
        return __cr_lgamma_table [(int) fa-1];
#else
        /* Host: __constant__ data is not readable here, so mirror the
           table in a switch. */
        switch(a) {
            case 1: return 0.000000000000000000e-1;
            case 2: return 0.000000000000000000e-1;
            case 3: return 6.931471805599453094e-1;
            case 4: return 1.791759469228055001e0;
            case 5: return 3.178053830347945620e0;
            case 6: return 4.787491742782045994e0;
            case 7: return 6.579251212010100995e0;
            case 8: return 8.525161361065414300e0;
            default: return 1.060460290274525023e1;
        }
#endif
    }
}
242
+
243
+ #define KNUTH_FLOAT_CONST 60.0
244
template <typename T>
// Donald E. Knuth Seminumerical Algorithms. The Art of Computer Programming, Volume 2
// Product-of-uniforms method: starting from exp(+lambda), multiply by
// uniform draws until the running product falls to 1; the number of
// draws minus one is Poisson(lambda) distributed.
QUALIFIERS unsigned int curand_poisson_knuth(T *state, float lambda)
{
    unsigned int draws = 0;
    float product = expf(lambda);
    do {
        draws++;
        product *= curand_uniform(state);
    } while (product > 1.0);
    return draws - 1;
}
256
+
257
template <typename T>
// Donald E. Knuth Seminumerical Algorithms. The Art of Computer Programming, Volume 2
// Four independent Poisson(lambda) samples via the same product-of-uniforms
// method as curand_poisson_knuth, unrolled over the components of a uint4.
QUALIFIERS uint4 curand_poisson_knuth4(T *state, float lambda)
{
    uint4 k = {0,0,0,0};
    float exp_lambda = expf(lambda);
    float4 p={ exp_lambda,exp_lambda,exp_lambda,exp_lambda };
    /* Each component consumes its own sequence of uniforms. */
    do{
        k.x++;
        p.x *= curand_uniform(state);
    }while (p.x > 1.0);
    do{
        k.y++;
        p.y *= curand_uniform(state);
    }while (p.y > 1.0);
    do{
        k.z++;
        p.z *= curand_uniform(state);
    }while (p.z > 1.0);
    do{
        k.w++;
        p.w *= curand_uniform(state);
    }while (p.w > 1.0);

    /* Loops overshoot by one draw each; correct the counts. */
    k.x--;
    k.y--;
    k.z--;
    k.w--;
    return k;
}
287
+
288
template <typename T>
// Marsaglia, Tsang, Wang Journal of Statistical Software, square histogram.
// Maps one raw draw x to a discrete sample: pick column j from the uniform
// u, return j if u lies below the column boundary V[j], otherwise the
// alias column K[j]; `shift` re-bases the result onto the distribution's
// support.
QUALIFIERS unsigned int _curand_M2_double(T x, curandDistributionM2Shift_t distributionM2)
{
    double u = _curand_uniform_double(x);
    int j = (int) floor(distributionM2->length*u);


#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
    /* sm_35+: __ldg reads the tables through the read-only data cache. */
    double histogramVj = __ldg( &(distributionM2->histogram->V[j]));
    unsigned int histogramKj = __ldg( &(distributionM2->histogram->K[j]));
#else
    double histogramVj = distributionM2->histogram->V[j];
    unsigned int histogramKj = distributionM2->histogram->K[j];
#endif
    //if (u < distributionM2->histogram->V[j]) return distributionM2->shift + j;
    //return distributionM2->shift + distributionM2->histogram->K[j];
    if (u < histogramVj) return distributionM2->shift + j;
    return distributionM2->shift + histogramKj;
}
308
+
309
+ template <typename T>
310
+ // Marsaglia, Tsang, Wang Journal of Statistical Software, square histogram.
311
+ QUALIFIERS uint4 _curand_M2_double4(T x, curandDistributionM2Shift_t distributionM2)
312
+ {
313
+ double4 u;
314
+ uint4 result = {0,0,0,0};
315
+ int4 flag = {1,1,1,1};
316
+
317
+ u.x = _curand_uniform_double(x.x);
318
+ u.y = _curand_uniform_double(x.y);
319
+ u.z = _curand_uniform_double(x.z);
320
+ u.w = _curand_uniform_double(x.w);
321
+
322
+ int4 j;
323
+ j.x = (int) floor(distributionM2->length*u.x);
324
+ j.y = (int) floor(distributionM2->length*u.y);
325
+ j.z = (int) floor(distributionM2->length*u.z);
326
+ j.w = (int) floor(distributionM2->length*u.w);
327
+ // int result;
328
+
329
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
330
+ double histogramVjx = __ldg( &(distributionM2->histogram->V[j.x]));
331
+ double histogramVjy = __ldg( &(distributionM2->histogram->V[j.y]));
332
+ double histogramVjz = __ldg( &(distributionM2->histogram->V[j.z]));
333
+ double histogramVjw = __ldg( &(distributionM2->histogram->V[j.w]));
334
+
335
+ unsigned int histogramKjx = __ldg( &(distributionM2->histogram->K[j.x]));
336
+ unsigned int histogramKjy = __ldg( &(distributionM2->histogram->K[j.y]));
337
+ unsigned int histogramKjz = __ldg( &(distributionM2->histogram->K[j.z]));
338
+ unsigned int histogramKjw = __ldg( &(distributionM2->histogram->K[j.w]));
339
+ #else
340
+ double histogramVjx = distributionM2->histogram->V[j.x];
341
+ double histogramVjy = distributionM2->histogram->V[j.y];
342
+ double histogramVjz = distributionM2->histogram->V[j.z];
343
+ double histogramVjw = distributionM2->histogram->V[j.w];
344
+
345
+ unsigned int histogramKjx = distributionM2->histogram->K[j.x];
346
+ unsigned int histogramKjy = distributionM2->histogram->K[j.y];
347
+ unsigned int histogramKjz = distributionM2->histogram->K[j.z];
348
+ unsigned int histogramKjw = distributionM2->histogram->K[j.w];
349
+ #endif
350
+
351
+ if (u.x < histogramVjx){ result.x = distributionM2->shift + j.x; flag.x = 0; }
352
+ if (u.y < histogramVjy){ result.y = distributionM2->shift + j.y; flag.y = 0; }
353
+ if (u.z < histogramVjz){ result.z = distributionM2->shift + j.z; flag.z = 0; }
354
+ if (u.w < histogramVjw){ result.w = distributionM2->shift + j.w; flag.w = 0; }
355
+ //return distributionM2->shift + distributionM2->histogram->K[j];
356
+
357
+ if(flag.x) result.x = distributionM2->shift + histogramKjx;
358
+ if(flag.y) result.y = distributionM2->shift + histogramKjy;
359
+ if(flag.z) result.z = distributionM2->shift + histogramKjz;
360
+ if(flag.w) result.w = distributionM2->shift + histogramKjw;
361
+
362
+ return result;
363
+ }
364
+
365
+ template <typename STATE>
366
+ QUALIFIERS unsigned int curand_M2_double(STATE *state, curandDistributionM2Shift_t distributionM2)
367
+ {
368
+ return _curand_M2_double(curand(state), distributionM2);
369
+ }
370
+
371
+ template <typename STATE>
372
+ QUALIFIERS uint4 curand_M2_double4(STATE *state, curandDistributionM2Shift_t distributionM2)
373
+ {
374
+ return _curand_M2_double4(curand4(state), distributionM2);
375
+ }
376
+
377
+
378
+ template <typename T>
379
+ QUALIFIERS unsigned int _curand_binary_search_double(T x, curandDistributionShift_t distribution)
380
+ {
381
+ double u = _curand_uniform_double(x);
382
+ int min = 0;
383
+ int max = distribution->length-1;
384
+ do{
385
+ int mid = (max + min)/2;
386
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
387
+ double probability_mid = __ldg( &(distribution->probability[mid]));
388
+ #else
389
+ double probability_mid = distribution->probability[mid];
390
+ #endif
391
+ if (u <= probability_mid){
392
+ max = mid;
393
+ }else{
394
+ min = mid+1;
395
+ }
396
+ }while (min < max);
397
+ return distribution->shift + min;
398
+ }
399
+
400
+ template <typename STATE>
401
+ QUALIFIERS unsigned int curand_binary_search_double(STATE *state, curandDistributionShift_t distribution)
402
+ {
403
+ return _curand_binary_search_double(curand(state), distribution);
404
+ }
405
+
406
+ // Generates uniformly distributed double values in range (0.0; 1.0) from uniformly distributed
407
+ // unsigned int. We can't use standard _curand_uniform_double since it can generate 1.0.
408
+ // This is required only for _curand_poisson_ITR_double.
409
+ QUALIFIERS double _curand_uniform_double_excluding_one(unsigned int x)
410
+ {
411
+ return x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
412
+ }
413
+
414
+ // Overload for unsigned long long.
415
+ // This is required only for _curand_poisson_ITR_double.
416
+ QUALIFIERS double _curand_uniform_double_excluding_one(unsigned long long x)
417
+ {
418
+ return (x >> 11) * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/4.0);
419
+ }
420
+
421
+ #define MAGIC_DOUBLE_CONST 500.0
422
+ template <typename T>
423
+ //George S. Fishman Discrete-event simulation: modeling, programming, and analysis
424
+ QUALIFIERS unsigned int _curand_poisson_ITR_double(T x, double lambda)
425
+ {
426
+ double L,p = 1.0;
427
+ double q = 1.0;
428
+ unsigned int k = 0;
429
+ int pow=0;
430
+ // This algorithm requires u to be in (0;1) range, however, _curand_uniform_double
431
+ // returns a number in range (0;1]. If u is 1.0 the inner loop never ends. The
432
+ // following operation transforms the range from (0;1] to (0;1).
433
+ double u = _curand_uniform_double_excluding_one(x);
434
+ do{
435
+ if (lambda > (double)(pow+MAGIC_DOUBLE_CONST)){
436
+ L = exp(-MAGIC_DOUBLE_CONST);
437
+ }else{
438
+ L = exp((double)(pow - lambda));
439
+ }
440
+ p *= L;
441
+ q *= L;
442
+ pow += (int) MAGIC_DOUBLE_CONST;
443
+ while (u > q){
444
+ k++;
445
+ p *= ((double)lambda / (double) k);
446
+ q += p;
447
+ }
448
+ }while((double)pow < lambda);
449
+ return k;
450
+ }
451
+
452
+ template <typename T>
453
+ /* Rejection Method for Poisson distribution based on gammainc approximation */
454
+ QUALIFIERS unsigned int curand_poisson_gammainc(T state, float lambda){
455
+ float y, x, t, z,v;
456
+ float logl = __cr_log (lambda);
457
+ while (true) {
458
+ y = curand_uniform (state);
459
+ x = __cr_pgammaincinv (lambda, y);
460
+ x = floorf (x);
461
+ z = curand_uniform (state);
462
+ v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
463
+ z = z*v;
464
+ t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
465
+ if ((z < t) && (v>=1e-20))
466
+ break;
467
+ }
468
+ return (unsigned int)x;
469
+ }
470
+
471
+ template <typename T>
472
+ /* Rejection Method for Poisson distribution based on gammainc approximation */
473
+ QUALIFIERS uint4 curand_poisson_gammainc4(T state, float lambda){
474
+ uint4 result;
475
+ float y, x, t, z,v;
476
+ float logl = __cr_log (lambda);
477
+ while (true) {
478
+ y = curand_uniform(state);
479
+ x = __cr_pgammaincinv (lambda, y);
480
+ x = floorf (x);
481
+ z = curand_uniform (state);
482
+ v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
483
+ z = z*v;
484
+ t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
485
+ if ((z < t) && (v>=1e-20))
486
+ break;
487
+ }
488
+ result.x = (unsigned int)x;
489
+
490
+ while (true) {
491
+ y = curand_uniform(state);
492
+ x = __cr_pgammaincinv (lambda, y);
493
+ x = floorf (x);
494
+ z = curand_uniform (state);
495
+ v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
496
+ z = z*v;
497
+ t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
498
+ if ((z < t) && (v>=1e-20))
499
+ break;
500
+ }
501
+ result.y = (unsigned int)x;
502
+
503
+ while (true) {
504
+ y = curand_uniform(state);
505
+ x = __cr_pgammaincinv (lambda, y);
506
+ x = floorf (x);
507
+ z = curand_uniform (state);
508
+ v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
509
+ z = z*v;
510
+ t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
511
+ if ((z < t) && (v>=1e-20))
512
+ break;
513
+ }
514
+ result.z = (unsigned int)x;
515
+
516
+ while (true) {
517
+ y = curand_uniform(state);
518
+ x = __cr_pgammaincinv (lambda, y);
519
+ x = floorf (x);
520
+ z = curand_uniform (state);
521
+ v = (__cr_pgammainc (lambda, x + 1.0f) - __cr_pgammainc (lambda, x)) * 1.3f;
522
+ z = z*v;
523
+ t = (float)__cr_exp (-lambda + x * logl - (float)__cr_lgamma_integer ((int)(1.0f + x)));
524
+ if ((z < t) && (v>=1e-20))
525
+ break;
526
+ }
527
+ result.w = (unsigned int)x;
528
+
529
+ return result;
530
+ }
531
+ // Note below that the round to nearest integer, where needed,is done in line with code that
532
+ // assumes the range of values is < 2**32
533
+
534
+ template <typename T>
535
+ QUALIFIERS unsigned int _curand_poisson(T x, double lambda)
536
+ {
537
+ if (lambda < 1000)
538
+ return _curand_poisson_ITR_double(x, lambda);
539
+ return (unsigned int)((sqrt(lambda) * _curand_normal_icdf_double(x)) + lambda + 0.5); //Round to nearest
540
+ }
541
+
542
+ template <typename T>
543
+ QUALIFIERS unsigned int _curand_poisson_from_normal(T x, double lambda)
544
+ {
545
+ return (unsigned int)((sqrt(lambda) * _curand_normal_icdf(x)) + lambda + 0.5); //Round to nearest
546
+ }
547
+
548
+ template <typename STATE>
549
+ QUALIFIERS unsigned int curand_poisson_from_normal(STATE state, double lambda)
550
+ {
551
+ return (unsigned int)((sqrt(lambda) * curand_normal(state)) + lambda + 0.5); //Round to nearest
552
+ }
553
+
554
+ template <typename STATE>
555
+ QUALIFIERS uint4 curand_poisson_from_normal4(STATE state, double lambda)
556
+ {
557
+ uint4 result;
558
+ float4 _res;
559
+
560
+ _res = curand_normal4(state);
561
+
562
+ result.x = (unsigned int)((sqrt(lambda) * _res.x) + lambda + 0.5); //Round to nearest
563
+ result.y = (unsigned int)((sqrt(lambda) * _res.y) + lambda + 0.5); //Round to nearest
564
+ result.z = (unsigned int)((sqrt(lambda) * _res.z) + lambda + 0.5); //Round to nearest
565
+ result.w = (unsigned int)((sqrt(lambda) * _res.w) + lambda + 0.5); //Round to nearest
566
+ return result; //Round to nearest
567
+ }
568
+
569
+ /**
570
+ * \brief Return a Poisson-distributed unsigned int from a XORWOW generator.
571
+ *
572
+ * Return a single unsigned int from a Poisson
573
+ * distribution with lambda \p lambda from the XORWOW generator in \p state,
574
+ * increment the position of the generator by a variable amount, depending
575
+ * on the algorithm used.
576
+ *
577
+ * \param state - Pointer to state to update
578
+ * \param lambda - Lambda of the Poisson distribution
579
+ *
580
+ * \return Poisson-distributed unsigned int with lambda \p lambda
581
+ */
582
+ QUALIFIERS unsigned int curand_poisson(curandStateXORWOW_t *state, double lambda)
583
+ {
584
+ if (lambda < 64)
585
+ return curand_poisson_knuth(state, (float)lambda);
586
+ if (lambda > 4000)
587
+ return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
588
+ return curand_poisson_gammainc(state, (float)lambda);
589
+ }
590
+
591
+ /**
592
+ * \brief Return a Poisson-distributed unsigned int from a Philox4_32_10 generator.
593
+ *
594
+ * Return a single unsigned int from a Poisson
595
+ * distribution with lambda \p lambda from the Philox4_32_10 generator in \p state,
596
+ * increment the position of the generator by a variable amount, depending
597
+ * on the algorithm used.
598
+ *
599
+ * \param state - Pointer to state to update
600
+ * \param lambda - Lambda of the Poisson distribution
601
+ *
602
+ * \return Poisson-distributed unsigned int with lambda \p lambda
603
+ */
604
+ QUALIFIERS unsigned int curand_poisson(curandStatePhilox4_32_10_t *state, double lambda)
605
+ {
606
+ if (lambda < 64)
607
+ return curand_poisson_knuth(state, (float)lambda);
608
+ if (lambda > 4000)
609
+ return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
610
+ return curand_poisson_gammainc(state, (float)lambda);
611
+ }
612
+ /**
613
+ * \brief Return four Poisson-distributed unsigned ints from a Philox4_32_10 generator.
614
+ *
615
+ * Return a four unsigned ints from a Poisson
616
+ * distribution with lambda \p lambda from the Philox4_32_10 generator in \p state,
617
+ * increment the position of the generator by a variable amount, depending
618
+ * on the algorithm used.
619
+ *
620
+ * \param state - Pointer to state to update
621
+ * \param lambda - Lambda of the Poisson distribution
622
+ *
623
+ * \return Poisson-distributed unsigned int with lambda \p lambda
624
+ */
625
+ QUALIFIERS uint4 curand_poisson4(curandStatePhilox4_32_10_t *state, double lambda)
626
+ {
627
+ uint4 result;
628
+ double4 _res;
629
+ if (lambda < 64)
630
+ return curand_poisson_knuth4(state, (float)lambda);
631
+ if (lambda > 4000) {
632
+ _res = curand_normal4_double(state);
633
+ result.x = (unsigned int)((sqrt(lambda) * _res.x) + lambda + 0.5); //Round to nearest
634
+ result.y = (unsigned int)((sqrt(lambda) * _res.y) + lambda + 0.5); //Round to nearest
635
+ result.z = (unsigned int)((sqrt(lambda) * _res.z) + lambda + 0.5); //Round to nearest
636
+ result.w = (unsigned int)((sqrt(lambda) * _res.w) + lambda + 0.5); //Round to nearest
637
+ return result;
638
+ }
639
+ return curand_poisson_gammainc4(state, (float)lambda);
640
+ }
641
+
642
+
643
+
644
+ /**
645
+ * \brief Return a Poisson-distributed unsigned int from a MRG32k3A generator.
646
+ *
647
+ * Return a single unsigned int from a Poisson
648
+ * distribution with lambda \p lambda from the MRG32k3a generator in \p state,
649
+ * increment the position of the generator by a variable amount, depending
650
+ * on the algorithm used.
651
+ *
652
+ * \param state - Pointer to state to update
653
+ * \param lambda - Lambda of the Poisson distribution
654
+ *
655
+ * \return Poisson-distributed unsigned int with lambda \p lambda
656
+ */
657
+ QUALIFIERS unsigned int curand_poisson(curandStateMRG32k3a_t *state, double lambda)
658
+ {
659
+ if (lambda < 64)
660
+ return curand_poisson_knuth(state, (float)lambda);
661
+ if (lambda > 4000)
662
+ return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
663
+ return curand_poisson_gammainc(state, (float)lambda);
664
+ }
665
+
666
+ /**
667
+ * \brief Return a Poisson-distributed unsigned int from a MTGP32 generator.
668
+ *
669
+ * Return a single int from a Poisson
670
+ * distribution with lambda \p lambda from the MTGP32 generator in \p state,
671
+ * increment the position of the generator by one.
672
+ *
673
+ * \param state - Pointer to state to update
674
+ * \param lambda - Lambda of the Poisson distribution
675
+ *
676
+ * \return Poisson-distributed unsigned int with lambda \p lambda
677
+ */
678
+ QUALIFIERS unsigned int curand_poisson(curandStateMtgp32_t *state, double lambda)
679
+ {
680
+ return _curand_poisson(curand(state), lambda);
681
+ }
682
+
683
+ /**
684
+ * \brief Return a Poisson-distributed unsigned int from a Sobol32 generator.
685
+ *
686
+ * Return a single unsigned int from a Poisson
687
+ * distribution with lambda \p lambda from the Sobol32 generator in \p state,
688
+ * increment the position of the generator by one.
689
+ *
690
+ * \param state - Pointer to state to update
691
+ * \param lambda - Lambda of the Poisson distribution
692
+ *
693
+ * \return Poisson-distributed unsigned int with lambda \p lambda
694
+ */
695
+
696
+ QUALIFIERS unsigned int curand_poisson(curandStateSobol32_t *state, double lambda)
697
+ {
698
+ return _curand_poisson(curand(state), lambda);
699
+ }
700
+
701
+ /**
702
+ * \brief Return a Poisson-distributed unsigned int from a scrambled Sobol32 generator.
703
+ *
704
+ * Return a single unsigned int from a Poisson
705
+ * distribution with lambda \p lambda from the scrambled Sobol32 generator in \p state,
706
+ * increment the position of the generator by one.
707
+ *
708
+ * \param state - Pointer to state to update
709
+ * \param lambda - Lambda of the Poisson distribution
710
+ *
711
+ * \return Poisson-distributed unsigned int with lambda \p lambda
712
+ */
713
+ QUALIFIERS unsigned int curand_poisson(curandStateScrambledSobol32_t *state, double lambda)
714
+ {
715
+ return _curand_poisson(curand(state), lambda);
716
+ }
717
+
718
+ /**
719
+ * \brief Return a Poisson-distributed unsigned int from a Sobol64 generator.
720
+ *
721
+ * Return a single unsigned int from a Poisson
722
+ * distribution with lambda \p lambda from the Sobol64 generator in \p state,
723
+ * increment position of generator by one.
724
+ *
725
+ * \param state - Pointer to state to update
726
+ * \param lambda - Lambda of the Poisson distribution
727
+ *
728
+ * \return Poisson-distributed unsigned int with lambda \p lambda
729
+ */
730
+ QUALIFIERS unsigned int curand_poisson(curandStateSobol64_t *state, double lambda)
731
+ {
732
+ return _curand_poisson(curand(state), lambda);
733
+ }
734
+
735
+ /**
736
+ * \brief Return a Poisson-distributed unsigned int from a scrambled Sobol64 generator.
737
+ *
738
+ * Return a single unsigned int from a Poisson
739
+ * distribution with lambda \p lambda from the scrambled Sobol64 generator in \p state,
740
+ * increment position of generator by one.
741
+ *
742
+ * \param state - Pointer to state to update
743
+ * \param lambda - Lambda of the Poisson distribution
744
+ *
745
+ * \return Poisson-distributed unsigned int with lambda \p lambda
746
+ */
747
+ QUALIFIERS unsigned int curand_poisson(curandStateScrambledSobol64_t *state, double lambda)
748
+ {
749
+ return _curand_poisson(curand(state), lambda);
750
+ }
751
+ #endif // !defined(CURAND_POISSON_H_)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_uniform.h ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_UNIFORM_H_)
52
+ #define CURAND_UNIFORM_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include "curand_mrg32k3a.h"
65
+ #include "curand_mtgp32_kernel.h"
66
+ #include "curand_philox4x32_x.h"
67
+
68
+
69
+ QUALIFIERS float _curand_uniform(unsigned int x)
70
+ {
71
+ return x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
72
+ }
73
+
74
+ QUALIFIERS float4 _curand_uniform4(uint4 x)
75
+ {
76
+ float4 y;
77
+ y.x = x.x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
78
+ y.y = x.y * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
79
+ y.z = x.z * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
80
+ y.w = x.w * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
81
+ return y;
82
+ }
83
+
84
+ QUALIFIERS float _curand_uniform(unsigned long long x)
85
+ {
86
+ unsigned int t;
87
+ t = (unsigned int)(x >> 32);
88
+ return t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
89
+ }
90
+
91
+ QUALIFIERS double _curand_uniform_double(unsigned int x)
92
+ {
93
+ return x * CURAND_2POW32_INV_DOUBLE + CURAND_2POW32_INV_DOUBLE;
94
+ }
95
+
96
+ QUALIFIERS double _curand_uniform_double(unsigned long long x)
97
+ {
98
+ return (x >> 11) * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
99
+ }
100
+
101
+ QUALIFIERS double _curand_uniform_double_hq(unsigned int x, unsigned int y)
102
+ {
103
+ unsigned long long z = (unsigned long long)x ^
104
+ ((unsigned long long)y << (53 - 32));
105
+ return z * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
106
+ }
107
+
108
+ QUALIFIERS float curand_uniform(curandStateTest_t *state)
109
+ {
110
+ return _curand_uniform(curand(state));
111
+ }
112
+
113
+ QUALIFIERS double curand_uniform_double(curandStateTest_t *state)
114
+ {
115
+ return _curand_uniform_double(curand(state));
116
+ }
117
+
118
+ /**
119
+ * \brief Return a uniformly distributed float from an XORWOW generator.
120
+ *
121
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
122
+ * from the XORWOW generator in \p state, increment position of generator.
123
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
124
+ * point outputs are never returned.
125
+ *
126
+ * The implementation may use any number of calls to \p curand() to
127
+ * get enough random bits to create the return value. The current
128
+ * implementation uses one call.
129
+ *
130
+ * \param state - Pointer to state to update
131
+ *
132
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
133
+ */
134
+ QUALIFIERS float curand_uniform(curandStateXORWOW_t *state)
135
+ {
136
+ return _curand_uniform(curand(state));
137
+ }
138
+
139
+ /**
140
+ * \brief Return a uniformly distributed double from an XORWOW generator.
141
+ *
142
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
143
+ * from the XORWOW generator in \p state, increment position of generator.
144
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
145
+ * point outputs are never returned.
146
+ *
147
+ * The implementation may use any number of calls to \p curand() to
148
+ * get enough random bits to create the return value. The current
149
+ * implementation uses exactly two calls.
150
+ *
151
+ * \param state - Pointer to state to update
152
+ *
153
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
154
+ */
155
+ QUALIFIERS double curand_uniform_double(curandStateXORWOW_t *state)
156
+ {
157
+ unsigned int x, y;
158
+ x = curand(state);
159
+ y = curand(state);
160
+ return _curand_uniform_double_hq(x, y);
161
+ }
162
+ /**
163
+ * \brief Return a uniformly distributed float from an MRG32k3a generator.
164
+ *
165
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
166
+ * from the MRG32k3a generator in \p state, increment position of generator.
167
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
168
+ * point outputs are never returned.
169
+ *
170
+ * The implementation returns up to 23 bits of mantissa, with the minimum
171
+ * return value \f$ 2^{-32} \f$
172
+ *
173
+ * \param state - Pointer to state to update
174
+ *
175
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
176
+ */
177
+ QUALIFIERS float curand_uniform(curandStateMRG32k3a_t *state)
178
+ {
179
+ return ((float)(curand_MRG32k3a(state)*MRG32K3A_NORM));
180
+ }
181
+
182
+ /**
183
+ * \brief Return a uniformly distributed double from an MRG32k3a generator.
184
+ *
185
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
186
+ * from the MRG32k3a generator in \p state, increment position of generator.
187
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
188
+ * point outputs are never returned.
189
+ *
190
+ * Note the implementation returns at most 32 random bits of mantissa as
191
+ * outlined in the seminal paper by L'Ecuyer.
192
+ *
193
+ * \param state - Pointer to state to update
194
+ *
195
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
196
+ */
197
+ QUALIFIERS double curand_uniform_double(curandStateMRG32k3a_t *state)
198
+ {
199
+ return curand_MRG32k3a(state)*MRG32K3A_NORM;
200
+ }
201
+
202
+
203
+
204
+ /**
205
+ * \brief Return a uniformly distributed tuple of 2 doubles from an Philox4_32_10 generator.
206
+ *
207
+ * Return a uniformly distributed 2 doubles (double4) between \p 0.0 and \p 1.0
208
+ * from the Philox4_32_10 generator in \p state, increment position of generator by 4.
209
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
210
+ * point outputs are never returned.
211
+ *
212
+ * \param state - Pointer to state to update
213
+ *
214
+ * \return 2 uniformly distributed doubles between \p 0.0 and \p 1.0
215
+ */
216
+
217
+ QUALIFIERS double2 curand_uniform2_double(curandStatePhilox4_32_10_t *state)
218
+ {
219
+ uint4 _x;
220
+ double2 result;
221
+ _x = curand4(state);
222
+ result.x = _curand_uniform_double_hq(_x.x,_x.y);
223
+ result.y = _curand_uniform_double_hq(_x.z,_x.w);
224
+ return result;
225
+ }
226
+
227
+
228
+ // not a part of API
229
+ QUALIFIERS double4 curand_uniform4_double(curandStatePhilox4_32_10_t *state)
230
+ {
231
+ uint4 _x, _y;
232
+ double4 result;
233
+ _x = curand4(state);
234
+ _y = curand4(state);
235
+ result.x = _curand_uniform_double_hq(_x.x,_x.y);
236
+ result.y = _curand_uniform_double_hq(_x.z,_x.w);
237
+ result.z = _curand_uniform_double_hq(_y.x,_y.y);
238
+ result.w = _curand_uniform_double_hq(_y.z,_y.w);
239
+ return result;
240
+ }
241
+
242
+ /**
243
+ * \brief Return a uniformly distributed float from a Philox4_32_10 generator.
244
+ *
245
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
246
+ * from the Philox4_32_10 generator in \p state, increment position of generator.
247
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
248
+ * point outputs are never returned.
249
+ *
250
+ * \param state - Pointer to state to update
251
+ *
252
+ * \return uniformly distributed float between \p 0.0 and \p 1.0
253
+ *
254
+ */
255
+ QUALIFIERS float curand_uniform(curandStatePhilox4_32_10_t *state)
256
+ {
257
+ return _curand_uniform(curand(state));
258
+ }
259
+
260
+ /**
261
+ * \brief Return a uniformly distributed tuple of 4 floats from a Philox4_32_10 generator.
262
+ *
263
+ * Return a uniformly distributed 4 floats between \p 0.0f and \p 1.0f
264
+ * from the Philox4_32_10 generator in \p state, increment position of generator by 4.
265
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
266
+ * point outputs are never returned.
267
+ *
268
+ * \param state - Pointer to state to update
269
+ *
270
+ * \return uniformly distributed float between \p 0.0 and \p 1.0
271
+ *
272
+ */
273
+ QUALIFIERS float4 curand_uniform4(curandStatePhilox4_32_10_t *state)
274
+ {
275
+ return _curand_uniform4(curand4(state));
276
+ }
277
+
278
+ /**
279
+ * \brief Return a uniformly distributed float from a MTGP32 generator.
280
+ *
281
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
282
+ * from the MTGP32 generator in \p state, increment position of generator.
283
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
284
+ * point outputs are never returned.
285
+ *
286
+ * \param state - Pointer to state to update
287
+ *
288
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
289
+ */
290
+ QUALIFIERS float curand_uniform(curandStateMtgp32_t *state)
291
+ {
292
+ return _curand_uniform(curand(state));
293
+ }
294
+ /**
295
+ * \brief Return a uniformly distributed double from a MTGP32 generator.
296
+ *
297
+ * Return a uniformly distributed double between \p 0.0f and \p 1.0f
298
+ * from the MTGP32 generator in \p state, increment position of generator.
299
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
300
+ * point outputs are never returned.
301
+ *
302
+ * Note that the implementation uses only 32 random bits to generate a single double
303
+ * precision value.
304
+ *
305
+ * \param state - Pointer to state to update
306
+ *
307
+ * \return uniformly distributed double between \p 0.0f and \p 1.0f
308
+ */
309
+ QUALIFIERS double curand_uniform_double(curandStateMtgp32_t *state)
310
+ {
311
+ return _curand_uniform_double(curand(state));
312
+ }
313
+
314
+ /**
315
+ * \brief Return a uniformly distributed double from a Philox4_32_10 generator.
316
+ *
317
+ * Return a uniformly distributed double between \p 0.0f and \p 1.0f
318
+ * from the Philox4_32_10 generator in \p state, increment position of generator.
319
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
320
+ * point outputs are never returned.
321
+ *
322
+ * Note that the implementation uses only 32 random bits to generate a single double
323
+ * precision value.
324
+ *
325
+ * \p curand_uniform2_double() is recommended for higher quality uniformly distributed
326
+ * double precision values.
327
+ *
328
+ * \param state - Pointer to state to update
329
+ *
330
+ * \return uniformly distributed double between \p 0.0f and \p 1.0f
331
+ */
332
+
333
+ QUALIFIERS double curand_uniform_double(curandStatePhilox4_32_10_t *state)
334
+ {
335
+ return _curand_uniform_double(curand(state));
336
+ }
337
+
338
+
339
+ /**
340
+ * \brief Return a uniformly distributed float from a Sobol32 generator.
341
+ *
342
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
343
+ * from the Sobol32 generator in \p state, increment position of generator.
344
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
345
+ * point outputs are never returned.
346
+ *
347
+ * The implementation is guaranteed to use a single call to \p curand().
348
+ *
349
+ * \param state - Pointer to state to update
350
+ *
351
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
352
+ */
353
+ QUALIFIERS float curand_uniform(curandStateSobol32_t *state)
354
+ {
355
+ return _curand_uniform(curand(state));
356
+ }
357
+
358
+ /**
359
+ * \brief Return a uniformly distributed double from a Sobol32 generator.
360
+ *
361
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
362
+ * from the Sobol32 generator in \p state, increment position of generator.
363
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
364
+ * point outputs are never returned.
365
+ *
366
+ * The implementation is guaranteed to use a single call to \p curand()
367
+ * to preserve the quasirandom properties of the sequence.
368
+ *
369
+ * Note that the implementation uses only 32 random bits to generate a single double
370
+ * precision value.
371
+ *
372
+ * \param state - Pointer to state to update
373
+ *
374
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
375
+ */
376
+ QUALIFIERS double curand_uniform_double(curandStateSobol32_t *state)
377
+ {
378
+ return _curand_uniform_double(curand(state));
379
+ }
380
+ /**
381
+ * \brief Return a uniformly distributed float from a scrambled Sobol32 generator.
382
+ *
383
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
384
+ * from the scrambled Sobol32 generator in \p state, increment position of generator.
385
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
386
+ * point outputs are never returned.
387
+ *
388
+ * The implementation is guaranteed to use a single call to \p curand().
389
+ *
390
+ * \param state - Pointer to state to update
391
+ *
392
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
393
+ */
394
+ QUALIFIERS float curand_uniform(curandStateScrambledSobol32_t *state)
395
+ {
396
+ return _curand_uniform(curand(state));
397
+ }
398
+
399
+ /**
400
+ * \brief Return a uniformly distributed double from a scrambled Sobol32 generator.
401
+ *
402
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
403
+ * from the scrambled Sobol32 generator in \p state, increment position of generator.
404
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
405
+ * point outputs are never returned.
406
+ *
407
+ * The implementation is guaranteed to use a single call to \p curand()
408
+ * to preserve the quasirandom properties of the sequence.
409
+ *
410
+ * Note that the implementation uses only 32 random bits to generate a single double
411
+ * precision value.
412
+ *
413
+ * \param state - Pointer to state to update
414
+ *
415
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
416
+ */
417
+ QUALIFIERS double curand_uniform_double(curandStateScrambledSobol32_t *state)
418
+ {
419
+ return _curand_uniform_double(curand(state));
420
+ }
421
+ /**
422
+ * \brief Return a uniformly distributed float from a Sobol64 generator.
423
+ *
424
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
425
+ * from the Sobol64 generator in \p state, increment position of generator.
426
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
427
+ * point outputs are never returned.
428
+ *
429
+ * The implementation is guaranteed to use a single call to \p curand().
430
+ *
431
+ * \param state - Pointer to state to update
432
+ *
433
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
434
+ */
435
+ QUALIFIERS float curand_uniform(curandStateSobol64_t *state)
436
+ {
437
+ return _curand_uniform(curand(state));
438
+ }
439
+
440
+ /**
441
+ * \brief Return a uniformly distributed double from a Sobol64 generator.
442
+ *
443
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
444
+ * from the Sobol64 generator in \p state, increment position of generator.
445
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
446
+ * point outputs are never returned.
447
+ *
448
+ * The implementation is guaranteed to use a single call to \p curand()
449
+ * to preserve the quasirandom properties of the sequence.
450
+ *
451
+ * \param state - Pointer to state to update
452
+ *
453
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
454
+ */
455
+ QUALIFIERS double curand_uniform_double(curandStateSobol64_t *state)
456
+ {
457
+ return _curand_uniform_double(curand(state));
458
+ }
459
+ /**
460
+ * \brief Return a uniformly distributed float from a scrambled Sobol64 generator.
461
+ *
462
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
463
+ * from the scrambled Sobol64 generator in \p state, increment position of generator.
464
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
465
+ * point outputs are never returned.
466
+ *
467
+ * The implementation is guaranteed to use a single call to \p curand().
468
+ *
469
+ * \param state - Pointer to state to update
470
+ *
471
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
472
+ */
473
+ QUALIFIERS float curand_uniform(curandStateScrambledSobol64_t *state)
474
+ {
475
+ return _curand_uniform(curand(state));
476
+ }
477
+
478
+ /**
479
+ * \brief Return a uniformly distributed double from a scrambled Sobol64 generator.
480
+ *
481
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
482
+ * from the scrambled Sobol64 generator in \p state, increment position of generator.
483
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
484
+ * point outputs are never returned.
485
+ *
486
+ * The implementation is guaranteed to use a single call to \p curand()
487
+ * to preserve the quasirandom properties of the sequence.
488
+ *
489
+ * \param state - Pointer to state to update
490
+ *
491
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
492
+ */
493
+ QUALIFIERS double curand_uniform_double(curandStateScrambledSobol64_t *state)
494
+ {
495
+ return _curand_uniform_double(curand(state));
496
+ }
497
+
498
+ #endif // !defined(CURAND_UNIFORM_H_)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverSp.h ADDED
@@ -0,0 +1,923 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(CUSOLVERSP_H_)
51
+ #define CUSOLVERSP_H_
52
+
53
+ #include "cusparse.h"
54
+ #include "cublas_v2.h"
55
+ #include "cusolver_common.h"
56
+
57
+ #if defined(__cplusplus)
58
+ extern "C" {
59
+ #endif /* __cplusplus */
60
+
61
+ struct cusolverSpContext;
62
+ typedef struct cusolverSpContext *cusolverSpHandle_t;
63
+
64
+ struct csrqrInfo;
65
+ typedef struct csrqrInfo *csrqrInfo_t;
66
+
67
+ cusolverStatus_t CUSOLVERAPI cusolverSpCreate(cusolverSpHandle_t *handle);
68
+ cusolverStatus_t CUSOLVERAPI cusolverSpDestroy(cusolverSpHandle_t handle);
69
+ cusolverStatus_t CUSOLVERAPI
70
+ cusolverSpSetStream(cusolverSpHandle_t handle, cudaStream_t streamId);
71
+ cusolverStatus_t CUSOLVERAPI
72
+ cusolverSpGetStream(cusolverSpHandle_t handle, cudaStream_t *streamId);
73
+
74
+ cusolverStatus_t CUSOLVERAPI cusolverSpXcsrissymHost(
75
+ cusolverSpHandle_t handle,
76
+ int m,
77
+ int nnzA,
78
+ const cusparseMatDescr_t descrA,
79
+ const int * csrRowPtrA,
80
+ const int * csrEndPtrA,
81
+ const int * csrColIndA,
82
+ int * issym);
83
+
84
+ /* -------- GPU linear solver by LU factorization
85
+ * solve A*x = b, A can be singular
86
+ * [ls] stands for linear solve
87
+ * [v] stands for vector
88
+ * [lu] stands for LU factorization
89
+ */
90
+ cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvluHost(
91
+ cusolverSpHandle_t handle,
92
+ int n,
93
+ int nnzA,
94
+ const cusparseMatDescr_t descrA,
95
+ const float * csrValA,
96
+ const int * csrRowPtrA,
97
+ const int * csrColIndA,
98
+ const float * b,
99
+ float tol,
100
+ int reorder,
101
+ float * x,
102
+ int * singularity);
103
+
104
+ cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvluHost(
105
+ cusolverSpHandle_t handle,
106
+ int n,
107
+ int nnzA,
108
+ const cusparseMatDescr_t descrA,
109
+ const double * csrValA,
110
+ const int * csrRowPtrA,
111
+ const int * csrColIndA,
112
+ const double * b,
113
+ double tol,
114
+ int reorder,
115
+ double * x,
116
+ int * singularity);
117
+
118
+ cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvluHost(
119
+ cusolverSpHandle_t handle,
120
+ int n,
121
+ int nnzA,
122
+ const cusparseMatDescr_t descrA,
123
+ const cuComplex * csrValA,
124
+ const int * csrRowPtrA,
125
+ const int * csrColIndA,
126
+ const cuComplex * b,
127
+ float tol,
128
+ int reorder,
129
+ cuComplex * x,
130
+ int * singularity);
131
+
132
+ cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvluHost(
133
+ cusolverSpHandle_t handle,
134
+ int n,
135
+ int nnzA,
136
+ const cusparseMatDescr_t descrA,
137
+ const cuDoubleComplex * csrValA,
138
+ const int * csrRowPtrA,
139
+ const int * csrColIndA,
140
+ const cuDoubleComplex * b,
141
+ double tol,
142
+ int reorder,
143
+ cuDoubleComplex * x,
144
+ int * singularity);
145
+
146
+ /* -------- GPU linear solver by QR factorization
147
+ * solve A*x = b, A can be singular
148
+ * [ls] stands for linear solve
149
+ * [v] stands for vector
150
+ * [qr] stands for QR factorization
151
+ */
152
+ cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvqr(
153
+ cusolverSpHandle_t handle,
154
+ int m,
155
+ int nnz,
156
+ const cusparseMatDescr_t descrA,
157
+ const float * csrVal,
158
+ const int * csrRowPtr,
159
+ const int * csrColInd,
160
+ const float * b,
161
+ float tol,
162
+ int reorder,
163
+ float * x,
164
+ int * singularity);
165
+
166
+ cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvqr(
167
+ cusolverSpHandle_t handle,
168
+ int m,
169
+ int nnz,
170
+ const cusparseMatDescr_t descrA,
171
+ const double * csrVal,
172
+ const int * csrRowPtr,
173
+ const int * csrColInd,
174
+ const double * b,
175
+ double tol,
176
+ int reorder,
177
+ double * x,
178
+ int * singularity);
179
+
180
+ cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvqr(
181
+ cusolverSpHandle_t handle,
182
+ int m,
183
+ int nnz,
184
+ const cusparseMatDescr_t descrA,
185
+ const cuComplex * csrVal,
186
+ const int * csrRowPtr,
187
+ const int * csrColInd,
188
+ const cuComplex * b,
189
+ float tol,
190
+ int reorder,
191
+ cuComplex * x,
192
+ int * singularity);
193
+
194
+ cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvqr(
195
+ cusolverSpHandle_t handle,
196
+ int m,
197
+ int nnz,
198
+ const cusparseMatDescr_t descrA,
199
+ const cuDoubleComplex * csrVal,
200
+ const int * csrRowPtr,
201
+ const int * csrColInd,
202
+ const cuDoubleComplex * b,
203
+ double tol,
204
+ int reorder,
205
+ cuDoubleComplex * x,
206
+ int * singularity);
207
+
208
+ /* -------- CPU linear solver by QR factorization
209
+ * solve A*x = b, A can be singular
210
+ * [ls] stands for linear solve
211
+ * [v] stands for vector
212
+ * [qr] stands for QR factorization
213
+ */
214
+ cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvqrHost(
215
+ cusolverSpHandle_t handle,
216
+ int m,
217
+ int nnz,
218
+ const cusparseMatDescr_t descrA,
219
+ const float * csrValA,
220
+ const int * csrRowPtrA,
221
+ const int * csrColIndA,
222
+ const float * b,
223
+ float tol,
224
+ int reorder,
225
+ float * x,
226
+ int * singularity);
227
+
228
+ cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvqrHost(
229
+ cusolverSpHandle_t handle,
230
+ int m,
231
+ int nnz,
232
+ const cusparseMatDescr_t descrA,
233
+ const double * csrValA,
234
+ const int * csrRowPtrA,
235
+ const int * csrColIndA,
236
+ const double * b,
237
+ double tol,
238
+ int reorder,
239
+ double * x,
240
+ int * singularity);
241
+
242
+ cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvqrHost(
243
+ cusolverSpHandle_t handle,
244
+ int m,
245
+ int nnz,
246
+ const cusparseMatDescr_t descrA,
247
+ const cuComplex * csrValA,
248
+ const int * csrRowPtrA,
249
+ const int * csrColIndA,
250
+ const cuComplex * b,
251
+ float tol,
252
+ int reorder,
253
+ cuComplex * x,
254
+ int * singularity);
255
+
256
+ cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvqrHost(
257
+ cusolverSpHandle_t handle,
258
+ int m,
259
+ int nnz,
260
+ const cusparseMatDescr_t descrA,
261
+ const cuDoubleComplex * csrValA,
262
+ const int * csrRowPtrA,
263
+ const int * csrColIndA,
264
+ const cuDoubleComplex * b,
265
+ double tol,
266
+ int reorder,
267
+ cuDoubleComplex * x,
268
+ int * singularity);
269
+
270
+ /* -------- CPU linear solver by Cholesky factorization
271
+ * solve A*x = b, A can be singular
272
+ * [ls] stands for linear solve
273
+ * [v] stands for vector
274
+ * [chol] stands for Cholesky factorization
275
+ *
276
+ * Only works for symmetric positive definite matrix.
277
+ * The upper part of A is ignored.
278
+ */
279
+ cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvcholHost(
280
+ cusolverSpHandle_t handle,
281
+ int m,
282
+ int nnz,
283
+ const cusparseMatDescr_t descrA,
284
+ const float * csrVal,
285
+ const int * csrRowPtr,
286
+ const int * csrColInd,
287
+ const float * b,
288
+ float tol,
289
+ int reorder,
290
+ float * x,
291
+ int * singularity);
292
+
293
+ cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvcholHost(
294
+ cusolverSpHandle_t handle,
295
+ int m,
296
+ int nnz,
297
+ const cusparseMatDescr_t descrA,
298
+ const double * csrVal,
299
+ const int * csrRowPtr,
300
+ const int * csrColInd,
301
+ const double * b,
302
+ double tol,
303
+ int reorder,
304
+ double * x,
305
+ int * singularity);
306
+
307
+ cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvcholHost(
308
+ cusolverSpHandle_t handle,
309
+ int m,
310
+ int nnz,
311
+ const cusparseMatDescr_t descrA,
312
+ const cuComplex * csrVal,
313
+ const int * csrRowPtr,
314
+ const int * csrColInd,
315
+ const cuComplex * b,
316
+ float tol,
317
+ int reorder,
318
+ cuComplex * x,
319
+ int * singularity);
320
+
321
+ cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvcholHost(
322
+ cusolverSpHandle_t handle,
323
+ int m,
324
+ int nnz,
325
+ const cusparseMatDescr_t descrA,
326
+ const cuDoubleComplex * csrVal,
327
+ const int * csrRowPtr,
328
+ const int * csrColInd,
329
+ const cuDoubleComplex * b,
330
+ double tol,
331
+ int reorder,
332
+ cuDoubleComplex * x,
333
+ int * singularity);
334
+
335
+ /* -------- GPU linear solver by Cholesky factorization
336
+ * solve A*x = b, A can be singular
337
+ * [ls] stands for linear solve
338
+ * [v] stands for vector
339
+ * [chol] stands for Cholesky factorization
340
+ *
341
+ * Only works for symmetric positive definite matrix.
342
+ * The upper part of A is ignored.
343
+ */
344
+ cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvchol(
345
+ cusolverSpHandle_t handle,
346
+ int m,
347
+ int nnz,
348
+ const cusparseMatDescr_t descrA,
349
+ const float * csrVal,
350
+ const int * csrRowPtr,
351
+ const int * csrColInd,
352
+ const float * b,
353
+ float tol,
354
+ int reorder,
355
+ // output
356
+ float *x,
357
+ int * singularity);
358
+
359
+ cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvchol(
360
+ cusolverSpHandle_t handle,
361
+ int m,
362
+ int nnz,
363
+ const cusparseMatDescr_t descrA,
364
+ const double * csrVal,
365
+ const int * csrRowPtr,
366
+ const int * csrColInd,
367
+ const double * b,
368
+ double tol,
369
+ int reorder,
370
+ // output
371
+ double *x,
372
+ int * singularity);
373
+
374
+ cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvchol(
375
+ cusolverSpHandle_t handle,
376
+ int m,
377
+ int nnz,
378
+ const cusparseMatDescr_t descrA,
379
+ const cuComplex * csrVal,
380
+ const int * csrRowPtr,
381
+ const int * csrColInd,
382
+ const cuComplex * b,
383
+ float tol,
384
+ int reorder,
385
+ // output
386
+ cuComplex *x,
387
+ int * singularity);
388
+
389
+ cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvchol(
390
+ cusolverSpHandle_t handle,
391
+ int m,
392
+ int nnz,
393
+ const cusparseMatDescr_t descrA,
394
+ const cuDoubleComplex * csrVal,
395
+ const int * csrRowPtr,
396
+ const int * csrColInd,
397
+ const cuDoubleComplex * b,
398
+ double tol,
399
+ int reorder,
400
+ // output
401
+ cuDoubleComplex *x,
402
+ int * singularity);
403
+
404
+ /* ----------- CPU least square solver by QR factorization
405
+ * solve min|b - A*x|
406
+ * [lsq] stands for least square
407
+ * [v] stands for vector
408
+ * [qr] stands for QR factorization
409
+ */
410
+ cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsqvqrHost(
411
+ cusolverSpHandle_t handle,
412
+ int m,
413
+ int n,
414
+ int nnz,
415
+ const cusparseMatDescr_t descrA,
416
+ const float * csrValA,
417
+ const int * csrRowPtrA,
418
+ const int * csrColIndA,
419
+ const float * b,
420
+ float tol,
421
+ int * rankA,
422
+ float * x,
423
+ int * p,
424
+ float * min_norm);
425
+
426
+ cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsqvqrHost(
427
+ cusolverSpHandle_t handle,
428
+ int m,
429
+ int n,
430
+ int nnz,
431
+ const cusparseMatDescr_t descrA,
432
+ const double * csrValA,
433
+ const int * csrRowPtrA,
434
+ const int * csrColIndA,
435
+ const double * b,
436
+ double tol,
437
+ int * rankA,
438
+ double * x,
439
+ int * p,
440
+ double * min_norm);
441
+
442
+ cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsqvqrHost(
443
+ cusolverSpHandle_t handle,
444
+ int m,
445
+ int n,
446
+ int nnz,
447
+ const cusparseMatDescr_t descrA,
448
+ const cuComplex * csrValA,
449
+ const int * csrRowPtrA,
450
+ const int * csrColIndA,
451
+ const cuComplex * b,
452
+ float tol,
453
+ int * rankA,
454
+ cuComplex * x,
455
+ int * p,
456
+ float * min_norm);
457
+
458
+ cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsqvqrHost(
459
+ cusolverSpHandle_t handle,
460
+ int m,
461
+ int n,
462
+ int nnz,
463
+ const cusparseMatDescr_t descrA,
464
+ const cuDoubleComplex * csrValA,
465
+ const int * csrRowPtrA,
466
+ const int * csrColIndA,
467
+ const cuDoubleComplex * b,
468
+ double tol,
469
+ int * rankA,
470
+ cuDoubleComplex * x,
471
+ int * p,
472
+ double * min_norm);
473
+
474
+ /* --------- CPU eigenvalue solver by shift inverse
475
+ * solve A*x = lambda * x
476
+ * where lambda is the eigenvalue nearest mu0.
477
+ * [eig] stands for eigenvalue solver
478
+ * [si] stands for shift-inverse
479
+ */
480
+ cusolverStatus_t CUSOLVERAPI cusolverSpScsreigvsiHost(
481
+ cusolverSpHandle_t handle,
482
+ int m,
483
+ int nnz,
484
+ const cusparseMatDescr_t descrA,
485
+ const float * csrValA,
486
+ const int * csrRowPtrA,
487
+ const int * csrColIndA,
488
+ float mu0,
489
+ const float * x0,
490
+ int maxite,
491
+ float tol,
492
+ float * mu,
493
+ float * x);
494
+
495
+ cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigvsiHost(
496
+ cusolverSpHandle_t handle,
497
+ int m,
498
+ int nnz,
499
+ const cusparseMatDescr_t descrA,
500
+ const double * csrValA,
501
+ const int * csrRowPtrA,
502
+ const int * csrColIndA,
503
+ double mu0,
504
+ const double * x0,
505
+ int maxite,
506
+ double tol,
507
+ double * mu,
508
+ double * x);
509
+
510
+ cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigvsiHost(
511
+ cusolverSpHandle_t handle,
512
+ int m,
513
+ int nnz,
514
+ const cusparseMatDescr_t descrA,
515
+ const cuComplex * csrValA,
516
+ const int * csrRowPtrA,
517
+ const int * csrColIndA,
518
+ cuComplex mu0,
519
+ const cuComplex * x0,
520
+ int maxite,
521
+ float tol,
522
+ cuComplex * mu,
523
+ cuComplex * x);
524
+
525
+ cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigvsiHost(
526
+ cusolverSpHandle_t handle,
527
+ int m,
528
+ int nnz,
529
+ const cusparseMatDescr_t descrA,
530
+ const cuDoubleComplex * csrValA,
531
+ const int * csrRowPtrA,
532
+ const int * csrColIndA,
533
+ cuDoubleComplex mu0,
534
+ const cuDoubleComplex * x0,
535
+ int maxite,
536
+ double tol,
537
+ cuDoubleComplex * mu,
538
+ cuDoubleComplex * x);
539
+
540
+ /* --------- GPU eigenvalue solver by shift inverse
541
+ * solve A*x = lambda * x
542
+ * where lambda is the eigenvalue nearest mu0.
543
+ * [eig] stands for eigenvalue solver
544
+ * [si] stands for shift-inverse
545
+ */
546
+ cusolverStatus_t CUSOLVERAPI cusolverSpScsreigvsi(
547
+ cusolverSpHandle_t handle,
548
+ int m,
549
+ int nnz,
550
+ const cusparseMatDescr_t descrA,
551
+ const float * csrValA,
552
+ const int * csrRowPtrA,
553
+ const int * csrColIndA,
554
+ float mu0,
555
+ const float * x0,
556
+ int maxite,
557
+ float eps,
558
+ float * mu,
559
+ float * x);
560
+
561
+ cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigvsi(
562
+ cusolverSpHandle_t handle,
563
+ int m,
564
+ int nnz,
565
+ const cusparseMatDescr_t descrA,
566
+ const double * csrValA,
567
+ const int * csrRowPtrA,
568
+ const int * csrColIndA,
569
+ double mu0,
570
+ const double * x0,
571
+ int maxite,
572
+ double eps,
573
+ double * mu,
574
+ double * x);
575
+
576
+ cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigvsi(
577
+ cusolverSpHandle_t handle,
578
+ int m,
579
+ int nnz,
580
+ const cusparseMatDescr_t descrA,
581
+ const cuComplex * csrValA,
582
+ const int * csrRowPtrA,
583
+ const int * csrColIndA,
584
+ cuComplex mu0,
585
+ const cuComplex * x0,
586
+ int maxite,
587
+ float eps,
588
+ cuComplex * mu,
589
+ cuComplex * x);
590
+
591
+ cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigvsi(
592
+ cusolverSpHandle_t handle,
593
+ int m,
594
+ int nnz,
595
+ const cusparseMatDescr_t descrA,
596
+ const cuDoubleComplex * csrValA,
597
+ const int * csrRowPtrA,
598
+ const int * csrColIndA,
599
+ cuDoubleComplex mu0,
600
+ const cuDoubleComplex * x0,
601
+ int maxite,
602
+ double eps,
603
+ cuDoubleComplex * mu,
604
+ cuDoubleComplex * x);
605
+
606
+ // ----------- enclosed eigenvalues
607
+
608
+ cusolverStatus_t CUSOLVERAPI cusolverSpScsreigsHost(
609
+ cusolverSpHandle_t handle,
610
+ int m,
611
+ int nnz,
612
+ const cusparseMatDescr_t descrA,
613
+ const float * csrValA,
614
+ const int * csrRowPtrA,
615
+ const int * csrColIndA,
616
+ cuComplex left_bottom_corner,
617
+ cuComplex right_upper_corner,
618
+ int * num_eigs);
619
+
620
+ cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigsHost(
621
+ cusolverSpHandle_t handle,
622
+ int m,
623
+ int nnz,
624
+ const cusparseMatDescr_t descrA,
625
+ const double * csrValA,
626
+ const int * csrRowPtrA,
627
+ const int * csrColIndA,
628
+ cuDoubleComplex left_bottom_corner,
629
+ cuDoubleComplex right_upper_corner,
630
+ int * num_eigs);
631
+
632
+ cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigsHost(
633
+ cusolverSpHandle_t handle,
634
+ int m,
635
+ int nnz,
636
+ const cusparseMatDescr_t descrA,
637
+ const cuComplex * csrValA,
638
+ const int * csrRowPtrA,
639
+ const int * csrColIndA,
640
+ cuComplex left_bottom_corner,
641
+ cuComplex right_upper_corner,
642
+ int * num_eigs);
643
+
644
+ cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigsHost(
645
+ cusolverSpHandle_t handle,
646
+ int m,
647
+ int nnz,
648
+ const cusparseMatDescr_t descrA,
649
+ const cuDoubleComplex * csrValA,
650
+ const int * csrRowPtrA,
651
+ const int * csrColIndA,
652
+ cuDoubleComplex left_bottom_corner,
653
+ cuDoubleComplex right_upper_corner,
654
+ int * num_eigs);
655
+
656
+ /* --------- CPU symrcm
657
+ * Symmetric reverse Cuthill McKee permutation
658
+ *
659
+ */
660
+ cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymrcmHost(
661
+ cusolverSpHandle_t handle,
662
+ int n,
663
+ int nnzA,
664
+ const cusparseMatDescr_t descrA,
665
+ const int * csrRowPtrA,
666
+ const int * csrColIndA,
667
+ int * p);
668
+
669
+ /* --------- CPU symmdq
670
+ * Symmetric minimum degree algorithm by quotient graph
671
+ *
672
+ */
673
+ cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymmdqHost(
674
+ cusolverSpHandle_t handle,
675
+ int n,
676
+ int nnzA,
677
+ const cusparseMatDescr_t descrA,
678
+ const int * csrRowPtrA,
679
+ const int * csrColIndA,
680
+ int * p);
681
+
682
+ /* --------- CPU symmdq
683
+ * Symmetric Approximate minimum degree algorithm by quotient graph
684
+ *
685
+ */
686
+ cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymamdHost(
687
+ cusolverSpHandle_t handle,
688
+ int n,
689
+ int nnzA,
690
+ const cusparseMatDescr_t descrA,
691
+ const int * csrRowPtrA,
692
+ const int * csrColIndA,
693
+ int * p);
694
+
695
+ /* --------- CPU metis
696
+ * symmetric reordering
697
+ */
698
+ cusolverStatus_t CUSOLVERAPI cusolverSpXcsrmetisndHost(
699
+ cusolverSpHandle_t handle,
700
+ int n,
701
+ int nnzA,
702
+ const cusparseMatDescr_t descrA,
703
+ const int * csrRowPtrA,
704
+ const int * csrColIndA,
705
+ const int64_t * options,
706
+ int * p);
707
+
708
+ /* --------- CPU zfd
709
+ * Zero free diagonal reordering
710
+ */
711
+ cusolverStatus_t CUSOLVERAPI cusolverSpScsrzfdHost(
712
+ cusolverSpHandle_t handle,
713
+ int n,
714
+ int nnz,
715
+ const cusparseMatDescr_t descrA,
716
+ const float * csrValA,
717
+ const int * csrRowPtrA,
718
+ const int * csrColIndA,
719
+ int * P,
720
+ int * numnz);
721
+
722
+ cusolverStatus_t CUSOLVERAPI cusolverSpDcsrzfdHost(
723
+ cusolverSpHandle_t handle,
724
+ int n,
725
+ int nnz,
726
+ const cusparseMatDescr_t descrA,
727
+ const double * csrValA,
728
+ const int * csrRowPtrA,
729
+ const int * csrColIndA,
730
+ int * P,
731
+ int * numnz);
732
+
733
+ cusolverStatus_t CUSOLVERAPI cusolverSpCcsrzfdHost(
734
+ cusolverSpHandle_t handle,
735
+ int n,
736
+ int nnz,
737
+ const cusparseMatDescr_t descrA,
738
+ const cuComplex * csrValA,
739
+ const int * csrRowPtrA,
740
+ const int * csrColIndA,
741
+ int * P,
742
+ int * numnz);
743
+
744
+ cusolverStatus_t CUSOLVERAPI cusolverSpZcsrzfdHost(
745
+ cusolverSpHandle_t handle,
746
+ int n,
747
+ int nnz,
748
+ const cusparseMatDescr_t descrA,
749
+ const cuDoubleComplex * csrValA,
750
+ const int * csrRowPtrA,
751
+ const int * csrColIndA,
752
+ int * P,
753
+ int * numnz);
754
+
755
+ /* --------- CPU permuation
756
+ * P*A*Q^T
757
+ *
758
+ */
759
+ cusolverStatus_t CUSOLVERAPI cusolverSpXcsrperm_bufferSizeHost(
760
+ cusolverSpHandle_t handle,
761
+ int m,
762
+ int n,
763
+ int nnzA,
764
+ const cusparseMatDescr_t descrA,
765
+ const int * csrRowPtrA,
766
+ const int * csrColIndA,
767
+ const int * p,
768
+ const int * q,
769
+ size_t * bufferSizeInBytes);
770
+
771
+ cusolverStatus_t CUSOLVERAPI cusolverSpXcsrpermHost(
772
+ cusolverSpHandle_t handle,
773
+ int m,
774
+ int n,
775
+ int nnzA,
776
+ const cusparseMatDescr_t descrA,
777
+ int * csrRowPtrA,
778
+ int * csrColIndA,
779
+ const int * p,
780
+ const int * q,
781
+ int * map,
782
+ void * pBuffer);
783
+
784
+ /*
785
+ * Low-level API: Batched QR
786
+ *
787
+ */
788
+
789
+ cusolverStatus_t CUSOLVERAPI cusolverSpCreateCsrqrInfo(csrqrInfo_t *info);
790
+
791
+ cusolverStatus_t CUSOLVERAPI cusolverSpDestroyCsrqrInfo(csrqrInfo_t info);
792
+
793
+ cusolverStatus_t CUSOLVERAPI cusolverSpXcsrqrAnalysisBatched(
794
+ cusolverSpHandle_t handle,
795
+ int m,
796
+ int n,
797
+ int nnzA,
798
+ const cusparseMatDescr_t descrA,
799
+ const int * csrRowPtrA,
800
+ const int * csrColIndA,
801
+ csrqrInfo_t info);
802
+
803
+ cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrBufferInfoBatched(
804
+ cusolverSpHandle_t handle,
805
+ int m,
806
+ int n,
807
+ int nnz,
808
+ const cusparseMatDescr_t descrA,
809
+ const float * csrVal,
810
+ const int * csrRowPtr,
811
+ const int * csrColInd,
812
+ int batchSize,
813
+ csrqrInfo_t info,
814
+ size_t * internalDataInBytes,
815
+ size_t * workspaceInBytes);
816
+
817
+ cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrBufferInfoBatched(
818
+ cusolverSpHandle_t handle,
819
+ int m,
820
+ int n,
821
+ int nnz,
822
+ const cusparseMatDescr_t descrA,
823
+ const double * csrVal,
824
+ const int * csrRowPtr,
825
+ const int * csrColInd,
826
+ int batchSize,
827
+ csrqrInfo_t info,
828
+ size_t * internalDataInBytes,
829
+ size_t * workspaceInBytes);
830
+
831
+ cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrBufferInfoBatched(
832
+ cusolverSpHandle_t handle,
833
+ int m,
834
+ int n,
835
+ int nnz,
836
+ const cusparseMatDescr_t descrA,
837
+ const cuComplex * csrVal,
838
+ const int * csrRowPtr,
839
+ const int * csrColInd,
840
+ int batchSize,
841
+ csrqrInfo_t info,
842
+ size_t * internalDataInBytes,
843
+ size_t * workspaceInBytes);
844
+
845
+ cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrBufferInfoBatched(
846
+ cusolverSpHandle_t handle,
847
+ int m,
848
+ int n,
849
+ int nnz,
850
+ const cusparseMatDescr_t descrA,
851
+ const cuDoubleComplex * csrVal,
852
+ const int * csrRowPtr,
853
+ const int * csrColInd,
854
+ int batchSize,
855
+ csrqrInfo_t info,
856
+ size_t * internalDataInBytes,
857
+ size_t * workspaceInBytes);
858
+
859
+ cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrsvBatched(
860
+ cusolverSpHandle_t handle,
861
+ int m,
862
+ int n,
863
+ int nnz,
864
+ const cusparseMatDescr_t descrA,
865
+ const float * csrValA,
866
+ const int * csrRowPtrA,
867
+ const int * csrColIndA,
868
+ const float * b,
869
+ float * x,
870
+ int batchSize,
871
+ csrqrInfo_t info,
872
+ void * pBuffer);
873
+
874
+ cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrsvBatched(
875
+ cusolverSpHandle_t handle,
876
+ int m,
877
+ int n,
878
+ int nnz,
879
+ const cusparseMatDescr_t descrA,
880
+ const double * csrValA,
881
+ const int * csrRowPtrA,
882
+ const int * csrColIndA,
883
+ const double * b,
884
+ double * x,
885
+ int batchSize,
886
+ csrqrInfo_t info,
887
+ void * pBuffer);
888
+
889
+ cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrsvBatched(
890
+ cusolverSpHandle_t handle,
891
+ int m,
892
+ int n,
893
+ int nnz,
894
+ const cusparseMatDescr_t descrA,
895
+ const cuComplex * csrValA,
896
+ const int * csrRowPtrA,
897
+ const int * csrColIndA,
898
+ const cuComplex * b,
899
+ cuComplex * x,
900
+ int batchSize,
901
+ csrqrInfo_t info,
902
+ void * pBuffer);
903
+
904
+ cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrsvBatched(
905
+ cusolverSpHandle_t handle,
906
+ int m,
907
+ int n,
908
+ int nnz,
909
+ const cusparseMatDescr_t descrA,
910
+ const cuDoubleComplex * csrValA,
911
+ const int * csrRowPtrA,
912
+ const int * csrColIndA,
913
+ const cuDoubleComplex * b,
914
+ cuDoubleComplex * x,
915
+ int batchSize,
916
+ csrqrInfo_t info,
917
+ void * pBuffer);
918
+
919
+ #if defined(__cplusplus)
920
+ }
921
+ #endif /* __cplusplus */
922
+
923
+ #endif // define CUSOLVERSP_H_
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/locators.cpython-311.pyc ADDED
Binary file (65.7 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/markers.cpython-311.pyc ADDED
Binary file (8.54 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/metadata.cpython-311.pyc ADDED
Binary file (47.3 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/resources.cpython-311.pyc ADDED
Binary file (19 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/scripts.cpython-311.pyc ADDED
Binary file (21.5 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/util.cpython-311.pyc ADDED
Binary file (98.2 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/version.cpython-311.pyc ADDED
Binary file (34.8 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/__pycache__/wheel.cpython-311.pyc ADDED
Binary file (60.8 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distro/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.22 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distro/py.typed ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_emoji_codes.py ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_wrap.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Iterable
5
+
6
+ from ._loop import loop_last
7
+ from .cells import cell_len, chop_cells
8
+
9
+ re_word = re.compile(r"\s*\S+\s*")
10
+
11
+
12
+ def words(text: str) -> Iterable[tuple[int, int, str]]:
13
+ """Yields each word from the text as a tuple
14
+ containing (start_index, end_index, word). A "word" in this context may
15
+ include the actual word and any whitespace to the right.
16
+ """
17
+ position = 0
18
+ word_match = re_word.match(text, position)
19
+ while word_match is not None:
20
+ start, end = word_match.span()
21
+ word = word_match.group(0)
22
+ yield start, end, word
23
+ word_match = re_word.match(text, end)
24
+
25
+
26
+ def divide_line(text: str, width: int, fold: bool = True) -> list[int]:
27
+ """Given a string of text, and a width (measured in cells), return a list
28
+ of cell offsets which the string should be split at in order for it to fit
29
+ within the given width.
30
+
31
+ Args:
32
+ text: The text to examine.
33
+ width: The available cell width.
34
+ fold: If True, words longer than `width` will be folded onto a new line.
35
+
36
+ Returns:
37
+ A list of indices to break the line at.
38
+ """
39
+ break_positions: list[int] = [] # offsets to insert the breaks at
40
+ append = break_positions.append
41
+ cell_offset = 0
42
+ _cell_len = cell_len
43
+
44
+ for start, _end, word in words(text):
45
+ word_length = _cell_len(word.rstrip())
46
+ remaining_space = width - cell_offset
47
+ word_fits_remaining_space = remaining_space >= word_length
48
+
49
+ if word_fits_remaining_space:
50
+ # Simplest case - the word fits within the remaining width for this line.
51
+ cell_offset += _cell_len(word)
52
+ else:
53
+ # Not enough space remaining for this word on the current line.
54
+ if word_length > width:
55
+ # The word doesn't fit on any line, so we can't simply
56
+ # place it on the next line...
57
+ if fold:
58
+ # Fold the word across multiple lines.
59
+ folded_word = chop_cells(word, width=width)
60
+ for last, line in loop_last(folded_word):
61
+ if start:
62
+ append(start)
63
+ if last:
64
+ cell_offset = _cell_len(line)
65
+ else:
66
+ start += len(line)
67
+ else:
68
+ # Folding isn't allowed, so crop the word.
69
+ if start:
70
+ append(start)
71
+ cell_offset = _cell_len(word)
72
+ elif cell_offset and start:
73
+ # The word doesn't fit within the remaining space on the current
74
+ # line, but it *can* fit on to the next (empty) line.
75
+ append(start)
76
+ cell_offset = _cell_len(word)
77
+
78
+ return break_positions
79
+
80
+
81
+ if __name__ == "__main__": # pragma: no cover
82
+ from .console import Console
83
+
84
+ console = Console(width=10)
85
+ console.print("12345 abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPQRSTUVWXYZ 12345")
86
+ print(chop_cells("abcdefghijklmnopqrstuvwxyz", 10))
87
+
88
+ console = Console(width=20)
89
+ console.rule()
90
+ console.print("TextualはPythonの高速アプリケーション開発フレームワークです")
91
+
92
+ console.rule()
93
+ console.print("アプリケーションは1670万色を使用でき")
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/constrain.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, TYPE_CHECKING
2
+
3
+ from .jupyter import JupyterMixin
4
+ from .measure import Measurement
5
+
6
+ if TYPE_CHECKING:
7
+ from .console import Console, ConsoleOptions, RenderableType, RenderResult
8
+
9
+
10
+ class Constrain(JupyterMixin):
11
+ """Constrain the width of a renderable to a given number of characters.
12
+
13
+ Args:
14
+ renderable (RenderableType): A renderable object.
15
+ width (int, optional): The maximum width (in characters) to render. Defaults to 80.
16
+ """
17
+
18
+ def __init__(self, renderable: "RenderableType", width: Optional[int] = 80) -> None:
19
+ self.renderable = renderable
20
+ self.width = width
21
+
22
+ def __rich_console__(
23
+ self, console: "Console", options: "ConsoleOptions"
24
+ ) -> "RenderResult":
25
+ if self.width is None:
26
+ yield self.renderable
27
+ else:
28
+ child_options = options.update_width(min(self.width, options.max_width))
29
+ yield from console.render(self.renderable, child_options)
30
+
31
+ def __rich_measure__(
32
+ self, console: "Console", options: "ConsoleOptions"
33
+ ) -> "Measurement":
34
+ if self.width is not None:
35
+ options = options.update_width(self.width)
36
+ measurement = Measurement.get(console, options, self.renderable)
37
+ return measurement
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/file_proxy.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ from typing import IO, TYPE_CHECKING, Any, List
3
+
4
+ from .ansi import AnsiDecoder
5
+ from .text import Text
6
+
7
+ if TYPE_CHECKING:
8
+ from .console import Console
9
+
10
+
11
+ class FileProxy(io.TextIOBase):
12
+ """Wraps a file (e.g. sys.stdout) and redirects writes to a console."""
13
+
14
+ def __init__(self, console: "Console", file: IO[str]) -> None:
15
+ self.__console = console
16
+ self.__file = file
17
+ self.__buffer: List[str] = []
18
+ self.__ansi_decoder = AnsiDecoder()
19
+
20
+ @property
21
+ def rich_proxied_file(self) -> IO[str]:
22
+ """Get proxied file."""
23
+ return self.__file
24
+
25
+ def __getattr__(self, name: str) -> Any:
26
+ return getattr(self.__file, name)
27
+
28
+ def write(self, text: str) -> int:
29
+ if not isinstance(text, str):
30
+ raise TypeError(f"write() argument must be str, not {type(text).__name__}")
31
+ buffer = self.__buffer
32
+ lines: List[str] = []
33
+ while text:
34
+ line, new_line, text = text.partition("\n")
35
+ if new_line:
36
+ lines.append("".join(buffer) + line)
37
+ buffer.clear()
38
+ else:
39
+ buffer.append(line)
40
+ break
41
+ if lines:
42
+ console = self.__console
43
+ with console:
44
+ output = Text("\n").join(
45
+ self.__ansi_decoder.decode_line(line) for line in lines
46
+ )
47
+ console.print(output)
48
+ return len(text)
49
+
50
+ def flush(self) -> None:
51
+ output = "".join(self.__buffer)
52
+ if output:
53
+ self.__console.print(output)
54
+ del self.__buffer[:]
55
+
56
+ def fileno(self) -> int:
57
+ return self.__file.fileno()
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/highlighter.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from abc import ABC, abstractmethod
3
+ from typing import List, Union
4
+
5
+ from .text import Span, Text
6
+
7
+
8
+ def _combine_regex(*regexes: str) -> str:
9
+ """Combine a number of regexes in to a single regex.
10
+
11
+ Returns:
12
+ str: New regex with all regexes ORed together.
13
+ """
14
+ return "|".join(regexes)
15
+
16
+
17
+ class Highlighter(ABC):
18
+ """Abstract base class for highlighters."""
19
+
20
+ def __call__(self, text: Union[str, Text]) -> Text:
21
+ """Highlight a str or Text instance.
22
+
23
+ Args:
24
+ text (Union[str, ~Text]): Text to highlight.
25
+
26
+ Raises:
27
+ TypeError: If not called with text or str.
28
+
29
+ Returns:
30
+ Text: A test instance with highlighting applied.
31
+ """
32
+ if isinstance(text, str):
33
+ highlight_text = Text(text)
34
+ elif isinstance(text, Text):
35
+ highlight_text = text.copy()
36
+ else:
37
+ raise TypeError(f"str or Text instance required, not {text!r}")
38
+ self.highlight(highlight_text)
39
+ return highlight_text
40
+
41
+ @abstractmethod
42
+ def highlight(self, text: Text) -> None:
43
+ """Apply highlighting in place to text.
44
+
45
+ Args:
46
+ text (~Text): A text object highlight.
47
+ """
48
+
49
+
50
+ class NullHighlighter(Highlighter):
51
+ """A highlighter object that doesn't highlight.
52
+
53
+ May be used to disable highlighting entirely.
54
+
55
+ """
56
+
57
+ def highlight(self, text: Text) -> None:
58
+ """Nothing to do"""
59
+
60
+
61
+ class RegexHighlighter(Highlighter):
62
+ """Applies highlighting from a list of regular expressions."""
63
+
64
+ highlights: List[str] = []
65
+ base_style: str = ""
66
+
67
+ def highlight(self, text: Text) -> None:
68
+ """Highlight :class:`rich.text.Text` using regular expressions.
69
+
70
+ Args:
71
+ text (~Text): Text to highlighted.
72
+
73
+ """
74
+
75
+ highlight_regex = text.highlight_regex
76
+ for re_highlight in self.highlights:
77
+ highlight_regex(re_highlight, style_prefix=self.base_style)
78
+
79
+
80
+ class ReprHighlighter(RegexHighlighter):
81
+ """Highlights the text typically produced from ``__repr__`` methods."""
82
+
83
+ base_style = "repr."
84
+ highlights = [
85
+ r"(?P<tag_start><)(?P<tag_name>[-\w.:|]*)(?P<tag_contents>[\w\W]*)(?P<tag_end>>)",
86
+ r'(?P<attrib_name>[\w_]{1,50})=(?P<attrib_value>"?[\w_]+"?)?',
87
+ r"(?P<brace>[][{}()])",
88
+ _combine_regex(
89
+ r"(?P<ipv4>[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})",
90
+ r"(?P<ipv6>([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})",
91
+ r"(?P<eui64>(?:[0-9A-Fa-f]{1,2}-){7}[0-9A-Fa-f]{1,2}|(?:[0-9A-Fa-f]{1,2}:){7}[0-9A-Fa-f]{1,2}|(?:[0-9A-Fa-f]{4}\.){3}[0-9A-Fa-f]{4})",
92
+ r"(?P<eui48>(?:[0-9A-Fa-f]{1,2}-){5}[0-9A-Fa-f]{1,2}|(?:[0-9A-Fa-f]{1,2}:){5}[0-9A-Fa-f]{1,2}|(?:[0-9A-Fa-f]{4}\.){2}[0-9A-Fa-f]{4})",
93
+ r"(?P<uuid>[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})",
94
+ r"(?P<call>[\w.]*?)\(",
95
+ r"\b(?P<bool_true>True)\b|\b(?P<bool_false>False)\b|\b(?P<none>None)\b",
96
+ r"(?P<ellipsis>\.\.\.)",
97
+ r"(?P<number_complex>(?<!\w)(?:\-?[0-9]+\.?[0-9]*(?:e[-+]?\d+?)?)(?:[-+](?:[0-9]+\.?[0-9]*(?:e[-+]?\d+)?))?j)",
98
+ r"(?P<number>(?<!\w)\-?[0-9]+\.?[0-9]*(e[-+]?\d+?)?\b|0x[0-9a-fA-F]*)",
99
+ r"(?P<path>\B(/[-\w._+]+)*\/)(?P<filename>[-\w._+]*)?",
100
+ r"(?<![\\\w])(?P<str>b?'''.*?(?<!\\)'''|b?'.*?(?<!\\)'|b?\"\"\".*?(?<!\\)\"\"\"|b?\".*?(?<!\\)\")",
101
+ r"(?P<url>(file|https|http|ws|wss)://[-0-9a-zA-Z$_+!`(),.?/;:&=%#~]*)",
102
+ ),
103
+ ]
104
+
105
+
106
+ class JSONHighlighter(RegexHighlighter):
107
+ """Highlights JSON"""
108
+
109
+ # Captures the start and end of JSON strings, handling escaped quotes
110
+ JSON_STR = r"(?<![\\\w])(?P<str>b?\".*?(?<!\\)\")"
111
+ JSON_WHITESPACE = {" ", "\n", "\r", "\t"}
112
+
113
+ base_style = "json."
114
+ highlights = [
115
+ _combine_regex(
116
+ r"(?P<brace>[\{\[\(\)\]\}])",
117
+ r"\b(?P<bool_true>true)\b|\b(?P<bool_false>false)\b|\b(?P<null>null)\b",
118
+ r"(?P<number>(?<!\w)\-?[0-9]+\.?[0-9]*(e[\-\+]?\d+?)?\b|0x[0-9a-fA-F]*)",
119
+ JSON_STR,
120
+ ),
121
+ ]
122
+
123
+ def highlight(self, text: Text) -> None:
124
+ super().highlight(text)
125
+
126
+ # Additional work to handle highlighting JSON keys
127
+ plain = text.plain
128
+ append = text.spans.append
129
+ whitespace = self.JSON_WHITESPACE
130
+ for match in re.finditer(self.JSON_STR, plain):
131
+ start, end = match.span()
132
+ cursor = end
133
+ while cursor < len(plain):
134
+ char = plain[cursor]
135
+ cursor += 1
136
+ if char == ":":
137
+ append(Span(start, end, "json.key"))
138
+ elif char in whitespace:
139
+ continue
140
+ break
141
+
142
+
143
+ class ISO8601Highlighter(RegexHighlighter):
144
+ """Highlights the ISO8601 date time strings.
145
+ Regex reference: https://www.oreilly.com/library/view/regular-expressions-cookbook/9781449327453/ch04s07.html
146
+ """
147
+
148
+ base_style = "iso8601."
149
+ highlights = [
150
+ #
151
+ # Dates
152
+ #
153
+ # Calendar month (e.g. 2008-08). The hyphen is required
154
+ r"^(?P<year>[0-9]{4})-(?P<month>1[0-2]|0[1-9])$",
155
+ # Calendar date w/o hyphens (e.g. 20080830)
156
+ r"^(?P<date>(?P<year>[0-9]{4})(?P<month>1[0-2]|0[1-9])(?P<day>3[01]|0[1-9]|[12][0-9]))$",
157
+ # Ordinal date (e.g. 2008-243). The hyphen is optional
158
+ r"^(?P<date>(?P<year>[0-9]{4})-?(?P<day>36[0-6]|3[0-5][0-9]|[12][0-9]{2}|0[1-9][0-9]|00[1-9]))$",
159
+ #
160
+ # Weeks
161
+ #
162
+ # Week of the year (e.g., 2008-W35). The hyphen is optional
163
+ r"^(?P<date>(?P<year>[0-9]{4})-?W(?P<week>5[0-3]|[1-4][0-9]|0[1-9]))$",
164
+ # Week date (e.g., 2008-W35-6). The hyphens are optional
165
+ r"^(?P<date>(?P<year>[0-9]{4})-?W(?P<week>5[0-3]|[1-4][0-9]|0[1-9])-?(?P<day>[1-7]))$",
166
+ #
167
+ # Times
168
+ #
169
+ # Hours and minutes (e.g., 17:21). The colon is optional
170
+ r"^(?P<time>(?P<hour>2[0-3]|[01][0-9]):?(?P<minute>[0-5][0-9]))$",
171
+ # Hours, minutes, and seconds w/o colons (e.g., 172159)
172
+ r"^(?P<time>(?P<hour>2[0-3]|[01][0-9])(?P<minute>[0-5][0-9])(?P<second>[0-5][0-9]))$",
173
+ # Time zone designator (e.g., Z, +07 or +07:00). The colons and the minutes are optional
174
+ r"^(?P<timezone>(Z|[+-](?:2[0-3]|[01][0-9])(?::?(?:[0-5][0-9]))?))$",
175
+ # Hours, minutes, and seconds with time zone designator (e.g., 17:21:59+07:00).
176
+ # All the colons are optional. The minutes in the time zone designator are also optional
177
+ r"^(?P<time>(?P<hour>2[0-3]|[01][0-9])(?P<minute>[0-5][0-9])(?P<second>[0-5][0-9]))(?P<timezone>Z|[+-](?:2[0-3]|[01][0-9])(?::?(?:[0-5][0-9]))?)$",
178
+ #
179
+ # Date and Time
180
+ #
181
+ # Calendar date with hours, minutes, and seconds (e.g., 2008-08-30 17:21:59 or 20080830 172159).
182
+ # A space is required between the date and the time. The hyphens and colons are optional.
183
+ # This regex matches dates and times that specify some hyphens or colons but omit others.
184
+ # This does not follow ISO 8601
185
+ r"^(?P<date>(?P<year>[0-9]{4})(?P<hyphen>-)?(?P<month>1[0-2]|0[1-9])(?(hyphen)-)(?P<day>3[01]|0[1-9]|[12][0-9])) (?P<time>(?P<hour>2[0-3]|[01][0-9])(?(hyphen):)(?P<minute>[0-5][0-9])(?(hyphen):)(?P<second>[0-5][0-9]))$",
186
+ #
187
+ # XML Schema dates and times
188
+ #
189
+ # Date, with optional time zone (e.g., 2008-08-30 or 2008-08-30+07:00).
190
+ # Hyphens are required. This is the XML Schema 'date' type
191
+ r"^(?P<date>(?P<year>-?(?:[1-9][0-9]*)?[0-9]{4})-(?P<month>1[0-2]|0[1-9])-(?P<day>3[01]|0[1-9]|[12][0-9]))(?P<timezone>Z|[+-](?:2[0-3]|[01][0-9]):[0-5][0-9])?$",
192
+ # Time, with optional fractional seconds and time zone (e.g., 01:45:36 or 01:45:36.123+07:00).
193
+ # There is no limit on the number of digits for the fractional seconds. This is the XML Schema 'time' type
194
+ r"^(?P<time>(?P<hour>2[0-3]|[01][0-9]):(?P<minute>[0-5][0-9]):(?P<second>[0-5][0-9])(?P<frac>\.[0-9]+)?)(?P<timezone>Z|[+-](?:2[0-3]|[01][0-9]):[0-5][0-9])?$",
195
+ # Date and time, with optional fractional seconds and time zone (e.g., 2008-08-30T01:45:36 or 2008-08-30T01:45:36.123Z).
196
+ # This is the XML Schema 'dateTime' type
197
+ r"^(?P<date>(?P<year>-?(?:[1-9][0-9]*)?[0-9]{4})-(?P<month>1[0-2]|0[1-9])-(?P<day>3[01]|0[1-9]|[12][0-9]))T(?P<time>(?P<hour>2[0-3]|[01][0-9]):(?P<minute>[0-5][0-9]):(?P<second>[0-5][0-9])(?P<ms>\.[0-9]+)?)(?P<timezone>Z|[+-](?:2[0-3]|[01][0-9]):[0-5][0-9])?$",
198
+ ]
199
+
200
+
201
+ if __name__ == "__main__": # pragma: no cover
202
+ from .console import Console
203
+
204
+ console = Console()
205
+ console.print("[bold green]hello world![/bold green]")
206
+ console.print("'[bold green]hello world![/bold green]'")
207
+
208
+ console.print(" /foo")
209
+ console.print("/foo/")
210
+ console.print("/foo/bar")
211
+ console.print("foo/bar/baz")
212
+
213
+ console.print("/foo/bar/baz?foo=bar+egg&egg=baz")
214
+ console.print("/foo/bar/baz/")
215
+ console.print("/foo/bar/baz/egg")
216
+ console.print("/foo/bar/baz/egg.py")
217
+ console.print("/foo/bar/baz/egg.py word")
218
+ console.print(" /foo/bar/baz/egg.py word")
219
+ console.print("foo /foo/bar/baz/egg.py word")
220
+ console.print("foo /foo/bar/ba._++z/egg+.py word")
221
+ console.print("https://example.org?foo=bar#header")
222
+
223
+ console.print(1234567.34)
224
+ console.print(1 / 2)
225
+ console.print(-1 / 123123123123)
226
+
227
+ console.print(
228
+ "127.0.1.1 bar 192.168.1.4 2001:0db8:85a3:0000:0000:8a2e:0370:7334 foo"
229
+ )
230
+ import json
231
+
232
+ console.print_json(json.dumps(obj={"name": "apple", "count": 1}), indent=None)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/json.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from json import loads, dumps
3
+ from typing import Any, Callable, Optional, Union
4
+
5
+ from .text import Text
6
+ from .highlighter import JSONHighlighter, NullHighlighter
7
+
8
+
9
+ class JSON:
10
+ """A renderable which pretty prints JSON.
11
+
12
+ Args:
13
+ json (str): JSON encoded data.
14
+ indent (Union[None, int, str], optional): Number of characters to indent by. Defaults to 2.
15
+ highlight (bool, optional): Enable highlighting. Defaults to True.
16
+ skip_keys (bool, optional): Skip keys not of a basic type. Defaults to False.
17
+ ensure_ascii (bool, optional): Escape all non-ascii characters. Defaults to False.
18
+ check_circular (bool, optional): Check for circular references. Defaults to True.
19
+ allow_nan (bool, optional): Allow NaN and Infinity values. Defaults to True.
20
+ default (Callable, optional): A callable that converts values that can not be encoded
21
+ in to something that can be JSON encoded. Defaults to None.
22
+ sort_keys (bool, optional): Sort dictionary keys. Defaults to False.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ json: str,
28
+ indent: Union[None, int, str] = 2,
29
+ highlight: bool = True,
30
+ skip_keys: bool = False,
31
+ ensure_ascii: bool = False,
32
+ check_circular: bool = True,
33
+ allow_nan: bool = True,
34
+ default: Optional[Callable[[Any], Any]] = None,
35
+ sort_keys: bool = False,
36
+ ) -> None:
37
+ data = loads(json)
38
+ json = dumps(
39
+ data,
40
+ indent=indent,
41
+ skipkeys=skip_keys,
42
+ ensure_ascii=ensure_ascii,
43
+ check_circular=check_circular,
44
+ allow_nan=allow_nan,
45
+ default=default,
46
+ sort_keys=sort_keys,
47
+ )
48
+ highlighter = JSONHighlighter() if highlight else NullHighlighter()
49
+ self.text = highlighter(json)
50
+ self.text.no_wrap = True
51
+ self.text.overflow = None
52
+
53
+ @classmethod
54
+ def from_data(
55
+ cls,
56
+ data: Any,
57
+ indent: Union[None, int, str] = 2,
58
+ highlight: bool = True,
59
+ skip_keys: bool = False,
60
+ ensure_ascii: bool = False,
61
+ check_circular: bool = True,
62
+ allow_nan: bool = True,
63
+ default: Optional[Callable[[Any], Any]] = None,
64
+ sort_keys: bool = False,
65
+ ) -> "JSON":
66
+ """Encodes a JSON object from arbitrary data.
67
+
68
+ Args:
69
+ data (Any): An object that may be encoded in to JSON
70
+ indent (Union[None, int, str], optional): Number of characters to indent by. Defaults to 2.
71
+ highlight (bool, optional): Enable highlighting. Defaults to True.
72
+ default (Callable, optional): Optional callable which will be called for objects that cannot be serialized. Defaults to None.
73
+ skip_keys (bool, optional): Skip keys not of a basic type. Defaults to False.
74
+ ensure_ascii (bool, optional): Escape all non-ascii characters. Defaults to False.
75
+ check_circular (bool, optional): Check for circular references. Defaults to True.
76
+ allow_nan (bool, optional): Allow NaN and Infinity values. Defaults to True.
77
+ default (Callable, optional): A callable that converts values that can not be encoded
78
+ in to something that can be JSON encoded. Defaults to None.
79
+ sort_keys (bool, optional): Sort dictionary keys. Defaults to False.
80
+
81
+ Returns:
82
+ JSON: New JSON object from the given data.
83
+ """
84
+ json_instance: "JSON" = cls.__new__(cls)
85
+ json = dumps(
86
+ data,
87
+ indent=indent,
88
+ skipkeys=skip_keys,
89
+ ensure_ascii=ensure_ascii,
90
+ check_circular=check_circular,
91
+ allow_nan=allow_nan,
92
+ default=default,
93
+ sort_keys=sort_keys,
94
+ )
95
+ highlighter = JSONHighlighter() if highlight else NullHighlighter()
96
+ json_instance.text = highlighter(json)
97
+ json_instance.text.no_wrap = True
98
+ json_instance.text.overflow = None
99
+ return json_instance
100
+
101
+ def __rich__(self) -> Text:
102
+ return self.text
103
+
104
+
105
+ if __name__ == "__main__":
106
+ import argparse
107
+ import sys
108
+
109
+ parser = argparse.ArgumentParser(description="Pretty print json")
110
+ parser.add_argument(
111
+ "path",
112
+ metavar="PATH",
113
+ help="path to file, or - for stdin",
114
+ )
115
+ parser.add_argument(
116
+ "-i",
117
+ "--indent",
118
+ metavar="SPACES",
119
+ type=int,
120
+ help="Number of spaces in an indent",
121
+ default=2,
122
+ )
123
+ args = parser.parse_args()
124
+
125
+ from pip._vendor.rich.console import Console
126
+
127
+ console = Console()
128
+ error_console = Console(stderr=True)
129
+
130
+ try:
131
+ if args.path == "-":
132
+ json_data = sys.stdin.read()
133
+ else:
134
+ json_data = Path(args.path).read_text()
135
+ except Exception as error:
136
+ error_console.print(f"Unable to read {args.path!r}; {error}")
137
+ sys.exit(-1)
138
+
139
+ console.print(JSON(json_data, indent=args.indent), soft_wrap=True)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/layout.py ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from itertools import islice
3
+ from operator import itemgetter
4
+ from threading import RLock
5
+ from typing import (
6
+ TYPE_CHECKING,
7
+ Dict,
8
+ Iterable,
9
+ List,
10
+ NamedTuple,
11
+ Optional,
12
+ Sequence,
13
+ Tuple,
14
+ Union,
15
+ )
16
+
17
+ from ._ratio import ratio_resolve
18
+ from .align import Align
19
+ from .console import Console, ConsoleOptions, RenderableType, RenderResult
20
+ from .highlighter import ReprHighlighter
21
+ from .panel import Panel
22
+ from .pretty import Pretty
23
+ from .region import Region
24
+ from .repr import Result, rich_repr
25
+ from .segment import Segment
26
+ from .style import StyleType
27
+
28
+ if TYPE_CHECKING:
29
+ from pip._vendor.rich.tree import Tree
30
+
31
+
32
+ class LayoutRender(NamedTuple):
33
+ """An individual layout render."""
34
+
35
+ region: Region
36
+ render: List[List[Segment]]
37
+
38
+
39
+ RegionMap = Dict["Layout", Region]
40
+ RenderMap = Dict["Layout", LayoutRender]
41
+
42
+
43
+ class LayoutError(Exception):
44
+ """Layout related error."""
45
+
46
+
47
+ class NoSplitter(LayoutError):
48
+ """Requested splitter does not exist."""
49
+
50
+
51
+ class _Placeholder:
52
+ """An internal renderable used as a Layout placeholder."""
53
+
54
+ highlighter = ReprHighlighter()
55
+
56
+ def __init__(self, layout: "Layout", style: StyleType = "") -> None:
57
+ self.layout = layout
58
+ self.style = style
59
+
60
+ def __rich_console__(
61
+ self, console: Console, options: ConsoleOptions
62
+ ) -> RenderResult:
63
+ width = options.max_width
64
+ height = options.height or options.size.height
65
+ layout = self.layout
66
+ title = (
67
+ f"{layout.name!r} ({width} x {height})"
68
+ if layout.name
69
+ else f"({width} x {height})"
70
+ )
71
+ yield Panel(
72
+ Align.center(Pretty(layout), vertical="middle"),
73
+ style=self.style,
74
+ title=self.highlighter(title),
75
+ border_style="blue",
76
+ height=height,
77
+ )
78
+
79
+
80
+ class Splitter(ABC):
81
+ """Base class for a splitter."""
82
+
83
+ name: str = ""
84
+
85
+ @abstractmethod
86
+ def get_tree_icon(self) -> str:
87
+ """Get the icon (emoji) used in layout.tree"""
88
+
89
+ @abstractmethod
90
+ def divide(
91
+ self, children: Sequence["Layout"], region: Region
92
+ ) -> Iterable[Tuple["Layout", Region]]:
93
+ """Divide a region amongst several child layouts.
94
+
95
+ Args:
96
+ children (Sequence(Layout)): A number of child layouts.
97
+ region (Region): A rectangular region to divide.
98
+ """
99
+
100
+
101
+ class RowSplitter(Splitter):
102
+ """Split a layout region in to rows."""
103
+
104
+ name = "row"
105
+
106
+ def get_tree_icon(self) -> str:
107
+ return "[layout.tree.row]⬌"
108
+
109
+ def divide(
110
+ self, children: Sequence["Layout"], region: Region
111
+ ) -> Iterable[Tuple["Layout", Region]]:
112
+ x, y, width, height = region
113
+ render_widths = ratio_resolve(width, children)
114
+ offset = 0
115
+ _Region = Region
116
+ for child, child_width in zip(children, render_widths):
117
+ yield child, _Region(x + offset, y, child_width, height)
118
+ offset += child_width
119
+
120
+
121
+ class ColumnSplitter(Splitter):
122
+ """Split a layout region in to columns."""
123
+
124
+ name = "column"
125
+
126
+ def get_tree_icon(self) -> str:
127
+ return "[layout.tree.column]⬍"
128
+
129
+ def divide(
130
+ self, children: Sequence["Layout"], region: Region
131
+ ) -> Iterable[Tuple["Layout", Region]]:
132
+ x, y, width, height = region
133
+ render_heights = ratio_resolve(height, children)
134
+ offset = 0
135
+ _Region = Region
136
+ for child, child_height in zip(children, render_heights):
137
+ yield child, _Region(x, y + offset, width, child_height)
138
+ offset += child_height
139
+
140
+
141
@rich_repr
class Layout:
    """A renderable to divide a fixed height in to rows or columns.

    Args:
        renderable (RenderableType, optional): Renderable content, or None for placeholder. Defaults to None.
        name (str, optional): Optional identifier for Layout. Defaults to None.
        size (int, optional): Optional fixed size of layout. Defaults to None.
        minimum_size (int, optional): Minimum size of layout. Defaults to 1.
        ratio (int, optional): Optional ratio for flexible layout. Defaults to 1.
        visible (bool, optional): Visibility of layout. Defaults to True.
    """

    # Registry mapping splitter names (as accepted by split()) to classes.
    splitters = {"row": RowSplitter, "column": ColumnSplitter}

    def __init__(
        self,
        renderable: Optional[RenderableType] = None,
        *,
        name: Optional[str] = None,
        size: Optional[int] = None,
        minimum_size: int = 1,
        ratio: int = 1,
        visible: bool = True,
    ) -> None:
        # Fall back to a placeholder when no content is supplied.
        self._renderable = renderable or _Placeholder(self)
        self.size = size
        self.minimum_size = minimum_size
        self.ratio = ratio
        self.name = name
        self.visible = visible
        # Default to a column splitter until split() replaces it.
        self.splitter: Splitter = self.splitters["column"]()
        self._children: List[Layout] = []
        self._render_map: RenderMap = {}
        # Serializes update()/refresh_screen()/__rich_console__ so a render
        # never sees a half-updated renderable or render map.
        self._lock = RLock()

    def __rich_repr__(self) -> Result:
        # (key, value, default) tuples per rich's __rich_repr__ protocol;
        # values equal to the default are omitted from the repr.
        yield "name", self.name, None
        yield "size", self.size, None
        yield "minimum_size", self.minimum_size, 1
        yield "ratio", self.ratio, 1

    @property
    def renderable(self) -> RenderableType:
        """Layout renderable."""
        # A split layout renders itself (its children); a leaf renders content.
        return self if self._children else self._renderable

    @property
    def children(self) -> List["Layout"]:
        """Gets (visible) layout children."""
        return [child for child in self._children if child.visible]

    @property
    def map(self) -> RenderMap:
        """Get a map of the last render."""
        return self._render_map

    def get(self, name: str) -> Optional["Layout"]:
        """Get a named layout, or None if it doesn't exist.

        Args:
            name (str): Name of layout.

        Returns:
            Optional[Layout]: Layout instance or None if no layout was found.
        """
        # Depth-first search over all children (including invisible ones).
        if self.name == name:
            return self
        else:
            for child in self._children:
                named_layout = child.get(name)
                if named_layout is not None:
                    return named_layout
        return None

    def __getitem__(self, name: str) -> "Layout":
        """Get a named sub-layout; raises KeyError if not found."""
        layout = self.get(name)
        if layout is None:
            raise KeyError(f"No layout with name {name!r}")
        return layout

    @property
    def tree(self) -> "Tree":
        """Get a tree renderable to show layout structure."""
        # Imported lazily to avoid a circular import at module load time.
        from pip._vendor.rich.styled import Styled
        from pip._vendor.rich.table import Table
        from pip._vendor.rich.tree import Tree

        def summary(layout: "Layout") -> Table:
            # One tree node: splitter icon + a Pretty repr of the layout,
            # dimmed when the layout is not visible.
            icon = layout.splitter.get_tree_icon()

            table = Table.grid(padding=(0, 1, 0, 0))

            text: RenderableType = (
                Pretty(layout) if layout.visible else Styled(Pretty(layout), "dim")
            )
            table.add_row(icon, text)
            _summary = table
            return _summary

        layout = self
        tree = Tree(
            summary(layout),
            guide_style=f"layout.tree.{layout.splitter.name}",
            highlight=True,
        )

        def recurse(tree: "Tree", layout: "Layout") -> None:
            # Mirror the layout hierarchy into the Tree renderable.
            for child in layout._children:
                recurse(
                    tree.add(
                        summary(child),
                        guide_style=f"layout.tree.{child.splitter.name}",
                    ),
                    child,
                )

        recurse(tree, self)
        return tree

    def split(
        self,
        *layouts: Union["Layout", RenderableType],
        splitter: Union[Splitter, str] = "column",
    ) -> None:
        """Split the layout in to multiple sub-layouts.

        Args:
            *layouts (Layout): Positional arguments should be (sub) Layout instances.
            splitter (Union[Splitter, str]): Splitter instance or name of splitter.

        Raises:
            NoSplitter: If *splitter* is a name not present in ``splitters``.
        """
        # Plain renderables are wrapped in a Layout automatically.
        _layouts = [
            layout if isinstance(layout, Layout) else Layout(layout)
            for layout in layouts
        ]
        try:
            self.splitter = (
                splitter
                if isinstance(splitter, Splitter)
                else self.splitters[splitter]()
            )
        except KeyError:
            raise NoSplitter(f"No splitter called {splitter!r}")
        # Replace children in place so existing references stay valid.
        self._children[:] = _layouts

    def add_split(self, *layouts: Union["Layout", RenderableType]) -> None:
        """Add a new layout(s) to existing split.

        Args:
            *layouts (Union[Layout, RenderableType]): Positional arguments should be renderables or (sub) Layout instances.

        """
        # Generator is fine here: extend() consumes it immediately.
        _layouts = (
            layout if isinstance(layout, Layout) else Layout(layout)
            for layout in layouts
        )
        self._children.extend(_layouts)

    def split_row(self, *layouts: Union["Layout", RenderableType]) -> None:
        """Split the layout in to a row (layouts side by side).

        Args:
            *layouts (Layout): Positional arguments should be (sub) Layout instances.
        """
        self.split(*layouts, splitter="row")

    def split_column(self, *layouts: Union["Layout", RenderableType]) -> None:
        """Split the layout in to a column (layouts stacked on top of each other).

        Args:
            *layouts (Layout): Positional arguments should be (sub) Layout instances.
        """
        self.split(*layouts, splitter="column")

    def unsplit(self) -> None:
        """Reset splits to initial state."""
        del self._children[:]

    def update(self, renderable: RenderableType) -> None:
        """Update renderable.

        Args:
            renderable (RenderableType): New renderable object.
        """
        with self._lock:
            self._renderable = renderable

    def refresh_screen(self, console: "Console", layout_name: str) -> None:
        """Refresh a sub-layout.

        Re-renders only the named sub-layout and blits it back at the region
        recorded by the last full render.

        Args:
            console (Console): Console instance where Layout is to be rendered.
            layout_name (str): Name of layout.
        """
        with self._lock:
            layout = self[layout_name]
            # Reuse the region from the previous render; only lines change.
            region, _lines = self._render_map[layout]
            (x, y, width, height) = region
            lines = console.render_lines(
                layout, console.options.update_dimensions(width, height)
            )
            self._render_map[layout] = LayoutRender(region, lines)
            console.update_screen_lines(lines, x, y)

    def _make_region_map(self, width: int, height: int) -> RegionMap:
        """Create a dict that maps layout on to Region."""
        # Iterative depth-first traversal: each node is recorded, then its
        # children (with their sub-regions from the splitter) are pushed.
        stack: List[Tuple[Layout, Region]] = [(self, Region(0, 0, width, height))]
        push = stack.append
        pop = stack.pop
        layout_regions: List[Tuple[Layout, Region]] = []
        append_layout_region = layout_regions.append
        while stack:
            append_layout_region(pop())
            layout, region = layout_regions[-1]
            children = layout.children
            if children:
                for child_and_region in layout.splitter.divide(children, region):
                    push(child_and_region)

        # Sort by the Region tuple itself for a stable, position-based order.
        region_map = {
            layout: region
            for layout, region in sorted(layout_regions, key=itemgetter(1))
        }
        return region_map

    def render(self, console: Console, options: ConsoleOptions) -> RenderMap:
        """Render the sub_layouts.

        Args:
            console (Console): Console instance.
            options (ConsoleOptions): Console options.

        Returns:
            RenderMap: A dict that maps Layout on to a tuple of Region, lines
        """
        render_width = options.max_width
        render_height = options.height or console.height
        region_map = self._make_region_map(render_width, render_height)
        # Only leaf layouts (no visible children) render actual content.
        layout_regions = [
            (layout, region)
            for layout, region in region_map.items()
            if not layout.children
        ]
        render_map: Dict["Layout", "LayoutRender"] = {}
        render_lines = console.render_lines
        update_dimensions = options.update_dimensions

        for layout, region in layout_regions:
            lines = render_lines(
                layout.renderable, update_dimensions(region.width, region.height)
            )
            render_map[layout] = LayoutRender(region, lines)
        return render_map

    def __rich_console__(
        self, console: Console, options: ConsoleOptions
    ) -> RenderResult:
        with self._lock:
            width = options.max_width or console.width
            height = options.height or console.height
            render_map = self.render(console, options.update_dimensions(width, height))
            self._render_map = render_map
            # Composite: one list of segments per screen row, filled by
            # copying each leaf's lines into its region's rows.
            layout_lines: List[List[Segment]] = [[] for _ in range(height)]
            _islice = islice
            for region, lines in render_map.values():
                _x, y, _layout_width, layout_height = region
                for row, line in zip(
                    _islice(layout_lines, y, y + layout_height), lines
                ):
                    row.extend(line)

            new_line = Segment.line()
            for layout_row in layout_lines:
                yield from layout_row
                yield new_line
416
+
417
+
418
if __name__ == "__main__":
    # Demo: build a nested layout and print it to the terminal.
    from pip._vendor.rich.console import Console

    console = Console()
    layout = Layout()

    # Top level: fixed-height header/footer around a flexible main area.
    layout.split_column(
        Layout(name="header", size=3),
        Layout(ratio=1, name="main"),
        Layout(size=10, name="footer"),
    )

    # Main area: side panel next to a body twice as wide.
    layout["main"].split_row(Layout(name="side"), Layout(name="body", ratio=2))

    layout["body"].split_row(Layout(name="content", ratio=2), Layout(name="s2"))

    layout["s2"].split_column(
        Layout(name="top"), Layout(name="middle"), Layout(name="bottom")
    )

    # Show the layout's own tree diagram in the upper side panel.
    layout["side"].split_column(Layout(layout.tree, name="left1"), Layout(name="left2"))

    layout["content"].update("foo")

    console.print(layout)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/progress_bar.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from functools import lru_cache
3
+ from time import monotonic
4
+ from typing import Iterable, List, Optional
5
+
6
+ from .color import Color, blend_rgb
7
+ from .color_triplet import ColorTriplet
8
+ from .console import Console, ConsoleOptions, RenderResult
9
+ from .jupyter import JupyterMixin
10
+ from .measure import Measurement
11
+ from .segment import Segment
12
+ from .style import Style, StyleType
13
+
14
+ # Number of characters before 'pulse' animation repeats
15
+ PULSE_SIZE = 20
16
+
17
+
18
class ProgressBar(JupyterMixin):
    """Renders a (progress) bar. Used by rich.progress.

    Args:
        total (float, optional): Number of steps in the bar. Defaults to 100. Set to None to render a pulsing animation.
        completed (float, optional): Number of steps completed. Defaults to 0.
        width (int, optional): Width of the bar, or ``None`` for maximum width. Defaults to None.
        pulse (bool, optional): Enable pulse effect. Defaults to False. Will pulse if a None total was passed.
        style (StyleType, optional): Style for the bar background. Defaults to "bar.back".
        complete_style (StyleType, optional): Style for the completed bar. Defaults to "bar.complete".
        finished_style (StyleType, optional): Style for a finished bar. Defaults to "bar.finished".
        pulse_style (StyleType, optional): Style for pulsing bars. Defaults to "bar.pulse".
        animation_time (Optional[float], optional): Time in seconds to use for animation, or None to use system time.
    """

    def __init__(
        self,
        total: Optional[float] = 100.0,
        completed: float = 0,
        width: Optional[int] = None,
        pulse: bool = False,
        style: StyleType = "bar.back",
        complete_style: StyleType = "bar.complete",
        finished_style: StyleType = "bar.finished",
        pulse_style: StyleType = "bar.pulse",
        animation_time: Optional[float] = None,
    ):
        self.total = total
        self.completed = completed
        self.width = width
        self.pulse = pulse
        self.style = style
        self.complete_style = complete_style
        self.finished_style = finished_style
        self.pulse_style = pulse_style
        self.animation_time = animation_time

        # Set but not read in this class; presumably a legacy cache slot —
        # pulse segments are cached via lru_cache below. TODO confirm.
        self._pulse_segments: Optional[List[Segment]] = None

    def __repr__(self) -> str:
        return f"<Bar {self.completed!r} of {self.total!r}>"

    @property
    def percentage_completed(self) -> Optional[float]:
        """Calculate percentage complete."""
        if self.total is None:
            return None
        completed = (self.completed / self.total) * 100.0
        # NOTE(review): upstream clamps with min(100.0, ...); min(100, ...)
        # can return the int 100 when over-complete — harmless numerically.
        completed = min(100, max(0.0, completed))
        return completed

    # NOTE(review): lru_cache on an instance method keys on self and keeps
    # instances alive for the cache's lifetime; bounded here at 16 entries.
    @lru_cache(maxsize=16)
    def _get_pulse_segments(
        self,
        fore_style: Style,
        back_style: Style,
        color_system: str,
        no_color: bool,
        ascii: bool = False,
    ) -> List[Segment]:
        """Get a list of segments to render a pulse animation.

        Returns:
            List[Segment]: A list of segments, one segment per character.
        """
        bar = "-" if ascii else "━"
        segments: List[Segment] = []
        if color_system not in ("standard", "eight_bit", "truecolor") or no_color:
            # Limited/no color: half foreground, half background, no gradient.
            segments += [Segment(bar, fore_style)] * (PULSE_SIZE // 2)
            segments += [Segment(" " if no_color else bar, back_style)] * (
                PULSE_SIZE - (PULSE_SIZE // 2)
            )
            return segments

        append = segments.append
        fore_color = (
            fore_style.color.get_truecolor()
            if fore_style.color
            else ColorTriplet(255, 0, 255)
        )
        back_color = (
            back_style.color.get_truecolor()
            if back_style.color
            else ColorTriplet(0, 0, 0)
        )
        # Local aliases to keep lookups cheap inside the loop below.
        cos = math.cos
        pi = math.pi
        _Segment = Segment
        _Style = Style
        from_triplet = Color.from_triplet

        for index in range(PULSE_SIZE):
            position = index / PULSE_SIZE
            # Cosine fade 0..1 gives one smooth bright-dark cycle per PULSE_SIZE.
            fade = 0.5 + cos((position * pi * 2)) / 2.0
            color = blend_rgb(fore_color, back_color, cross_fade=fade)
            append(_Segment(bar, _Style(color=from_triplet(color))))
        return segments

    def update(self, completed: float, total: Optional[float] = None) -> None:
        """Update progress with new values.

        Args:
            completed (float): Number of steps completed.
            total (float, optional): Total number of steps, or ``None`` to not change. Defaults to None.
        """
        self.completed = completed
        self.total = total if total is not None else self.total

    def _render_pulse(
        self, console: Console, width: int, ascii: bool = False
    ) -> Iterable[Segment]:
        """Renders the pulse animation.

        Args:
            console (Console): Console instance.
            width (int): Width in characters of pulse animation.

        Returns:
            RenderResult: [description]

        Yields:
            Iterator[Segment]: Segments to render pulse
        """
        fore_style = console.get_style(self.pulse_style, default="white")
        back_style = console.get_style(self.style, default="black")

        pulse_segments = self._get_pulse_segments(
            fore_style, back_style, console.color_system, console.no_color, ascii=ascii
        )
        segment_count = len(pulse_segments)
        current_time = (
            monotonic() if self.animation_time is None else self.animation_time
        )
        # Tile the pattern wider than needed, then take a time-shifted window
        # so successive renders appear to scroll (15 cells per second).
        segments = pulse_segments * (int(width / segment_count) + 2)
        offset = int(-current_time * 15) % segment_count
        segments = segments[offset : offset + width]
        yield from segments

    def __rich_console__(
        self, console: Console, options: ConsoleOptions
    ) -> RenderResult:
        width = min(self.width or options.max_width, options.max_width)
        ascii = options.legacy_windows or options.ascii_only
        should_pulse = self.pulse or self.total is None
        if should_pulse:
            yield from self._render_pulse(console, width, ascii=ascii)
            return

        completed: Optional[float] = (
            min(self.total, max(0, self.completed)) if self.total is not None else None
        )

        # Progress is measured in half-character cells: "╸"/"╺" render half a
        # cell, giving twice the horizontal resolution of full "━" cells.
        bar = "-" if ascii else "━"
        half_bar_right = " " if ascii else "╸"
        half_bar_left = " " if ascii else "╺"
        complete_halves = (
            int(width * 2 * completed / self.total)
            if self.total and completed is not None
            else width * 2
        )
        bar_count = complete_halves // 2
        half_bar_count = complete_halves % 2
        style = console.get_style(self.style)
        is_finished = self.total is None or self.completed >= self.total
        complete_style = console.get_style(
            self.finished_style if is_finished else self.complete_style
        )
        _Segment = Segment
        if bar_count:
            yield _Segment(bar * bar_count, complete_style)
        if half_bar_count:
            yield _Segment(half_bar_right * half_bar_count, complete_style)

        if not console.no_color:
            remaining_bars = width - bar_count - half_bar_count
            if remaining_bars and console.color_system is not None:
                # Lead the background with a left half-cell so the completed
                # and remaining portions butt together cleanly.
                if not half_bar_count and bar_count:
                    yield _Segment(half_bar_left, style)
                    remaining_bars -= 1
                if remaining_bars:
                    yield _Segment(bar * remaining_bars, style)

    def __rich_measure__(
        self, console: Console, options: ConsoleOptions
    ) -> Measurement:
        # Fixed width when configured; otherwise flexible from 4 cells up.
        return (
            Measurement(self.width, self.width)
            if self.width is not None
            else Measurement(4, options.max_width)
        )
208
+
209
+
210
if __name__ == "__main__":  # pragma: no cover
    # Demo: animate a bar from 0 to 100% on a single console line.
    console = Console()
    bar = ProgressBar(width=50, total=100)

    import time

    console.show_cursor(False)
    for n in range(0, 101, 1):
        bar.update(n)
        console.print(bar)
        # Carriage return rewinds the cursor so the next frame overwrites.
        console.file.write("\r")
        time.sleep(0.05)
    console.show_cursor(True)
    console.print()
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/syntax.py ADDED
@@ -0,0 +1,958 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path
2
+ import platform
3
+ import re
4
+ import sys
5
+ import textwrap
6
+ from abc import ABC, abstractmethod
7
+ from pathlib import Path
8
+ from typing import (
9
+ Any,
10
+ Dict,
11
+ Iterable,
12
+ List,
13
+ NamedTuple,
14
+ Optional,
15
+ Sequence,
16
+ Set,
17
+ Tuple,
18
+ Type,
19
+ Union,
20
+ )
21
+
22
+ from pip._vendor.pygments.lexer import Lexer
23
+ from pip._vendor.pygments.lexers import get_lexer_by_name, guess_lexer_for_filename
24
+ from pip._vendor.pygments.style import Style as PygmentsStyle
25
+ from pip._vendor.pygments.styles import get_style_by_name
26
+ from pip._vendor.pygments.token import (
27
+ Comment,
28
+ Error,
29
+ Generic,
30
+ Keyword,
31
+ Name,
32
+ Number,
33
+ Operator,
34
+ String,
35
+ Token,
36
+ Whitespace,
37
+ )
38
+ from pip._vendor.pygments.util import ClassNotFound
39
+
40
+ from pip._vendor.rich.containers import Lines
41
+ from pip._vendor.rich.padding import Padding, PaddingDimensions
42
+
43
+ from ._loop import loop_first
44
+ from .cells import cell_len
45
+ from .color import Color, blend_rgb
46
+ from .console import Console, ConsoleOptions, JustifyMethod, RenderResult
47
+ from .jupyter import JupyterMixin
48
+ from .measure import Measurement
49
+ from .segment import Segment, Segments
50
+ from .style import Style, StyleType
51
+ from .text import Text
52
+
53
# Pygments token types are tuples of strings, e.g. ("name", "function").
TokenType = Tuple[str, ...]

WINDOWS = platform.system() == "Windows"
DEFAULT_THEME = "monokai"

# The following styles are based on https://github.com/pygments/pygments/blob/master/pygments/formatters/terminal.py
# A few modifications were made

# Token-to-style tables for terminals limited to the named ANSI palette:
# one tuned for light backgrounds, one for dark backgrounds.
ANSI_LIGHT: Dict[TokenType, Style] = {
    Token: Style(),
    Whitespace: Style(color="white"),
    Comment: Style(dim=True),
    Comment.Preproc: Style(color="cyan"),
    Keyword: Style(color="blue"),
    Keyword.Type: Style(color="cyan"),
    Operator.Word: Style(color="magenta"),
    Name.Builtin: Style(color="cyan"),
    Name.Function: Style(color="green"),
    Name.Namespace: Style(color="cyan", underline=True),
    Name.Class: Style(color="green", underline=True),
    Name.Exception: Style(color="cyan"),
    Name.Decorator: Style(color="magenta", bold=True),
    Name.Variable: Style(color="red"),
    Name.Constant: Style(color="red"),
    Name.Attribute: Style(color="cyan"),
    Name.Tag: Style(color="bright_blue"),
    String: Style(color="yellow"),
    Number: Style(color="blue"),
    Generic.Deleted: Style(color="bright_red"),
    Generic.Inserted: Style(color="green"),
    Generic.Heading: Style(bold=True),
    Generic.Subheading: Style(color="magenta", bold=True),
    Generic.Prompt: Style(bold=True),
    Generic.Error: Style(color="bright_red"),
    Error: Style(color="red", underline=True),
}

ANSI_DARK: Dict[TokenType, Style] = {
    Token: Style(),
    Whitespace: Style(color="bright_black"),
    Comment: Style(dim=True),
    Comment.Preproc: Style(color="bright_cyan"),
    Keyword: Style(color="bright_blue"),
    Keyword.Type: Style(color="bright_cyan"),
    Operator.Word: Style(color="bright_magenta"),
    Name.Builtin: Style(color="bright_cyan"),
    Name.Function: Style(color="bright_green"),
    Name.Namespace: Style(color="bright_cyan", underline=True),
    Name.Class: Style(color="bright_green", underline=True),
    Name.Exception: Style(color="bright_cyan"),
    Name.Decorator: Style(color="bright_magenta", bold=True),
    Name.Variable: Style(color="bright_red"),
    Name.Constant: Style(color="bright_red"),
    Name.Attribute: Style(color="bright_cyan"),
    Name.Tag: Style(color="bright_blue"),
    String: Style(color="yellow"),
    Number: Style(color="bright_blue"),
    Generic.Deleted: Style(color="bright_red"),
    Generic.Inserted: Style(color="bright_green"),
    Generic.Heading: Style(bold=True),
    Generic.Subheading: Style(color="bright_magenta", bold=True),
    Generic.Prompt: Style(bold=True),
    Generic.Error: Style(color="bright_red"),
    Error: Style(color="red", underline=True),
}

# Rich's own built-in themes, selectable by name alongside Pygments styles.
RICH_SYNTAX_THEMES = {"ansi_light": ANSI_LIGHT, "ansi_dark": ANSI_DARK}
# Presumably padding (in cells) around the line-numbers column — used
# further down this module; confirm at the usage site.
NUMBERS_COLUMN_DEFAULT_PADDING = 2
+
122
+
123
class SyntaxTheme(ABC):
    """Abstract interface that concrete syntax themes must implement."""

    @abstractmethod
    def get_background_style(self) -> Style:
        """Return the style used for the code background."""
        raise NotImplementedError  # pragma: no cover

    @abstractmethod
    def get_style_for_token(self, token_type: TokenType) -> Style:
        """Return the style associated with a Pygments token type."""
        raise NotImplementedError  # pragma: no cover
135
+
136
+
137
class PygmentsSyntaxTheme(SyntaxTheme):
    """Adapter that exposes a Pygments style through the SyntaxTheme interface."""

    def __init__(self, theme: Union[str, Type[PygmentsStyle]]) -> None:
        # Converted styles are cached here, keyed by Pygments token type.
        self._style_cache: Dict[TokenType, Style] = {}
        if not isinstance(theme, str):
            self._pygments_style_class = theme
        else:
            try:
                self._pygments_style_class = get_style_by_name(theme)
            except ClassNotFound:
                # Unknown theme name: fall back to Pygments' default style.
                self._pygments_style_class = get_style_by_name("default")

        self._background_color = self._pygments_style_class.background_color
        self._background_style = Style(bgcolor=self._background_color)

    def get_style_for_token(self, token_type: TokenType) -> Style:
        """Convert a Pygments token style into a rich Style (cached)."""
        cached = self._style_cache.get(token_type)
        if cached is not None:
            return cached
        try:
            token_style = self._pygments_style_class.style_for_token(token_type)
        except KeyError:
            style = Style.null()
        else:
            foreground = token_style["color"]
            background = token_style["bgcolor"]
            style = Style(
                color="#" + foreground if foreground else "#000000",
                bgcolor="#" + background if background else self._background_color,
                bold=token_style["bold"],
                italic=token_style["italic"],
                underline=token_style["underline"],
            )
        self._style_cache[token_type] = style
        return style

    def get_background_style(self) -> Style:
        """Return the background style derived from the Pygments theme."""
        return self._background_style
177
+
178
+
179
class ANSISyntaxTheme(SyntaxTheme):
    """Syntax theme backed by a fixed map of token types to ANSI styles."""

    def __init__(self, style_map: Dict[TokenType, Style]) -> None:
        self.style_map = style_map
        self._missing_style = Style.null()
        self._background_style = Style.null()
        # Memoizes resolved lookups; each miss walks the token hierarchy.
        self._style_cache: Dict[TokenType, Style] = {}

    def get_style_for_token(self, token_type: TokenType) -> Style:
        """Resolve a token to a style, walking up the token hierarchy."""
        cached = self._style_cache.get(token_type)
        if cached is not None:
            return cached
        # Token types form a hierarchy; try the most specific prefix first,
        # e.g. ("foo", "bar", "baz") then ("foo", "bar") then ("foo",).
        lookup = self.style_map.get
        prefix = tuple(token_type)
        resolved = self._missing_style
        while prefix:
            candidate = lookup(prefix)
            if candidate is not None:
                resolved = candidate
                break
            prefix = prefix[:-1]
        self._style_cache[token_type] = resolved
        return resolved

    def get_background_style(self) -> Style:
        """ANSI themes have no background of their own (null style)."""
        return self._background_style
210
+
211
+
212
# A (line, column) position in the source: line is 1-based, column 0-based.
SyntaxPosition = Tuple[int, int]


class _SyntaxHighlightRange(NamedTuple):
    """
    A range to highlight in a Syntax object.
    `start` and `end` are 2-integers tuples, where the first integer is the line number
    (starting from 1) and the second integer is the column index (starting from 0).
    """

    # Style applied over the range.
    style: StyleType
    # Start position of the highlight.
    start: SyntaxPosition
    # End position of the highlight.
    end: SyntaxPosition
225
+
226
+
227
+ class Syntax(JupyterMixin):
228
+ """Construct a Syntax object to render syntax highlighted code.
229
+
230
+ Args:
231
+ code (str): Code to highlight.
232
+ lexer (Lexer | str): Lexer to use (see https://pygments.org/docs/lexers/)
233
+ theme (str, optional): Color theme, aka Pygments style (see https://pygments.org/docs/styles/#getting-a-list-of-available-styles). Defaults to "monokai".
234
+ dedent (bool, optional): Enable stripping of initial whitespace. Defaults to False.
235
+ line_numbers (bool, optional): Enable rendering of line numbers. Defaults to False.
236
+ start_line (int, optional): Starting number for line numbers. Defaults to 1.
237
+ line_range (Tuple[int | None, int | None], optional): If given should be a tuple of the start and end line to render.
238
+ A value of None in the tuple indicates the range is open in that direction.
239
+ highlight_lines (Set[int]): A set of line numbers to highlight.
240
+ code_width: Width of code to render (not including line numbers), or ``None`` to use all available width.
241
+ tab_size (int, optional): Size of tabs. Defaults to 4.
242
+ word_wrap (bool, optional): Enable word wrapping.
243
+ background_color (str, optional): Optional background color, or None to use theme color. Defaults to None.
244
+ indent_guides (bool, optional): Show indent guides. Defaults to False.
245
+ padding (PaddingDimensions): Padding to apply around the syntax. Defaults to 0 (no padding).
246
+ """
247
+
248
+ _pygments_style_class: Type[PygmentsStyle]
249
+ _theme: SyntaxTheme
250
+
251
+ @classmethod
252
+ def get_theme(cls, name: Union[str, SyntaxTheme]) -> SyntaxTheme:
253
+ """Get a syntax theme instance."""
254
+ if isinstance(name, SyntaxTheme):
255
+ return name
256
+ theme: SyntaxTheme
257
+ if name in RICH_SYNTAX_THEMES:
258
+ theme = ANSISyntaxTheme(RICH_SYNTAX_THEMES[name])
259
+ else:
260
+ theme = PygmentsSyntaxTheme(name)
261
+ return theme
262
+
263
+ def __init__(
264
+ self,
265
+ code: str,
266
+ lexer: Union[Lexer, str],
267
+ *,
268
+ theme: Union[str, SyntaxTheme] = DEFAULT_THEME,
269
+ dedent: bool = False,
270
+ line_numbers: bool = False,
271
+ start_line: int = 1,
272
+ line_range: Optional[Tuple[Optional[int], Optional[int]]] = None,
273
+ highlight_lines: Optional[Set[int]] = None,
274
+ code_width: Optional[int] = None,
275
+ tab_size: int = 4,
276
+ word_wrap: bool = False,
277
+ background_color: Optional[str] = None,
278
+ indent_guides: bool = False,
279
+ padding: PaddingDimensions = 0,
280
+ ) -> None:
281
+ self.code = code
282
+ self._lexer = lexer
283
+ self.dedent = dedent
284
+ self.line_numbers = line_numbers
285
+ self.start_line = start_line
286
+ self.line_range = line_range
287
+ self.highlight_lines = highlight_lines or set()
288
+ self.code_width = code_width
289
+ self.tab_size = tab_size
290
+ self.word_wrap = word_wrap
291
+ self.background_color = background_color
292
+ self.background_style = (
293
+ Style(bgcolor=background_color) if background_color else Style()
294
+ )
295
+ self.indent_guides = indent_guides
296
+ self.padding = padding
297
+
298
+ self._theme = self.get_theme(theme)
299
+ self._stylized_ranges: List[_SyntaxHighlightRange] = []
300
+
301
+ @classmethod
302
+ def from_path(
303
+ cls,
304
+ path: str,
305
+ encoding: str = "utf-8",
306
+ lexer: Optional[Union[Lexer, str]] = None,
307
+ theme: Union[str, SyntaxTheme] = DEFAULT_THEME,
308
+ dedent: bool = False,
309
+ line_numbers: bool = False,
310
+ line_range: Optional[Tuple[int, int]] = None,
311
+ start_line: int = 1,
312
+ highlight_lines: Optional[Set[int]] = None,
313
+ code_width: Optional[int] = None,
314
+ tab_size: int = 4,
315
+ word_wrap: bool = False,
316
+ background_color: Optional[str] = None,
317
+ indent_guides: bool = False,
318
+ padding: PaddingDimensions = 0,
319
+ ) -> "Syntax":
320
+ """Construct a Syntax object from a file.
321
+
322
+ Args:
323
+ path (str): Path to file to highlight.
324
+ encoding (str): Encoding of file.
325
+ lexer (str | Lexer, optional): Lexer to use. If None, lexer will be auto-detected from path/file content.
326
+ theme (str, optional): Color theme, aka Pygments style (see https://pygments.org/docs/styles/#getting-a-list-of-available-styles). Defaults to "emacs".
327
+ dedent (bool, optional): Enable stripping of initial whitespace. Defaults to True.
328
+ line_numbers (bool, optional): Enable rendering of line numbers. Defaults to False.
329
+ start_line (int, optional): Starting number for line numbers. Defaults to 1.
330
+ line_range (Tuple[int, int], optional): If given should be a tuple of the start and end line to render.
331
+ highlight_lines (Set[int]): A set of line numbers to highlight.
332
+ code_width: Width of code to render (not including line numbers), or ``None`` to use all available width.
333
+ tab_size (int, optional): Size of tabs. Defaults to 4.
334
+ word_wrap (bool, optional): Enable word wrapping of code.
335
+ background_color (str, optional): Optional background color, or None to use theme color. Defaults to None.
336
+ indent_guides (bool, optional): Show indent guides. Defaults to False.
337
+ padding (PaddingDimensions): Padding to apply around the syntax. Defaults to 0 (no padding).
338
+
339
+ Returns:
340
+ [Syntax]: A Syntax object that may be printed to the console
341
+ """
342
+ code = Path(path).read_text(encoding=encoding)
343
+
344
+ if not lexer:
345
+ lexer = cls.guess_lexer(path, code=code)
346
+
347
+ return cls(
348
+ code,
349
+ lexer,
350
+ theme=theme,
351
+ dedent=dedent,
352
+ line_numbers=line_numbers,
353
+ line_range=line_range,
354
+ start_line=start_line,
355
+ highlight_lines=highlight_lines,
356
+ code_width=code_width,
357
+ tab_size=tab_size,
358
+ word_wrap=word_wrap,
359
+ background_color=background_color,
360
+ indent_guides=indent_guides,
361
+ padding=padding,
362
+ )
363
+
364
+ @classmethod
365
+ def guess_lexer(cls, path: str, code: Optional[str] = None) -> str:
366
+ """Guess the alias of the Pygments lexer to use based on a path and an optional string of code.
367
+ If code is supplied, it will use a combination of the code and the filename to determine the
368
+ best lexer to use. For example, if the file is ``index.html`` and the file contains Django
369
+ templating syntax, then "html+django" will be returned. If the file is ``index.html``, and no
370
+ templating language is used, the "html" lexer will be used. If no string of code
371
+ is supplied, the lexer will be chosen based on the file extension..
372
+
373
+ Args:
374
+ path (AnyStr): The path to the file containing the code you wish to know the lexer for.
375
+ code (str, optional): Optional string of code that will be used as a fallback if no lexer
376
+ is found for the supplied path.
377
+
378
+ Returns:
379
+ str: The name of the Pygments lexer that best matches the supplied path/code.
380
+ """
381
+ lexer: Optional[Lexer] = None
382
+ lexer_name = "default"
383
+ if code:
384
+ try:
385
+ lexer = guess_lexer_for_filename(path, code)
386
+ except ClassNotFound:
387
+ pass
388
+
389
+ if not lexer:
390
+ try:
391
+ _, ext = os.path.splitext(path)
392
+ if ext:
393
+ extension = ext.lstrip(".").lower()
394
+ lexer = get_lexer_by_name(extension)
395
+ except ClassNotFound:
396
+ pass
397
+
398
+ if lexer:
399
+ if lexer.aliases:
400
+ lexer_name = lexer.aliases[0]
401
+ else:
402
+ lexer_name = lexer.name
403
+
404
+ return lexer_name
405
+
406
+ def _get_base_style(self) -> Style:
407
+ """Get the base style."""
408
+ default_style = self._theme.get_background_style() + self.background_style
409
+ return default_style
410
+
411
+ def _get_token_color(self, token_type: TokenType) -> Optional[Color]:
412
+ """Get a color (if any) for the given token.
413
+
414
+ Args:
415
+ token_type (TokenType): A token type tuple from Pygments.
416
+
417
+ Returns:
418
+ Optional[Color]: Color from theme, or None for no color.
419
+ """
420
+ style = self._theme.get_style_for_token(token_type)
421
+ return style.color
422
+
423
+ @property
424
+ def lexer(self) -> Optional[Lexer]:
425
+ """The lexer for this syntax, or None if no lexer was found.
426
+
427
+ Tries to find the lexer by name if a string was passed to the constructor.
428
+ """
429
+
430
+ if isinstance(self._lexer, Lexer):
431
+ return self._lexer
432
+ try:
433
+ return get_lexer_by_name(
434
+ self._lexer,
435
+ stripnl=False,
436
+ ensurenl=True,
437
+ tabsize=self.tab_size,
438
+ )
439
+ except ClassNotFound:
440
+ return None
441
+
442
+ @property
443
+ def default_lexer(self) -> Lexer:
444
+ """A Pygments Lexer to use if one is not specified or invalid."""
445
+ return get_lexer_by_name(
446
+ "text",
447
+ stripnl=False,
448
+ ensurenl=True,
449
+ tabsize=self.tab_size,
450
+ )
451
+
452
    def highlight(
        self,
        code: str,
        line_range: Optional[Tuple[Optional[int], Optional[int]]] = None,
    ) -> Text:
        """Highlight code and return a Text instance.

        Args:
            code (str): Code to highlight.
            line_range(Tuple[int, int], optional): Optional line range to highlight.

        Returns:
            Text: A text instance containing highlighted syntax.
        """

        base_style = self._get_base_style()
        # Transparent backgrounds keep the console's default justification;
        # otherwise left-justify so the background color fills each line.
        justify: JustifyMethod = (
            "default" if base_style.transparent_background else "left"
        )

        text = Text(
            justify=justify,
            style=base_style,
            tab_size=self.tab_size,
            no_wrap=not self.word_wrap,
        )
        # Local alias avoids repeated attribute lookups in the token loops below.
        _get_theme_style = self._theme.get_style_for_token

        lexer = self.lexer or self.default_lexer

        if lexer is None:
            # No lexer available: append the code unstyled.
            text.append(code)
        else:
            if line_range:
                # More complicated path to only stylize a portion of the code
                # This speeds up further operations as there are less spans to process
                line_start, line_end = line_range

                def line_tokenize() -> Iterable[Tuple[Any, str]]:
                    """Split tokens to one per line."""
                    assert lexer  # required to make MyPy happy - we know lexer is not None at this point

                    for token_type, token in lexer.get_tokens(code):
                        # A token may span several lines; emit one piece per
                        # line so the span loop can count lines reliably.
                        while token:
                            line_token, new_line, token = token.partition("\n")
                            yield token_type, line_token + new_line

                def tokens_to_spans() -> Iterable[Tuple[str, Optional[Style]]]:
                    """Convert tokens to spans."""
                    tokens = iter(line_tokenize())
                    line_no = 0
                    _line_start = line_start - 1 if line_start else 0

                    # Skip over tokens until line start
                    while line_no < _line_start:
                        try:
                            _token_type, token = next(tokens)
                        except StopIteration:
                            break
                        # Skipped tokens are yielded unstyled so character
                        # offsets in the resulting Text stay correct.
                        yield (token, None)
                        if token.endswith("\n"):
                            line_no += 1
                    # Generate spans until line end
                    for token_type, token in tokens:
                        yield (token, _get_theme_style(token_type))
                        if token.endswith("\n"):
                            line_no += 1
                        if line_end and line_no >= line_end:
                            break

                text.append_tokens(tokens_to_spans())

            else:
                # Fast path: style every token in the document.
                text.append_tokens(
                    (token, _get_theme_style(token_type))
                    for token_type, token in lexer.get_tokens(code)
                )
        if self.background_color is not None:
            text.stylize(f"on {self.background_color}")

        if self._stylized_ranges:
            self._apply_stylized_ranges(text)

        return text
536
+
537
+ def stylize_range(
538
+ self, style: StyleType, start: SyntaxPosition, end: SyntaxPosition
539
+ ) -> None:
540
+ """
541
+ Adds a custom style on a part of the code, that will be applied to the syntax display when it's rendered.
542
+ Line numbers are 1-based, while column indexes are 0-based.
543
+
544
+ Args:
545
+ style (StyleType): The style to apply.
546
+ start (Tuple[int, int]): The start of the range, in the form `[line number, column index]`.
547
+ end (Tuple[int, int]): The end of the range, in the form `[line number, column index]`.
548
+ """
549
+ self._stylized_ranges.append(_SyntaxHighlightRange(style, start, end))
550
+
551
+ def _get_line_numbers_color(self, blend: float = 0.3) -> Color:
552
+ background_style = self._theme.get_background_style() + self.background_style
553
+ background_color = background_style.bgcolor
554
+ if background_color is None or background_color.is_system_defined:
555
+ return Color.default()
556
+ foreground_color = self._get_token_color(Token.Text)
557
+ if foreground_color is None or foreground_color.is_system_defined:
558
+ return foreground_color or Color.default()
559
+ new_color = blend_rgb(
560
+ background_color.get_truecolor(),
561
+ foreground_color.get_truecolor(),
562
+ cross_fade=blend,
563
+ )
564
+ return Color.from_triplet(new_color)
565
+
566
+ @property
567
+ def _numbers_column_width(self) -> int:
568
+ """Get the number of characters used to render the numbers column."""
569
+ column_width = 0
570
+ if self.line_numbers:
571
+ column_width = (
572
+ len(str(self.start_line + self.code.count("\n")))
573
+ + NUMBERS_COLUMN_DEFAULT_PADDING
574
+ )
575
+ return column_width
576
+
577
+ def _get_number_styles(self, console: Console) -> Tuple[Style, Style, Style]:
578
+ """Get background, number, and highlight styles for line numbers."""
579
+ background_style = self._get_base_style()
580
+ if background_style.transparent_background:
581
+ return Style.null(), Style(dim=True), Style.null()
582
+ if console.color_system in ("256", "truecolor"):
583
+ number_style = Style.chain(
584
+ background_style,
585
+ self._theme.get_style_for_token(Token.Text),
586
+ Style(color=self._get_line_numbers_color()),
587
+ self.background_style,
588
+ )
589
+ highlight_number_style = Style.chain(
590
+ background_style,
591
+ self._theme.get_style_for_token(Token.Text),
592
+ Style(bold=True, color=self._get_line_numbers_color(0.9)),
593
+ self.background_style,
594
+ )
595
+ else:
596
+ number_style = background_style + Style(dim=True)
597
+ highlight_number_style = background_style + Style(dim=False)
598
+ return background_style, number_style, highlight_number_style
599
+
600
+ def __rich_measure__(
601
+ self, console: "Console", options: "ConsoleOptions"
602
+ ) -> "Measurement":
603
+ _, right, _, left = Padding.unpack(self.padding)
604
+ padding = left + right
605
+ if self.code_width is not None:
606
+ width = self.code_width + self._numbers_column_width + padding + 1
607
+ return Measurement(self._numbers_column_width, width)
608
+ lines = self.code.splitlines()
609
+ width = (
610
+ self._numbers_column_width
611
+ + padding
612
+ + (max(cell_len(line) for line in lines) if lines else 0)
613
+ )
614
+ if self.line_numbers:
615
+ width += 1
616
+ return Measurement(self._numbers_column_width, width)
617
+
618
+ def __rich_console__(
619
+ self, console: Console, options: ConsoleOptions
620
+ ) -> RenderResult:
621
+ segments = Segments(self._get_syntax(console, options))
622
+ if self.padding:
623
+ yield Padding(
624
+ segments, style=self._theme.get_background_style(), pad=self.padding
625
+ )
626
+ else:
627
+ yield segments
628
+
629
    def _get_syntax(
        self,
        console: Console,
        options: ConsoleOptions,
    ) -> Iterable[Segment]:
        """
        Get the Segments for the Syntax object, excluding any vertical/horizontal padding
        """
        transparent_background = self._get_base_style().transparent_background
        # Width available for the code itself: either the explicit code_width,
        # or the console width minus the line-number gutter (and separator).
        code_width = (
            (
                (options.max_width - self._numbers_column_width - 1)
                if self.line_numbers
                else options.max_width
            )
            if self.code_width is None
            else self.code_width
        )

        ends_on_nl, processed_code = self._process_code(self.code)
        text = self.highlight(processed_code, self.line_range)

        if not self.line_numbers and not self.word_wrap and not self.line_range:
            if not ends_on_nl:
                # _process_code appended a newline; strip it again for output.
                text.remove_suffix("\n")
            # Simple case of just rendering text
            style = (
                self._get_base_style()
                + self._theme.get_style_for_token(Comment)
                + Style(dim=True)
                + self.background_style
            )
            if self.indent_guides and not options.ascii_only:
                text = text.with_indent_guides(self.tab_size, style=style)
                text.overflow = "crop"
            if style.transparent_background:
                yield from console.render(
                    text, options=options.update(width=code_width)
                )
            else:
                # Opaque background: pad every line to full width.
                syntax_lines = console.render_lines(
                    text,
                    options.update(width=code_width, height=None, justify="left"),
                    style=self.background_style,
                    pad=True,
                    new_lines=True,
                )
                for syntax_line in syntax_lines:
                    yield from syntax_line
            return

        # Full path: line numbers, word wrap and/or a line range are involved.
        start_line, end_line = self.line_range or (None, None)
        line_offset = 0
        if start_line:
            line_offset = max(0, start_line - 1)
        lines: Union[List[Text], Lines] = text.split("\n", allow_blank=ends_on_nl)
        if self.line_range:
            if line_offset > len(lines):
                # Requested range starts beyond the end of the code: nothing to render.
                return
            lines = lines[line_offset:end_line]

        if self.indent_guides and not options.ascii_only:
            style = (
                self._get_base_style()
                + self._theme.get_style_for_token(Comment)
                + Style(dim=True)
                + self.background_style
            )
            # Re-join, add guides over the whole block, then re-split per line.
            lines = (
                Text("\n")
                .join(lines)
                .with_indent_guides(self.tab_size, style=style + Style(italic=False))
                .split("\n", allow_blank=True)
            )

        numbers_column_width = self._numbers_column_width
        render_options = options.update(width=code_width)

        highlight_line = self.highlight_lines.__contains__
        _Segment = Segment
        new_line = _Segment("\n")

        # Gutter marker shown next to highlighted lines.
        line_pointer = "> " if options.legacy_windows else "❱ "

        (
            background_style,
            number_style,
            highlight_number_style,
        ) = self._get_number_styles(console)

        for line_no, line in enumerate(lines, self.start_line + line_offset):
            if self.word_wrap:
                # One source line may render as several wrapped display lines.
                wrapped_lines = console.render_lines(
                    line,
                    render_options.update(height=None, justify="left"),
                    style=background_style,
                    pad=not transparent_background,
                )
            else:
                segments = list(line.render(console, end=""))
                if options.no_wrap:
                    wrapped_lines = [segments]
                else:
                    # Crop/pad the single display line to the render width.
                    wrapped_lines = [
                        _Segment.adjust_line_length(
                            segments,
                            render_options.max_width,
                            style=background_style,
                            pad=not transparent_background,
                        )
                    ]

            if self.line_numbers:
                wrapped_line_left_pad = _Segment(
                    " " * numbers_column_width + " ", background_style
                )
                for first, wrapped_line in loop_first(wrapped_lines):
                    if first:
                        line_column = str(line_no).rjust(numbers_column_width - 2) + " "
                        if highlight_line(line_no):
                            yield _Segment(line_pointer, Style(color="red"))
                            yield _Segment(line_column, highlight_number_style)
                        else:
                            yield _Segment(" ", highlight_number_style)
                            yield _Segment(line_column, number_style)
                    else:
                        # Continuation rows of a wrapped line get a blank gutter.
                        yield wrapped_line_left_pad
                    yield from wrapped_line
                    yield new_line
            else:
                for wrapped_line in wrapped_lines:
                    yield from wrapped_line
                    yield new_line
762
+
763
+ def _apply_stylized_ranges(self, text: Text) -> None:
764
+ """
765
+ Apply stylized ranges to a text instance,
766
+ using the given code to determine the right portion to apply the style to.
767
+
768
+ Args:
769
+ text (Text): Text instance to apply the style to.
770
+ """
771
+ code = text.plain
772
+ newlines_offsets = [
773
+ # Let's add outer boundaries at each side of the list:
774
+ 0,
775
+ # N.B. using "\n" here is much faster than using metacharacters such as "^" or "\Z":
776
+ *[
777
+ match.start() + 1
778
+ for match in re.finditer("\n", code, flags=re.MULTILINE)
779
+ ],
780
+ len(code) + 1,
781
+ ]
782
+
783
+ for stylized_range in self._stylized_ranges:
784
+ start = _get_code_index_for_syntax_position(
785
+ newlines_offsets, stylized_range.start
786
+ )
787
+ end = _get_code_index_for_syntax_position(
788
+ newlines_offsets, stylized_range.end
789
+ )
790
+ if start is not None and end is not None:
791
+ text.stylize(stylized_range.style, start, end)
792
+
793
+ def _process_code(self, code: str) -> Tuple[bool, str]:
794
+ """
795
+ Applies various processing to a raw code string
796
+ (normalises it so it always ends with a line return, dedents it if necessary, etc.)
797
+
798
+ Args:
799
+ code (str): The raw code string to process
800
+
801
+ Returns:
802
+ Tuple[bool, str]: the boolean indicates whether the raw code ends with a line return,
803
+ while the string is the processed code.
804
+ """
805
+ ends_on_nl = code.endswith("\n")
806
+ processed_code = code if ends_on_nl else code + "\n"
807
+ processed_code = (
808
+ textwrap.dedent(processed_code) if self.dedent else processed_code
809
+ )
810
+ processed_code = processed_code.expandtabs(self.tab_size)
811
+ return ends_on_nl, processed_code
812
+
813
+
814
def _get_code_index_for_syntax_position(
    newlines_offsets: Sequence[int], position: SyntaxPosition
) -> Optional[int]:
    """
    Returns the index of the code string for the given positions.

    Args:
        newlines_offsets (Sequence[int]): The offset of each newline character found in the code snippet.
        position (SyntaxPosition): The position to search for.

    Returns:
        Optional[int]: The index of the code string for this position, or `None`
            if the given position's line number is out of range (if it's the column that is out of range
            we silently clamp its value so that it reaches the end of the line)
    """
    line_number, column_index = position
    # Line numbers are 1-based; the offsets list carries a sentinel past the
    # final line, so a valid line needs entries at both line_number - 1 and
    # line_number. (Equivalent to the original two-clause bound check.)
    if line_number >= len(newlines_offsets):
        return None  # `line_number` is out of range
    line_start = newlines_offsets[line_number - 1]
    line_length = newlines_offsets[line_number] - line_start - 1
    # If `column_index` is out of range: let's silently clamp it:
    return line_start + min(line_length, column_index)
839
+
840
+
841
if __name__ == "__main__":  # pragma: no cover
    import argparse
    import sys

    # Command-line entry point: highlight a file (or stdin when PATH is "-")
    # and print it to the terminal.
    parser = argparse.ArgumentParser(
        description="Render syntax to the console with Rich"
    )
    parser.add_argument(
        "path",
        metavar="PATH",
        help="path to file, or - for stdin",
    )
    parser.add_argument(
        "-c",
        "--force-color",
        dest="force_color",
        action="store_true",
        default=None,
        help="force color for non-terminals",
    )
    parser.add_argument(
        "-i",
        "--indent-guides",
        dest="indent_guides",
        action="store_true",
        default=False,
        help="display indent guides",
    )
    parser.add_argument(
        "-l",
        "--line-numbers",
        dest="line_numbers",
        action="store_true",
        help="render line numbers",
    )
    parser.add_argument(
        "-w",
        "--width",
        type=int,
        dest="width",
        default=None,
        help="width of output (default will auto-detect)",
    )
    parser.add_argument(
        "-r",
        "--wrap",
        dest="word_wrap",
        action="store_true",
        default=False,
        help="word wrap long lines",
    )
    parser.add_argument(
        "-s",
        "--soft-wrap",
        action="store_true",
        dest="soft_wrap",
        default=False,
        help="enable soft wrapping mode",
    )
    parser.add_argument(
        "-t", "--theme", dest="theme", default="monokai", help="pygments theme"
    )
    parser.add_argument(
        "-b",
        "--background-color",
        dest="background_color",
        default=None,
        help="Override background color",
    )
    parser.add_argument(
        "-x",
        "--lexer",
        default=None,
        dest="lexer_name",
        help="Lexer name",
    )
    parser.add_argument(
        "-p", "--padding", type=int, default=0, dest="padding", help="Padding"
    )
    parser.add_argument(
        "--highlight-line",
        type=int,
        default=None,
        dest="highlight_line",
        help="The line number (not index!) to highlight",
    )
    args = parser.parse_args()

    from pip._vendor.rich.console import Console

    console = Console(force_terminal=args.force_color, width=args.width)

    if args.path == "-":
        # Source comes from stdin; construct Syntax directly from the text.
        code = sys.stdin.read()
        syntax = Syntax(
            code=code,
            lexer=args.lexer_name,
            line_numbers=args.line_numbers,
            word_wrap=args.word_wrap,
            theme=args.theme,
            background_color=args.background_color,
            indent_guides=args.indent_guides,
            padding=args.padding,
            highlight_lines={args.highlight_line},
        )
    else:
        # Source comes from a file; from_path auto-detects the lexer if
        # --lexer was not given.
        syntax = Syntax.from_path(
            args.path,
            lexer=args.lexer_name,
            line_numbers=args.line_numbers,
            word_wrap=args.word_wrap,
            theme=args.theme,
            background_color=args.background_color,
            indent_guides=args.indent_guides,
            padding=args.padding,
            highlight_lines={args.highlight_line},
        )
    console.print(syntax, soft_wrap=args.soft_wrap)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/tree.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Iterator, List, Optional, Tuple
2
+
3
+ from ._loop import loop_first, loop_last
4
+ from .console import Console, ConsoleOptions, RenderableType, RenderResult
5
+ from .jupyter import JupyterMixin
6
+ from .measure import Measurement
7
+ from .segment import Segment
8
+ from .style import Style, StyleStack, StyleType
9
+ from .styled import Styled
10
+
11
+
12
class Tree(JupyterMixin):
    """A renderable for a tree structure.

    Args:
        label (RenderableType): The renderable or str for the tree label.
        style (StyleType, optional): Style of this tree. Defaults to "tree".
        guide_style (StyleType, optional): Style of the guide lines. Defaults to "tree.line".
        expanded (bool, optional): Also display children. Defaults to True.
        highlight (bool, optional): Highlight renderable (if str). Defaults to False.
        hide_root (bool, optional): Hide the root node of the tree. Defaults to False.
    """

    def __init__(
        self,
        label: RenderableType,
        *,
        style: StyleType = "tree",
        guide_style: StyleType = "tree.line",
        expanded: bool = True,
        highlight: bool = False,
        hide_root: bool = False,
    ) -> None:
        # Label rendered for this node.
        self.label = label
        # Style applied to the label.
        self.style = style
        # Style applied to the guide lines connecting nodes.
        self.guide_style = guide_style
        # Child nodes; populated via add().
        self.children: List[Tree] = []
        # Whether children are rendered.
        self.expanded = expanded
        # Whether str labels get automatic highlighting.
        self.highlight = highlight
        # Whether the root node is hidden when rendering.
        self.hide_root = hide_root

    def add(
        self,
        label: RenderableType,
        *,
        style: Optional[StyleType] = None,
        guide_style: Optional[StyleType] = None,
        expanded: bool = True,
        highlight: Optional[bool] = False,
    ) -> "Tree":
        """Add a child tree.

        Args:
            label (RenderableType): The renderable or str for the tree label.
            style (StyleType, optional): Style of this tree. Defaults to "tree".
            guide_style (StyleType, optional): Style of the guide lines. Defaults to "tree.line".
            expanded (bool, optional): Also display children. Defaults to True.
            highlight (Optional[bool], optional): Highlight renderable (if str). Defaults to False.

        Returns:
            Tree: A new child Tree, which may be further modified.
        """
        # None for style/guide_style/highlight means "inherit from parent".
        node = Tree(
            label,
            style=self.style if style is None else style,
            guide_style=self.guide_style if guide_style is None else guide_style,
            expanded=expanded,
            highlight=self.highlight if highlight is None else highlight,
        )
        self.children.append(node)
        return node

    def __rich_console__(
        self, console: "Console", options: "ConsoleOptions"
    ) -> "RenderResult":
        # Iterative depth-first traversal: each stack entry is an iterator of
        # (is_last_child, node) pairs for one level of the tree.
        stack: List[Iterator[Tuple[bool, Tree]]] = []
        pop = stack.pop
        push = stack.append
        new_line = Segment.line()

        get_style = console.get_style
        null_style = Style.null()
        guide_style = get_style(self.guide_style, default="") or null_style
        # Indices into the guide-glyph tuples below.
        SPACE, CONTINUE, FORK, END = range(4)

        ASCII_GUIDES = ("    ", "|   ", "+-- ", "`-- ")
        TREE_GUIDES = [
            ("    ", "│   ", "├── ", "└── "),
            ("    ", "┃   ", "┣━━ ", "┗━━ "),
            ("    ", "║   ", "╠══ ", "╚══ "),
        ]
        _Segment = Segment

        def make_guide(index: int, style: Style) -> Segment:
            """Make a Segment for a level of the guide lines."""
            if options.ascii_only:
                line = ASCII_GUIDES[index]
            else:
                # The guide style's bold/underline2 attributes select the
                # heavy / double box-drawing glyph sets respectively.
                guide = 1 if style.bold else (2 if style.underline2 else 0)
                line = TREE_GUIDES[0 if options.legacy_windows else guide][index]
            return _Segment(line, style)

        levels: List[Segment] = [make_guide(CONTINUE, guide_style)]
        push(iter(loop_last([self])))

        guide_style_stack = StyleStack(get_style(self.guide_style))
        style_stack = StyleStack(get_style(self.style))
        # Strip bold/underline2 before rendering guides, since those
        # attributes are used above only to pick the glyph set.
        remove_guide_styles = Style(bold=False, underline2=False)

        depth = 0

        while stack:
            stack_node = pop()
            try:
                last, node = next(stack_node)
            except StopIteration:
                # Level exhausted: drop its guide and restore parent styles.
                levels.pop()
                if levels:
                    guide_style = levels[-1].style or null_style
                    levels[-1] = make_guide(FORK, guide_style)
                    guide_style_stack.pop()
                    style_stack.pop()
                continue
            push(stack_node)
            if last:
                # Last sibling gets the closing (END) guide glyph.
                levels[-1] = make_guide(END, levels[-1].style or null_style)

            guide_style = guide_style_stack.current + get_style(node.guide_style)
            style = style_stack.current + get_style(node.style)
            # Guides to the left of this node's label (root guides skipped,
            # plus one more level when the root itself is hidden).
            prefix = levels[(2 if self.hide_root else 1) :]
            renderable_lines = console.render_lines(
                Styled(node.label, style),
                options.update(
                    width=options.max_width
                    - sum(level.cell_length for level in prefix),
                    highlight=self.highlight,
                    height=None,
                ),
                pad=options.justify is not None,
            )

            if not (depth == 0 and self.hide_root):
                for first, line in loop_first(renderable_lines):
                    if prefix:
                        yield from _Segment.apply_style(
                            prefix,
                            style.background_style,
                            post_style=remove_guide_styles,
                        )
                    yield from line
                    yield new_line
                    if first and prefix:
                        # After the first rendered line of a multi-line label,
                        # the fork/end glyph becomes a plain continuation.
                        prefix[-1] = make_guide(
                            SPACE if last else CONTINUE, prefix[-1].style or null_style
                        )

            if node.expanded and node.children:
                # Descend: convert this node's guide to a continuation and
                # open a new level for its children.
                levels[-1] = make_guide(
                    SPACE if last else CONTINUE, levels[-1].style or null_style
                )
                levels.append(
                    make_guide(END if len(node.children) == 1 else FORK, guide_style)
                )
                style_stack.push(get_style(node.style))
                guide_style_stack.push(get_style(node.guide_style))
                push(iter(loop_last(node.children)))
                depth += 1

    def __rich_measure__(
        self, console: "Console", options: "ConsoleOptions"
    ) -> "Measurement":
        # Walk the expanded tree, tracking depth; each level adds a 4-cell
        # guide indent to the label's own measurement.
        stack: List[Iterator[Tree]] = [iter([self])]
        pop = stack.pop
        push = stack.append
        minimum = 0
        maximum = 0
        measure = Measurement.get
        level = 0
        while stack:
            iter_tree = pop()
            try:
                tree = next(iter_tree)
            except StopIteration:
                level -= 1
                continue
            push(iter_tree)
            min_measure, max_measure = measure(console, options, tree.label)
            indent = level * 4
            minimum = max(min_measure + indent, minimum)
            maximum = max(max_measure + indent, maximum)
            if tree.expanded and tree.children:
                push(iter(tree.children))
                level += 1
        return Measurement(minimum, maximum)
194
+
195
+
196
if __name__ == "__main__":  # pragma: no cover
    # Demo: build a small tree containing tables, syntax, markdown and panels
    # as node labels, then print it to the console.
    from pip._vendor.rich.console import Group
    from pip._vendor.rich.markdown import Markdown
    from pip._vendor.rich.panel import Panel
    from pip._vendor.rich.syntax import Syntax
    from pip._vendor.rich.table import Table

    table = Table(row_styles=["", "dim"])

    table.add_column("Released", style="cyan", no_wrap=True)
    table.add_column("Title", style="magenta")
    table.add_column("Box Office", justify="right", style="green")

    table.add_row("Dec 20, 2019", "Star Wars: The Rise of Skywalker", "$952,110,690")
    table.add_row("May 25, 2018", "Solo: A Star Wars Story", "$393,151,347")
    table.add_row("Dec 15, 2017", "Star Wars Ep. V111: The Last Jedi", "$1,332,539,889")
    table.add_row("Dec 16, 2016", "Rogue One: A Star Wars Story", "$1,332,439,889")

    code = """\
class Segment(NamedTuple):
    text: str = ""
    style: Optional[Style] = None
    is_control: bool = False
"""
    syntax = Syntax(code, "python", theme="monokai", line_numbers=True)

    markdown = Markdown(
        """\
### example.md
> Hello, World!
>
> Markdown _all_ the things
"""
    )

    # Root is hidden, so only its children appear in the output.
    root = Tree("🌲 [b green]Rich Tree", highlight=True, hide_root=True)

    node = root.add(":file_folder: Renderables", guide_style="red")
    simple_node = node.add(":file_folder: [bold yellow]Atomic", guide_style="uu green")
    simple_node.add(Group("📄 Syntax", syntax))
    simple_node.add(Group("📄 Markdown", Panel(markdown, border_style="green")))

    containers_node = node.add(
        ":file_folder: [bold magenta]Containers", guide_style="bold magenta"
    )
    containers_node.expanded = True
    panel = Panel.fit("Just a panel", border_style="red")
    containers_node.add(Group("📄 Panels", panel))

    containers_node.add(Group("📄 [b magenta]Table", table))

    console = Console()

    console.print(root)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/METADATA ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: pybind11
3
+ Version: 2.13.6
4
+ Summary: Seamless operability between C++11 and Python
5
+ Home-page: https://github.com/pybind/pybind11
6
+ Download-URL: https://github.com/pybind/pybind11/tarball/v2.13.6
7
+ Author: Wenzel Jakob
8
+ Author-email: wenzel.jakob@epfl.ch
9
+ License: BSD
10
+ Project-URL: Documentation, https://pybind11.readthedocs.io/
11
+ Project-URL: Bug Tracker, https://github.com/pybind/pybind11/issues
12
+ Project-URL: Discussions, https://github.com/pybind/pybind11/discussions
13
+ Project-URL: Changelog, https://pybind11.readthedocs.io/en/latest/changelog.html
14
+ Project-URL: Chat, https://gitter.im/pybind/Lobby
15
+ Keywords: C++11,Python bindings
16
+ Classifier: Development Status :: 5 - Production/Stable
17
+ Classifier: Intended Audience :: Developers
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Classifier: Topic :: Utilities
20
+ Classifier: Programming Language :: C++
21
+ Classifier: Programming Language :: Python :: 3 :: Only
22
+ Classifier: Programming Language :: Python :: 3.7
23
+ Classifier: Programming Language :: Python :: 3.8
24
+ Classifier: Programming Language :: Python :: 3.9
25
+ Classifier: Programming Language :: Python :: 3.10
26
+ Classifier: Programming Language :: Python :: 3.11
27
+ Classifier: Programming Language :: Python :: 3.12
28
+ Classifier: Programming Language :: Python :: 3.13
29
+ Classifier: License :: OSI Approved :: BSD License
30
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
31
+ Classifier: Programming Language :: Python :: Implementation :: CPython
32
+ Classifier: Programming Language :: C++
33
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
34
+ Requires-Python: >=3.7
35
+ Description-Content-Type: text/x-rst
36
+ License-File: LICENSE
37
+ Provides-Extra: global
38
+ Requires-Dist: pybind11-global==2.13.6; extra == "global"
39
+
40
+ .. figure:: https://github.com/pybind/pybind11/raw/master/docs/pybind11-logo.png
41
+ :alt: pybind11 logo
42
+
43
+ **pybind11 — Seamless operability between C++11 and Python**
44
+
45
+ |Latest Documentation Status| |Stable Documentation Status| |Gitter chat| |GitHub Discussions| |CI| |Build status|
46
+
47
+ |Repology| |PyPI package| |Conda-forge| |Python Versions|
48
+
49
+ `Setuptools example <https://github.com/pybind/python_example>`_
50
+ • `Scikit-build example <https://github.com/pybind/scikit_build_example>`_
51
+ • `CMake example <https://github.com/pybind/cmake_example>`_
52
+
53
+ .. start
54
+
55
+
56
+ **pybind11** is a lightweight header-only library that exposes C++ types
57
+ in Python and vice versa, mainly to create Python bindings of existing
58
+ C++ code. Its goals and syntax are similar to the excellent
59
+ `Boost.Python <http://www.boost.org/doc/libs/1_58_0/libs/python/doc/>`_
60
+ library by David Abrahams: to minimize boilerplate code in traditional
61
+ extension modules by inferring type information using compile-time
62
+ introspection.
63
+
64
+ The main issue with Boost.Python—and the reason for creating such a
65
+ similar project—is Boost. Boost is an enormously large and complex suite
66
+ of utility libraries that works with almost every C++ compiler in
67
+ existence. This compatibility has its cost: arcane template tricks and
68
+ workarounds are necessary to support the oldest and buggiest of compiler
69
+ specimens. Now that C++11-compatible compilers are widely available,
70
+ this heavy machinery has become an excessively large and unnecessary
71
+ dependency.
72
+
73
+ Think of this library as a tiny self-contained version of Boost.Python
74
+ with everything stripped away that isn't relevant for binding
75
+ generation. Without comments, the core header files only require ~4K
76
+ lines of code and depend on Python (3.7+, or PyPy) and the C++
77
+ standard library. This compact implementation was possible thanks to
78
+ some C++11 language features (specifically: tuples, lambda functions and
79
+ variadic templates). Since its creation, this library has grown beyond
80
+ Boost.Python in many ways, leading to dramatically simpler binding code in many
81
+ common situations.
82
+
83
+ Tutorial and reference documentation is provided at
84
+ `pybind11.readthedocs.io <https://pybind11.readthedocs.io/en/latest>`_.
85
+ A PDF version of the manual is available
86
+ `here <https://pybind11.readthedocs.io/_/downloads/en/latest/pdf/>`_.
87
+ And the source code is always available at
88
+ `github.com/pybind/pybind11 <https://github.com/pybind/pybind11>`_.
89
+
90
+
91
+ Core features
92
+ -------------
93
+
94
+
95
+ pybind11 can map the following core C++ features to Python:
96
+
97
+ - Functions accepting and returning custom data structures per value,
98
+ reference, or pointer
99
+ - Instance methods and static methods
100
+ - Overloaded functions
101
+ - Instance attributes and static attributes
102
+ - Arbitrary exception types
103
+ - Enumerations
104
+ - Callbacks
105
+ - Iterators and ranges
106
+ - Custom operators
107
+ - Single and multiple inheritance
108
+ - STL data structures
109
+ - Smart pointers with reference counting like ``std::shared_ptr``
110
+ - Internal references with correct reference counting
111
+ - C++ classes with virtual (and pure virtual) methods can be extended
112
+ in Python
113
+ - Integrated NumPy support (NumPy 2 requires pybind11 2.12+)
114
+
115
+ Goodies
116
+ -------
117
+
118
+ In addition to the core functionality, pybind11 provides some extra
119
+ goodies:
120
+
121
+ - Python 3.7+, and PyPy3 7.3 are supported with an implementation-agnostic
122
+ interface (pybind11 2.9 was the last version to support Python 2 and 3.5).
123
+
124
+ - It is possible to bind C++11 lambda functions with captured
125
+ variables. The lambda capture data is stored inside the resulting
126
+ Python function object.
127
+
128
+ - pybind11 uses C++11 move constructors and move assignment operators
129
+ whenever possible to efficiently transfer custom data types.
130
+
131
+ - It's easy to expose the internal storage of custom data types through
132
+ Python's buffer protocols. This is handy e.g. for fast conversion
133
+ between C++ matrix classes like Eigen and NumPy without expensive
134
+ copy operations.
135
+
136
+ - pybind11 can automatically vectorize functions so that they are
137
+ transparently applied to all entries of one or more NumPy array
138
+ arguments.
139
+
140
+ - Python's slice-based access and assignment operations can be
141
+ supported with just a few lines of code.
142
+
143
+ - Everything is contained in just a few header files; there is no need
144
+ to link against any additional libraries.
145
+
146
+ - Binaries are generally smaller by a factor of at least 2 compared to
147
+ equivalent bindings generated by Boost.Python. A recent pybind11
148
+ conversion of PyRosetta, an enormous Boost.Python binding project,
149
+ `reported <https://graylab.jhu.edu/Sergey/2016.RosettaCon/PyRosetta-4.pdf>`_
150
+ a binary size reduction of **5.4x** and compile time reduction by
151
+ **5.8x**.
152
+
153
+ - Function signatures are precomputed at compile time (using
154
+ ``constexpr``), leading to smaller binaries.
155
+
156
+ - With little extra effort, C++ types can be pickled and unpickled
157
+ similar to regular Python objects.
158
+
159
+ Supported compilers
160
+ -------------------
161
+
162
+ 1. Clang/LLVM 3.3 or newer (for Apple Xcode's clang, this is 5.0.0 or
163
+ newer)
164
+ 2. GCC 4.8 or newer
165
+ 3. Microsoft Visual Studio 2017 or newer
166
+ 4. Intel classic C++ compiler 18 or newer (ICC 20.2 tested in CI)
167
+ 5. Cygwin/GCC (previously tested on 2.5.1)
168
+ 6. NVCC (CUDA 11.0 tested in CI)
169
+ 7. NVIDIA PGI (20.9 tested in CI)
170
+
171
+ About
172
+ -----
173
+
174
+ This project was created by `Wenzel
175
+ Jakob <http://rgl.epfl.ch/people/wjakob>`_. Significant features and/or
176
+ improvements to the code were contributed by Jonas Adler, Lori A. Burns,
177
+ Sylvain Corlay, Eric Cousineau, Aaron Gokaslan, Ralf Grosse-Kunstleve, Trent Houliston, Axel
178
+ Huebl, @hulucc, Yannick Jadoul, Sergey Lyskov, Johan Mabille, Tomasz Miąsko,
179
+ Dean Moldovan, Ben Pritchard, Jason Rhinelander, Boris Schäling, Pim
180
+ Schellart, Henry Schreiner, Ivan Smirnov, Boris Staletic, and Patrick Stewart.
181
+
182
+ We thank Google for a generous financial contribution to the continuous
183
+ integration infrastructure used by this project.
184
+
185
+
186
+ Contributing
187
+ ~~~~~~~~~~~~
188
+
189
+ See the `contributing
190
+ guide <https://github.com/pybind/pybind11/blob/master/.github/CONTRIBUTING.md>`_
191
+ for information on building and contributing to pybind11.
192
+
193
+ License
194
+ ~~~~~~~
195
+
196
+ pybind11 is provided under a BSD-style license that can be found in the
197
+ `LICENSE <https://github.com/pybind/pybind11/blob/master/LICENSE>`_
198
+ file. By using, distributing, or contributing to this project, you agree
199
+ to the terms and conditions of this license.
200
+
201
+ .. |Latest Documentation Status| image:: https://readthedocs.org/projects/pybind11/badge?version=latest
202
+ :target: http://pybind11.readthedocs.org/en/latest
203
+ .. |Stable Documentation Status| image:: https://img.shields.io/badge/docs-stable-blue.svg
204
+ :target: http://pybind11.readthedocs.org/en/stable
205
+ .. |Gitter chat| image:: https://img.shields.io/gitter/room/gitterHQ/gitter.svg
206
+ :target: https://gitter.im/pybind/Lobby
207
+ .. |CI| image:: https://github.com/pybind/pybind11/workflows/CI/badge.svg
208
+ :target: https://github.com/pybind/pybind11/actions
209
+ .. |Build status| image:: https://ci.appveyor.com/api/projects/status/riaj54pn4h08xy40?svg=true
210
+ :target: https://ci.appveyor.com/project/wjakob/pybind11
211
+ .. |PyPI package| image:: https://img.shields.io/pypi/v/pybind11.svg
212
+ :target: https://pypi.org/project/pybind11/
213
+ .. |Conda-forge| image:: https://img.shields.io/conda/vn/conda-forge/pybind11.svg
214
+ :target: https://github.com/conda-forge/pybind11-feedstock
215
+ .. |Repology| image:: https://repology.org/badge/latest-versions/python:pybind11.svg
216
+ :target: https://repology.org/project/python:pybind11/versions
217
+ .. |Python Versions| image:: https://img.shields.io/pypi/pyversions/pybind11.svg
218
+ :target: https://pypi.org/project/pybind11/
219
+ .. |GitHub Discussions| image:: https://img.shields.io/static/v1?label=Discussions&message=Ask&color=blue&logo=github
220
+ :target: https://github.com/pybind/pybind11/discussions
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/intrinsic/__init__.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Re-export every fused ("intrinsic") module type so users can write
# `torch.ao.nn.intrinsic.ConvBn2d` etc. without importing `.modules` directly.
from .modules import *  # noqa: F403
from .modules.fused import _FusedModule  # noqa: F403

# # Subpackages
# from . import qat  # noqa: F403
# from . import quantized  # noqa: F403

# Public API: the fused module types produced by module fusion
# (conv+bn, conv+relu, linear+activation, bn+relu, conv+add combinations).
__all__ = [
    'ConvBn1d',
    'ConvBn2d',
    'ConvBn3d',
    'ConvBnReLU1d',
    'ConvBnReLU2d',
    'ConvBnReLU3d',
    'ConvReLU1d',
    'ConvReLU2d',
    'ConvReLU3d',
    'LinearReLU',
    'BNReLU2d',
    'BNReLU3d',
    'LinearBn1d',
    'LinearLeakyReLU',
    'LinearTanh',
    'ConvAdd2d',
    'ConvAddReLU2d',
]

# We are exposing all subpackages to the end-user.
# Because of possible inter-dependency, we want to avoid
# the cyclic imports, thus implementing lazy version
# as per https://peps.python.org/pep-0562/
def __getattr__(name):
    # Module-level __getattr__ (PEP 562): only consulted for names NOT already
    # bound by the wildcard import above, so in practice this is a fallback.
    # NOTE(review): __all__ lists class names, not submodule names, so this
    # would call import_module(".ConvBn1d", ...) — presumably unreachable when
    # the star import succeeded; confirm the intent upstream.
    if name in __all__:
        import importlib
        return importlib.import_module("." + name, __name__)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_fused.cpython-311.pyc ADDED
Binary file (8.46 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-311.pyc ADDED
Binary file (3.44 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Variables
2
+ from ._mappings import get_dynamic_sparse_quantized_mapping
3
+ from ._mappings import get_static_sparse_quantized_mapping
4
+
5
+ # Sparsifier
6
+ from .sparsifier.base_sparsifier import BaseSparsifier
7
+ from .sparsifier.weight_norm_sparsifier import WeightNormSparsifier
8
+ from .sparsifier.nearly_diagonal_sparsifier import NearlyDiagonalSparsifier
9
+
10
+ # Scheduler
11
+ from .scheduler.base_scheduler import BaseScheduler
12
+ from .scheduler.lambda_scheduler import LambdaSL
13
+ from .scheduler.cubic_scheduler import CubicSL
14
+
15
+ # Parametrizations
16
+ from .sparsifier.utils import FakeSparsity
17
+ from .sparsifier.utils import module_to_fqn
18
+ from .sparsifier.utils import fqn_to_module
19
+ from .sparsifier.utils import get_arg_info_from_tensor_fqn
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (253 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import wraps
2
+ import weakref
3
+ import abc
4
+ import warnings
5
+
6
+ from ..data_sparsifier import BaseDataSparsifier
7
+
8
+ __all__ = ['BaseDataScheduler']
9
+
10
+
11
class BaseDataScheduler:
    r"""
    The BaseDataScheduler is the abstract scheduler class specifically for the
    BaseDataSparsifier class. This class controls a specific hyperparameter of
    the sparsifier class and varies it across the training process (or across time).

    Args:
        data_sparsifier (instance of BaseDataSparsifier)
            Implemented class data sparsifier class wherein the update_mask is implemented
        schedule_param (str)
            A specific hyperparameter of the passed sparsifier that needs to be scheduled/varied
        last_epoch (int, default=-1)
            This is specifically passed when training needs to be resumed from a particular
            point.
        verbose (bool, default=False)
            Verbosity of the BaseDataScheduler

    The *get_schedule_param()* function needs to be implemented by the user.
    """
    def __init__(self, data_sparsifier, schedule_param: str, last_epoch=-1, verbose=False):
        # Attach sparsifier; fail fast on anything that is not a BaseDataSparsifier.
        if not isinstance(data_sparsifier, BaseDataSparsifier):
            raise TypeError('{} is not an instance of torch.ao.pruning.BaseDataSparsifier'.format(
                type(data_sparsifier).__name__))
        self.data_sparsifier = data_sparsifier
        self.schedule_param = schedule_param

        # Initialize epoch and base hyper-params: snapshot the scheduled
        # hyperparameter's starting value for every data group.
        self.base_param = {
            name: config.get(schedule_param, None)
            for name, config in self.data_sparsifier.data_groups.items()
        }

        self.last_epoch = last_epoch

        # Following https://github.com/pytorch/pytorch/issues/20124
        # We would like to ensure that `scheduler.step()` is called after
        # `sparsifier.step()`
        def with_counter(method):
            # Wrap `data_sparsifier.step` so each call bumps `_step_count`,
            # letting `step()` below detect an out-of-order call sequence.
            if getattr(method, '_with_counter', False):
                # `sparsifier.step()` has already been replaced, return.
                return method

            # Keep a weak reference to the sparsifier instance to prevent
            # cyclic references.
            instance_ref = weakref.ref(method.__self__)
            # Get the unbound method for the same purpose.
            func = method.__func__
            cls = instance_ref().__class__
            # Drop the bound method so the only strong reference chain to the
            # sparsifier is the one the caller already holds.
            del method

            @wraps(func)
            def wrapper(*args, **kwargs):
                instance = instance_ref()
                instance._step_count += 1  # type: ignore[union-attr]
                # Re-bind the plain function to the (still-alive) instance.
                wrapped = func.__get__(instance, cls)
                return wrapped(*args, **kwargs)

            # Note that the returned function here is no longer a bound method,
            # so attributes like `__func__` and `__self__` no longer exist.
            wrapper._with_counter = True  # type: ignore[attr-defined]
            return wrapper

        self.data_sparsifier.step = with_counter(self.data_sparsifier.step)  # type: ignore[assignment]
        self.data_sparsifier._step_count = 0  # type: ignore[attr-defined]
        self._step_count: int = 0
        self.verbose = verbose

        # Housekeeping
        self._get_sp_called_within_step: bool = False  # sp -> schedule parameter
        # Run one step so data groups reflect `last_epoch + 1` immediately.
        self.step()

    @abc.abstractmethod
    def get_schedule_param(self):
        r"""
        Abstract method that needs to be implemented by the child class.
        The expected return type is a dictionary mapping name to schedule_param value.
        The returned values will be updated in sparsifier when the scheduler step() function
        is called.

        Example:
            >>> def get_schedule_param(self):
            ...     new_param = {}
            ...     for name in self.sparsifier.data_groups.keys():
            ...         new_param[name] = self.sparsifier.data_groups[name][self.schedule_param] * 0.5
            ...     return new_param

        When the step() function is called, the value in self.sparsifier.data_groups[name][self.schedule_param]
        would be halved
        """
        raise NotImplementedError

    def __repr__(self):
        # Human-readable summary: sparsifier plus the scheduled param's base values.
        format_string = self.__class__.__name__ + ' ('
        format_string += '\n'
        format_string += f'Data Sparsifier {self.data_sparsifier}\n'
        format_string += f'    {self.schedule_param}: {self.base_param}\n'
        format_string += ')'
        return format_string

    def state_dict(self):
        """Returns the state of the scheduler as a :class:`dict`.

        It contains an entry for every variable in self.__dict__ which
        is not the sparsifier.

        Note:
            The scheduler class does not track the state of the data_sparsifier.
            Make sure to store the state of the sparsifier before storing the
            state of the scheduler
        """
        return {key: value for key, value in self.__dict__.items() if key != 'data_sparsifier'}

    def load_state_dict(self, state_dict):
        """Loads the schedulers state.

        Note:
            Remember to restore the state of the data_sparsifier before the scheduler.

        Args:
            state_dict (dict): scheduler state. Should be an object returned
                from a call to :meth:`state_dict`.
        """
        self.__dict__.update(state_dict)

    def get_last_param(self):
        # Last value applied per data group (set at the end of step()).
        return self._last_param

    def step(self):
        """Advance one epoch: query get_schedule_param() and push the new
        values into the sparsifier's data groups."""
        # Raise warning if trying to call scheduler step before the sparsifier.
        # https://github.com/pytorch/pytorch/issues/20124
        if self._step_count == 1:
            if not hasattr(self.data_sparsifier.step, "_with_counter"):
                warnings.warn("Seems like `data_sparsifier.step()` has been overridden after sparsity scheduler "
                              "initialization. Please, make sure to call `data_sparsifier.step()` before "
                              "`scheduler.step()`.", UserWarning)

            # Just check if there were two first scheduler.step() calls before sparsifier.step()
            elif self.data_sparsifier._step_count < 1:  # type: ignore[attr-defined]
                warnings.warn("Detected call of `scheduler.step()` before `data_sparsifier.step()`. "
                              "You have to make sure you run the data_sparsifier.step() BEFORE any "
                              "calls to the scheduler.step().", UserWarning)
        self._step_count += 1

        # Context manager flagging that get_schedule_param() is being invoked
        # from within step() (mirrors torch.optim's LR-scheduler pattern).
        class _enable_get_sp_call:

            def __init__(self, o):
                self.o = o

            def __enter__(self):
                self.o._get_sp_called_within_step = True
                return self

            def __exit__(self, type, value, traceback):
                self.o._get_sp_called_within_step = False

        with _enable_get_sp_call(self):
            self.last_epoch += 1
            updated_scheduler_params = self.get_schedule_param()

        # Write the new hyperparameter values back into each data group.
        for name, param in updated_scheduler_params.items():
            self.data_sparsifier.data_groups[name][self.schedule_param] = param
            if self.verbose:
                print(f"Adjusting {self.schedule_param} for group {name} to {param}")

        self._last_param = {
            name: config.get(self.schedule_param, None)
            for name, config in self.data_sparsifier.data_groups.items()
        }
        self.data_sparsifier.enable_mask_update = True