diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f41880f9c71951ed384ef0af6f73351c2ea74afe
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/magic_trace.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/magic_trace.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f197dc6057a69444de6ec953d91eeda6d5f7043e
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/magic_trace.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/delayed_mul_tensor.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/delayed_mul_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..3984a063885907141b56bdd2c6e8cc730c592cbb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/delayed_mul_tensor.py
@@ -0,0 +1,77 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+
+from . import _Tensor, Tensor
+from .reference import _dims, _enable_layers, llist, ltuple
+
+
+class DelayedMulTensor(_Tensor):
+    """Lazy product of the two operands ``lhs`` and ``rhs``.
+
+    The multiply is only materialized when ``_batchtensor`` is read;
+    ``sum`` instead contracts both operands with one ``torch.einsum``.
+    """
+
+    def __init__(self, lhs, rhs):
+        self._lhs, self._rhs = lhs, rhs
+        self._data = None
+        self._levels_data = None
+        self._has_device = lhs._has_device or rhs._has_device
+        self._batchtensor_data = None
+        self._tensor_data = None
+
+    @property
+    def _levels(self):
+        # Union of both operands' levels, lhs order first; cached on first use.
+        if self._levels_data is None:
+            levels = llist(self._lhs._levels)
+            for l in self._rhs._levels:
+                if l not in levels:
+                    levels.append(l)
+            self._levels_data = ltuple(levels)
+        return self._levels_data
+
+    @property
+    def _batchtensor(self):
+        # Fallback path: performs the real multiply once (and says so on stdout).
+        if self._batchtensor_data is None:
+            with _enable_layers(self._levels):
+                print("bt multiply fallback")
+                self._batchtensor_data = self._lhs._batchtensor * self._rhs._batchtensor
+        return self._batchtensor_data
+
+    @property
+    def _tensor(self):
+        if self._tensor_data is None:
+            batched = Tensor.from_batched(self._batchtensor, self._has_device)
+            self._tensor_data = batched._tensor
+        return self._tensor_data
+
+    @property
+    def ndim(self):
+        return self._batchtensor.ndim
+
+    @property
+    def dims(self):
+        return ltuple(super().dims)
+
+    def sum(self, dim):
+        """Sum the product over ``dim`` with one einsum, without multiplying first."""
+        dims = _dims(dim, 0, False, False)
+        all_levels = self._levels
+
+        def to_char(d):
+            return chr(ord("a") + all_levels.index(d))
+
+        plhs, levelslhs = self._lhs._tensor, self._lhs._levels
+        prhs, levelsrhs = self._rhs._tensor, self._rhs._levels
+        new_levels = [l for l in all_levels if l not in dims]
+        lhs_sub = "".join(map(to_char, levelslhs))
+        rhs_sub = "".join(map(to_char, levelsrhs))
+        out_sub = "".join(map(to_char, new_levels))
+        result_data = torch.einsum(f"{lhs_sub},{rhs_sub}->{out_sub}", (plhs, prhs))
+        return Tensor.from_positional(result_data, new_levels, True)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/magic_trace.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/magic_trace.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c962a898ca79cfe3d8af7432aacc3802d4f4ade
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/magic_trace.py
@@ -0,0 +1,42 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import os
+import signal
+import subprocess
+from contextlib import contextmanager
+
+
+@contextmanager
+def magic_trace(output="trace.fxt", magic_trace_cache="/tmp/magic-trace"):
+    """Trace the current process with magic-trace while the ``with`` body runs.
+
+    Downloads the binary to ``magic_trace_cache`` when that path is missing.
+    Raises ValueError if magic-trace exits before attaching or exits non-zero.
+    """
+    if not os.path.exists(magic_trace_cache):
+        print(f"Downloading magic_trace to: {magic_trace_cache}")
+        url = "https://github.com/janestreet/magic-trace/releases/download/v1.0.2/magic-trace"
+        subprocess.run(["wget", "-O", magic_trace_cache, "-q", url])
+        subprocess.run(["chmod", "+x", magic_trace_cache])
+    args = [magic_trace_cache, "attach", "-pid", str(os.getpid()), "-o", output]
+    p = subprocess.Popen(args, stderr=subprocess.PIPE, encoding="utf-8")
+    while True:
+        x = p.stderr.readline()
+        if not x:  # EOF: tracer exited before attaching; don't loop forever
+            p.stderr.close()
+            raise ValueError(f"magic_trace exited abnormally: {p.wait()}")
+        print(x)
+        if "Attached" in x:
+            break
+    try:
+        yield
+    finally:
+        p.send_signal(signal.SIGINT)
+        r = p.wait()
+        print(p.stderr.read())
+        p.stderr.close()
+        if r != 0:
+            raise ValueError(f"magic_trace exited abnormally: {r}")
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/wrap_type.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/wrap_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2146c4a21a144dc3942e304d1406ace47df0e57
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/wrap_type.py
@@ -0,0 +1,71 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from types import (
+    BuiltinMethodType,
+    FunctionType,
+    GetSetDescriptorType,
+    MethodDescriptorType,
+    WrapperDescriptorType,
+)
+
+from functorch._C import dim as _C
+
+_wrap_method = _C._wrap_method
+
+FUNC_TYPES = (
+    FunctionType,
+    MethodDescriptorType,
+    BuiltinMethodType,
+    WrapperDescriptorType,
+)
+PROPERTY_TYPES = (GetSetDescriptorType, property)
+
+
+def _py_wrap_method(orig, __torch_function__):
+    """Return a function forwarding its call to ``__torch_function__`` for ``orig``."""
+    def impl(*positional, **keyword):
+        return __torch_function__(orig, None, positional, keyword)
+    return impl
+
+
+def wrap_type(use_c, to_patch, pattern, __torch_function__):
+    """Copy ``pattern``'s methods and properties onto ``to_patch`` as wrappers.
+
+    Every function or property found on ``pattern`` and its bases (``object``
+    excluded) that ``to_patch`` does not already override is installed on
+    ``to_patch``, wrapped by the chosen wrapper with ``__torch_function__``.
+
+    Args:
+        use_c: use the C ``_wrap_method`` when true, else ``_py_wrap_method``.
+        to_patch: class that receives the wrapped attributes.
+        pattern: class whose MRO supplies the attributes to wrap.
+        __torch_function__: handler passed to the chosen wrapper.
+    """
+    wrap_method = _wrap_method if use_c else _py_wrap_method
+    skipped = (
+        "__dict__", "__new__", "__init__", "__repr__",
+        "__weakref__", "__doc__", "__module__", "__dir__",
+    )
+
+    # Merge class dicts base-first so more derived definitions win; skip object.
+    members = {}
+    for klass in reversed(pattern.mro()[:-1]):
+        members.update(klass.__dict__)
+
+    for name, obj in members.items():
+        if name in skipped:
+            continue
+        # Leave anything to_patch already overrides alone; attributes merely
+        # inherited from object (like `__eq__`) still need to be patched.
+        if hasattr(to_patch, name):
+            if getattr(to_patch, name) is not getattr(object, name, None):
+                continue
+        if isinstance(obj, FUNC_TYPES):
+            setattr(to_patch, name, wrap_method(obj, __torch_function__))
+        elif isinstance(obj, PROPERTY_TYPES):
+            getter = wrap_method(obj.__get__, __torch_function__)
+            setattr(to_patch, name, property(getter))
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b32751d6e2493ab6a81f5a7f91a572553201f466
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__init__.py
@@ -0,0 +1,3 @@
+from .rearrange import rearrange
+
+__all__ = ["rearrange"]
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7a227ef9b0bb7d412d1932e16f8e47a44dc7dec
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/_parsing.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/_parsing.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7efe8a4d2413cea7c15db76c14c384b902eb6966
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/_parsing.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/rearrange.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/rearrange.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a88a12cfd1141217b8f1aa03cef5d171ea04368
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/rearrange.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/rearrange.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/rearrange.py
new file mode 100644
index 0000000000000000000000000000000000000000..0449bb7ed2c72ef68f966f253c99e8570dfbd7ef
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/rearrange.py
@@ -0,0 +1,207 @@
+from __future__ import annotations
+
+import functools
+from typing import Callable, Dict, List, Sequence, Tuple, Union
+
+import torch
+
+from functorch._C import dim as _C
+from ._parsing import (
+    _ellipsis,
+    AnonymousAxis,
+    comma_separate,
+    parse_pattern,
+    validate_rearrange_expressions,
+)
+
+__all__ = ["rearrange"]
+
+dims = _C.dims
+
+
+@functools.lru_cache(256)
+def _create_rearrange_callable(
+    tensor_ndim: int, pattern: str, **axes_lengths: int
+) -> Callable[[torch.Tensor], torch.Tensor]:
+    r"""Translate an `einops`-style pattern into a callable that performs the rearrange using first-class dimensions.
+
+    Since an equivalent result is computed for tensors with the same number of dimensions, with the same pattern and
+    specified axes lengths, this function can be memoized.
+
+    Args:
+        tensor_ndim (int): the number of dimensions in the tensor to rearrange
+        pattern (str): the `einops`-style rearrangement pattern
+        axes_lengths (int): any additional length specifications for dimensions
+
+    Returns:
+        Callable[[torch.Tensor], torch.Tensor]: a callable that performs the rearrangement
+    """
+    left, right = parse_pattern(pattern, axes_lengths)
+    validate_rearrange_expressions(left, right, axes_lengths)
+
+    n_anon_dims = sum(not dim for dim in left.composition)
+    if left.has_ellipsis:
+        n_ellipsis_dims = tensor_ndim - (len(left.composition) - 1)
+        n_named_dims = len(left.identifiers) - 1
+
+        if (pattern_ndim := n_anon_dims + n_named_dims) > tensor_ndim:
+            raise ValueError(
+                f"Number of dimensions in pattern ({pattern_ndim}) must be less than or equal to the number of "
+                f"dimensions in the tensor ({tensor_ndim})"
+            )
+    else:
+        n_ellipsis_dims = 0
+        n_named_dims = len(left.identifiers)
+
+        if (pattern_ndim := len(left.composition)) != tensor_ndim:
+            raise ValueError(
+                f"Number of dimensions in pattern ({pattern_ndim}) must be equal to the number of dimensions in "
+                f"the tensor ({tensor_ndim})"
+            )
+    n_dims = n_named_dims + n_ellipsis_dims + n_anon_dims
+
+    if n_dims == 0:
+        # an identity rearrangement on a 0-dimension tensor
+        return lambda tensor: tensor
+
+    first_class_dims: Tuple[str, ...] = tuple(f"d{i}" for i in range(n_dims))
+    identifier_dim_map: Dict[Union[str, AnonymousAxis], Tuple[str, ...]] = {}
+    anon_axes: List[AnonymousAxis] = []
+
+    # map the left-hand side identifiers to strings representing first class dims
+    dims_i = 0
+    for dimension in left.composition:
+        if isinstance(dimension, list):
+            for identifier in dimension:
+                # non-unitary anon axes are not allowed in rearrange & unitary anon axes are represented as empty lists
+                assert isinstance(identifier, str)
+                identifier_dim_map[identifier] = (first_class_dims[dims_i],)
+                dims_i += 1
+            if not dimension:
+                # unitary anonymous axis
+                anon_axis = AnonymousAxis("1")
+                identifier_dim_map[anon_axis] = (first_class_dims[dims_i],)
+                anon_axes.append(anon_axis)
+                dimension.append(anon_axis)
+                dims_i += 1
+        elif dimension == _ellipsis:
+            identifier = _ellipsis
+            identifier_dim_map[identifier] = tuple(
+                first_class_dims[dims_i + j] for j in range(n_ellipsis_dims)
+            )
+            dims_i += n_ellipsis_dims
+        else:
+            raise ValueError(f"Unexpected dimension: {dimension}")
+
+    def composition_to_dims(
+        composition: Sequence[Union[List[Union[str, AnonymousAxis]], str]]
+    ) -> List[Union[str, Tuple[str, ...]]]:
+        """Convert a `ParsedExpression.composition` into a `Tensor.__getitem__` index of strings representing first
+        class dims."""
+        dim_composition: List[Union[str, Tuple[str, ...]]] = []
+        for dimension in composition:
+            if isinstance(dimension, list):
+                dim_composition.append(
+                    tuple(
+                        dim
+                        for identifier in dimension
+                        for dim in identifier_dim_map[identifier]
+                    )
+                )
+            elif dimension == _ellipsis:
+                dim_composition.extend(identifier_dim_map[_ellipsis])
+            else:
+                raise ValueError(f"Unexpected dimension: {dimension}")
+        return dim_composition
+
+    left_dims = composition_to_dims(left.composition)
+    right_dims = composition_to_dims(right.composition)
+    anon_dims = tuple(identifier_dim_map[axis][0] for axis in anon_axes)
+    specified_lengths = tuple(
+        (identifier_dim_map[axis][0], length) for axis, length in axes_lengths.items()
+    )
+
+    custom_rearrange_callable_name = "do_rearrange"
+    custom_rearrange_callable_code = (
+        (
+            f"def {custom_rearrange_callable_name}(tensor):\n"
+            f"    {comma_separate(first_class_dims)} = dims({n_dims})\n"
+        )
+        # joining an empty sequence yields "", so no special case is needed
+        + "".join(f"    {dim}.size = {length}\n" for dim, length in specified_lengths)
+        + f"    tensor = tensor[{comma_separate(left_dims)}].order({comma_separate(right_dims)})\n"
+        + (
+            f"    return tensor.sum({comma_separate([anon_dims])}, keepdim=False)\n"
+            if anon_dims
+            else "    return tensor\n"
+        )
+    )
+
+    # Run the generated source in an explicit namespace: since Python 3.13
+    # (PEP 667) `exec` inside a function no longer populates what `locals()`
+    # returns, so looking the function up via `locals()` raises KeyError there.
+    # Module globals are passed so the generated code can resolve `dims`.
+    namespace: Dict[str, Callable[[torch.Tensor], torch.Tensor]] = {}
+    exec(custom_rearrange_callable_code, globals(), namespace)
+    return namespace[custom_rearrange_callable_name]
+
+
+def rearrange(
+    tensor: Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor, ...]],
+    pattern: str,
+    **axes_lengths: int,
+) -> torch.Tensor:
+    r"""Reorder the elements of a tensor according to an `einops`-style pattern.
+
+    Native counterpart of `einops.rearrange`: one pattern string can express
+    transpose (axes permutation), reshape (view), squeeze, unsqueeze, stack,
+    concatenate and similar operations.
+
+    See: https://einops.rocks/api/rearrange/
+
+    Args:
+        tensor (Tensor or sequence of Tensor): the input; a list or tuple of
+            tensors is first combined with `torch.stack`
+        pattern (str): the rearrangement pattern
+        axes_lengths (int): lengths for named axes, given as keyword arguments
+
+    Returns:
+        Tensor: the rearranged tensor
+
+    Examples:
+        >>> # a batch of 32 images stored as "h w c" (height-width-channel)
+        >>> images = torch.randn((32, 30, 40, 3))
+
+        >>> # identity pattern: the shape is unchanged
+        >>> rearrange(images, 'b h w c -> b h w c').shape
+        torch.Size([32, 30, 40, 3])
+
+        >>> # merge batch into height (vertical concatenation), 960 = 32 * 30
+        >>> rearrange(images, 'b h w c -> (b h) w c').shape
+        torch.Size([960, 40, 3])
+
+        >>> # merge batch into width (horizontal concatenation), 1280 = 32 * 40
+        >>> rearrange(images, 'b h w c -> h (b w) c').shape
+        torch.Size([30, 1280, 3])
+
+        >>> # move channels first: "b c h w" layout
+        >>> rearrange(images, 'b h w c -> b c h w').shape
+        torch.Size([32, 3, 30, 40])
+
+        >>> # flatten every image into one vector, 3600 = 30 * 40 * 3
+        >>> rearrange(images, 'b h w c -> b (c h w)').shape
+        torch.Size([32, 3600])
+
+        >>> # cut each image into 4 quadrants, 128 = 32 * 2 * 2
+        >>> rearrange(images, 'b (h1 h) (w1 w) c -> (b h1 w1) h w c', h1=2, w1=2).shape
+        torch.Size([128, 15, 20, 3])
+
+        >>> # space-to-depth operation
+        >>> rearrange(images, 'b (h h1) (w w1) c -> b h w (c h1 w1)', h1=2, w1=2).shape
+        torch.Size([32, 15, 20, 12])
+    """
+    # anything that is not already a Tensor is handed to torch.stack first
+    stacked = tensor if isinstance(tensor, torch.Tensor) else torch.stack(tensor)
+    # the generated callable is memoized on (ndim, pattern, axes_lengths)
+    do_rearrange = _create_rearrange_callable(stacked.ndim, pattern, **axes_lengths)
+    return do_rearrange(stacked)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/__pycache__/extrapolation.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/__pycache__/extrapolation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e5986013fa6cd921cc602202135dcd26b80a075
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/__pycache__/extrapolation.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/__pycache__/quadrature.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/__pycache__/quadrature.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbfb721a5d758f1532e0b2d45dd4497651bfff74
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/__pycache__/quadrature.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/approximation.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/approximation.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ca5cc598fb53491cb6ae4a41a40477c58544d53
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/approximation.py
@@ -0,0 +1,246 @@
+from ..libmp.backend import xrange
+from .calculus import defun
+
+#----------------------------------------------------------------------------#
+#                              Approximation methods                         #
+#----------------------------------------------------------------------------#
+
+# The Chebyshev approximation formula is given at:
+# http://mathworld.wolfram.com/ChebyshevApproximationFormula.html
+
+# The only major changes in the following code are that we return the
+# expanded polynomial coefficients instead of Chebyshev coefficients,
+# and that we automatically transform [a,b] -> [-1,1] and back
+# for convenience.
+
+# Coefficient in Chebyshev approximation
+def chebcoeff(ctx,f,a,b,j,N):
+    """Return the j-th Chebyshev coefficient of f on [a, b] from N sample nodes."""
+    acc, half = ctx.mpf(0), ctx.mpf(0.5)
+    for k in range(1, N+1):
+        node = ctx.cospi((k-half)/N)
+        acc += f(node*(b-a)*half + (b+a)*half) * ctx.cospi(j*(k-half)/N)
+    return 2*acc/N
+
+# Generate Chebyshev polynomials T_n(ax+b) in expanded form
+def chebT(ctx, a=1, b=0):
+    """Yield coefficient lists (lowest power first) of T_0(ax+b), T_1(ax+b), ..."""
+    prev, cur = [1], [b, a]
+    yield prev
+    while True:
+        yield cur
+        # Recurrence: T[n+1](ax+b) = 2*(ax+b)*T[n](ax+b) - T[n-1](ax+b)
+        nxt = [0] + [2*a*t for t in cur]
+        for i, c in enumerate(cur): nxt[i] += 2*b*c
+        for i, c in enumerate(prev): nxt[i] -= c
+        prev, cur = cur, nxt
+
+@defun
+def chebyfit(ctx, f, interval, N, error=False):
+    r"""
+    Computes a polynomial of degree `N-1` that approximates the
+    given function `f` on the interval `[a, b]`, returned as a list of
+    coefficients with the highest power first. With ``error=True``,
+    :func:`~mpmath.chebyfit` also returns an accurate estimate of the
+    maximum absolute error; that is, the maximum value of
+    `|f(x) - P(x)|` for `x \in [a, b]`.
+
+    :func:`~mpmath.chebyfit` uses the Chebyshev approximation formula,
+    which gives a nearly optimal solution: that is, the maximum
+    error of the approximating polynomial is very close to
+    the smallest possible for any polynomial of the same degree.
+
+    Chebyshev approximation is very useful if one needs repeated
+    evaluation of an expensive function, such as a function defined
+    implicitly by an integral or a differential equation. (For
+    example, it could be used to turn a slow mpmath function
+    into a fast machine-precision version of the same.)
+
+    **Examples**
+
+    Here we use :func:`~mpmath.chebyfit` to generate a low-degree approximation
+    of `f(x) = \cos(x)`, valid on the interval `[1, 2]`::
+
+        >>> from mpmath import *
+        >>> mp.dps = 15; mp.pretty = True
+        >>> poly, err = chebyfit(cos, [1, 2], 5, error=True)
+        >>> nprint(poly)
+        [0.00291682, 0.146166, -0.732491, 0.174141, 0.949553]
+        >>> nprint(err, 12)
+        1.61351758081e-5
+
+    The polynomial can be evaluated using ``polyval``::
+
+        >>> nprint(polyval(poly, 1.6), 12)
+        -0.0291858904138
+        >>> nprint(cos(1.6), 12)
+        -0.0291995223013
+
+    Sampling the true error at 1000 points shows that the error
+    estimate generated by ``chebyfit`` is remarkably good::
+
+        >>> error = lambda x: abs(cos(x) - polyval(poly, x))
+        >>> nprint(max([error(1+n/1000.) for n in range(1000)]), 12)
+        1.61349954245e-5
+
+    **Choice of degree**
+
+    The degree `N` can be set arbitrarily high, to obtain an
+    arbitrarily good approximation. As a rule of thumb, an
+    `N`-term Chebyshev approximation is good to `N/(b-a)` decimal
+    places on a unit interval (although this depends on how
+    well-behaved `f` is). The cost grows accordingly: ``chebyfit``
+    evaluates the function `N^2` times to compute the
+    coefficients and an additional `N` times to estimate the error.
+
+    **Possible issues**
+
+    One should be careful to use a sufficiently high working
+    precision both when calling ``chebyfit`` and when evaluating
+    the resulting polynomial, as the polynomial is sometimes
+    ill-conditioned. It is for example difficult to reach
+    15-digit accuracy when evaluating the polynomial using
+    machine precision floats, no matter the theoretical
+    accuracy of the polynomial. (The option to return the
+    coefficients in Chebyshev form should be made available
+    in the future.)
+
+    It is important to note the Chebyshev approximation works
+    poorly if `f` is not smooth. A function containing singularities,
+    rapid oscillation, etc. can be approximated more effectively by
+    multiplying it by a weight function that cancels out the
+    nonsmooth features, or by dividing the interval into several
+    segments.
+    """
+    a, b = ctx._as_points(interval)
+    saved_prec = ctx.prec
+    try:
+        # extra working precision; restored in the ``finally`` clause
+        ctx.prec = saved_prec + int(N**0.5) + 20
+        coeffs = [chebcoeff(ctx,f,a,b,k,N) for k in range(N)]
+        poly = [ctx.zero] * N
+        poly[0] = -coeffs[0]/2
+        half = ctx.mpf(0.5)
+        # expanded T_k of the affine map that sends [a, b] onto [-1, 1]
+        basis = chebT(ctx, ctx.mpf(2)/(b-a), ctx.mpf(-1)*(b+a)/(b-a))
+        for ck, Tk in zip(coeffs, basis):
+            for i, t in enumerate(Tk):
+                poly[i] += ck*t
+        poly.reverse()
+        # Estimate maximum error
+        err = ctx.zero
+        for k in range(N):
+            x = ctx.cos(ctx.pi*k/N) * (b-a)*half + (b+a)*half
+            err = max(err, abs(f(x) - ctx.polyval(poly, x)))
+    finally:
+        ctx.prec = saved_prec
+    return (poly, +err) if error else poly
+
+@defun
+def fourier(ctx, f, interval, N):
+    r"""
+    Computes the Fourier series of degree `N` of the given function
+    on the interval `[a, b]`. More precisely, :func:`~mpmath.fourier` returns
+    two lists `(c, s)` of coefficients (the cosine series and sine
+    series, respectively), such that
+
+    .. math ::
+
+        f(x) \sim \sum_{k=0}^N
+            c_k \cos(k m x) + s_k \sin(k m x)
+
+    where `m = 2 \pi / (b-a)`.
+
+    Both lists have length `N+1`. Coefficients whose absolute value is
+    smaller than ten times the working epsilon are returned as exact zeros.
+
+    Note that many texts define the first coefficient as `2 c_0` instead
+    of `c_0`. The easiest way to evaluate the computed series correctly
+    is to pass it to :func:`~mpmath.fourierval`.
+
+    **Examples**
+
+    The function `f(x) = x` has a simple Fourier series on the standard
+    interval `[-\pi, \pi]`. The cosine coefficients are all zero (because
+    the function has odd symmetry), and the sine coefficients are
+    rational numbers::
+
+        >>> from mpmath import *
+        >>> mp.dps = 15; mp.pretty = True
+        >>> c, s = fourier(lambda x: x, [-pi, pi], 5)
+        >>> nprint(c)
+        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+        >>> nprint(s)
+        [0.0, 2.0, -1.0, 0.666667, -0.5, 0.4]
+
+    This computes a Fourier series of a nonsymmetric function on
+    a nonstandard interval::
+
+        >>> I = [-1, 1.5]
+        >>> f = lambda x: x**2 - 4*x + 1
+        >>> cs = fourier(f, I, 4)
+        >>> nprint(cs[0])
+        [0.583333, 1.12479, -1.27552, 0.904708, -0.441296]
+        >>> nprint(cs[1])
+        [0.0, -2.6255, 0.580905, 0.219974, -0.540057]
+
+    It is instructive to plot a function along with its truncated
+    Fourier series::
+
+        >>> plot([f, lambda x: fourierval(cs, I, x)], I) #doctest: +SKIP
+
+    Fourier series generally converge slowly (and may not converge
+    pointwise). For example, if `f(x) = \cosh(x)`, a 10-term Fourier
+    series gives an `L^2` error corresponding to 2-digit accuracy::
+
+        >>> I = [-1, 1]
+        >>> cs = fourier(cosh, I, 9)
+        >>> g = lambda x: (cosh(x) - fourierval(cs, I, x))**2
+        >>> nprint(sqrt(quad(g, I)))
+        0.00467963
+
+    :func:`~mpmath.fourier` uses numerical quadrature. For nonsmooth functions,
+    the accuracy (and speed) can be improved by including all singular
+    points in the interval specification::
+
+        >>> nprint(fourier(abs, [-1, 1], 0), 10)
+        ([0.5000441648], [0.0])
+        >>> nprint(fourier(abs, [-1, 0, 1], 0), 10)
+        ([0.5], [0.0])
+
+    """
+    interval = ctx._as_points(interval)
+    period = interval[-1] - interval[0]
+    # coefficients smaller than this are treated as exact zeros
+    cutoff = ctx.eps*10
+    cos_series, sin_series = [], []
+    for n in xrange(N+1):
+        m = 2*n*ctx.pi/period
+        an = 2*ctx.quadgl(lambda t: f(t)*ctx.cos(m*t), interval)/period
+        bn = 2*ctx.quadgl(lambda t: f(t)*ctx.sin(m*t), interval)/period
+        if n == 0:
+            # the constant term is defined as c_0, not 2*c_0
+            an /= 2
+        cos_series.append(ctx.zero if abs(an) < cutoff else an)
+        sin_series.append(ctx.zero if abs(bn) < cutoff else bn)
+    return cos_series, sin_series
+
+@defun
+def fourierval(ctx, series, interval, x):
+    """
+    Evaluates a Fourier series (in the format computed
+    by :func:`~mpmath.fourier` for the given interval) at the point `x`.
+
+    The series should be a pair `(c, s)` where `c` is the
+    cosine series and `s` is the sine series. The two lists
+    need not have the same length.
+    """
+    cs, ss = series
+    ab = ctx._as_points(interval)
+    # angular frequency of the fundamental; only the interval endpoints matter
+    m = 2*ctx.pi/(ab[-1]-ab[0])
+    s = ctx.zero
+    # zero coefficients are skipped so their cos/sin is never evaluated
+    s += ctx.fsum(c*ctx.cos(m*n*x) for n, c in enumerate(cs) if c)
+    s += ctx.fsum(c*ctx.sin(m*n*x) for n, c in enumerate(ss) if c)
+    return s
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..293697b9fcf8bd82d58ac4ff45acd73fadac82f9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/__init__.py
@@ -0,0 +1,2 @@
+from . import eigen           # to set methods
+from . import eigen_symmetric # to set methods
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/__pycache__/calculus.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/__pycache__/calculus.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b74f0998c300da19fe3afa7884007e82feb24eed
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/__pycache__/calculus.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/calculus.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/calculus.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fae2a7a9a29898241ed41810331b480ff70798f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/calculus.py
@@ -0,0 +1,531 @@
+from ..libmp.backend import xrange
+
+# TODO: should use diagonalization-based algorithms
+
+class MatrixCalculusMethods(object):
+
+    def _exp_pade(ctx, a):
+        """
+        Evaluate the matrix exponential of `a` with a diagonal Pade
+        approximant combined with scaling and repeated squaring.
+
+        Reference: G. H. Golub, C. F. van Loan, 'Matrix Computations',
+        third Ed., page 572.
+
+        TODO:
+         - find a good estimate for q
+         - reduce the number of matrix multiplications to improve
+           performance
+        """
+        def eps_pade(p):
+            return ctx.mpf(2)**(3-2*p) * \
+                ctx.factorial(p)**2/(ctx.factorial(2*p)**2 * (2*p + 1))
+        # smallest order (from 4 upwards) whose estimate drops below eps
+        q = 4
+        while not eps_pade(q) < ctx.eps:
+            q += 1
+        q += 8    # safety margin on top of the estimated order
+        # number of halvings applied to the argument (at least one)
+        j = int(max(1, ctx.mag(ctx.mnorm(a,'inf'))))
+        saved_prec = ctx.prec
+        ctx.dps += q + 3
+        try:
+            a = a/2**j
+            size = a.rows
+            numer = ctx.eye(size)
+            denom = ctx.eye(size)
+            power = ctx.eye(size)
+            coeff = ctx.mpf(1)
+            for k in range(1, q+1):
+                coeff *= ctx.mpf(q - k + 1)/((2*q - k + 1) * k)
+                power = a*power
+                term = coeff*power
+                numer += term
+                denom += (-1)**k * term
+            result = ctx.lu_solve_mat(denom, numer)
+            # undo the scaling: exp(a) = exp(a/2**j)**(2**j)
+            for _ in range(j):
+                result = result*result
+        finally:
+            ctx.prec = saved_prec
+        return result*1
+
+    def expm(ctx, A, method='taylor'):
+        r"""
+        Returns the matrix exponential of a square matrix `A`, i.e. the sum
+        of the power series
+
+        .. math ::
+
+            \exp(A) = I + A + \frac{A^2}{2!} + \frac{A^3}{3!} + \ldots
+
+        The default method='taylor' sums the Taylor series of a scaled
+        copy of `A` and squares the result back; method='pade' evaluates
+        a Pade approximant instead.
+
+        **Examples**
+
+        Basic examples::
+
+            >>> from mpmath import *
+            >>> mp.dps = 15; mp.pretty = True
+            >>> expm(zeros(3))
+            [1.0  0.0  0.0]
+            [0.0  1.0  0.0]
+            [0.0  0.0  1.0]
+            >>> expm(eye(3))
+            [2.71828182845905               0.0               0.0]
+            [             0.0  2.71828182845905               0.0]
+            [             0.0               0.0  2.71828182845905]
+            >>> expm([[1,1,0],[1,0,1],[0,1,0]])
+            [ 3.86814500615414  2.26812870852145  0.841130841230196]
+            [ 2.26812870852145  2.44114713886289   1.42699786729125]
+            [0.841130841230196  1.42699786729125    1.6000162976327]
+            >>> expm([[1,1,0],[1,0,1],[0,1,0]], method='pade')
+            [ 3.86814500615414  2.26812870852145  0.841130841230196]
+            [ 2.26812870852145  2.44114713886289   1.42699786729125]
+            [0.841130841230196  1.42699786729125    1.6000162976327]
+            >>> expm([[1+j, 0], [1+j,1]])
+            [(1.46869393991589 + 2.28735528717884j)                        0.0]
+            [  (1.03776739863568 + 3.536943175722j)  (2.71828182845905 + 0.0j)]
+
+        Matrices with large entries are allowed::
+
+            >>> expm(matrix([[1,2],[2,3]])**25)
+            [5.65024064048415e+2050488462815550  9.14228140091932e+2050488462815550]
+            [9.14228140091932e+2050488462815550  1.47925220414035e+2050488462815551]
+
+        The identity `\exp(A+B) = \exp(A) \exp(B)` does not hold for
+        noncommuting matrices::
+
+            >>> A = hilbert(3)
+            >>> B = A + eye(3)
+            >>> chop(mnorm(A*B - B*A))
+            0.0
+            >>> chop(mnorm(expm(A+B) - expm(A)*expm(B)))
+            0.0
+            >>> B = A + ones(3)
+            >>> mnorm(A*B - B*A)
+            1.8
+            >>> mnorm(expm(A+B) - expm(A)*expm(B))
+            42.0927851137247
+
+        """
+        saved_prec = ctx.prec
+        if method == 'pade':
+            try:
+                A = ctx.matrix(A)
+                ctx.prec += 2*A.rows
+                return ctx._exp_pade(A)
+            finally:
+                ctx.prec = saved_prec
+        A = ctx.matrix(A)
+        # Taylor series of A/2**squarings, evaluated with guard bits
+        squarings = int(max(1, ctx.mag(ctx.mnorm(A,'inf'))))
+        squarings += int(0.5*saved_prec**0.5)
+        try:
+            ctx.prec += 10 + 2*squarings
+            tol = +ctx.eps
+            A = A/2**squarings
+            term = A
+            total = A**0 + A
+            order = 2
+            while True:
+                term *= A * (1/ctx.mpf(order))
+                if ctx.mnorm(term, 'inf') < tol:
+                    break
+                total += term
+                order += 1
+            # undo the scaling by repeated squaring
+            for _ in xrange(squarings):
+                total = total*total
+        finally:
+            ctx.prec = saved_prec
+        total *= 1
+        return total
+
+    def cosm(ctx, A):
+        r"""
+        Gives the cosine of a square matrix `A`, i.e. `(\exp(iA)+\exp(-iA))/2`.
+
+        Examples::
+
+            >>> from mpmath import *
+            >>> mp.dps = 15; mp.pretty = True
+            >>> X = eye(3)
+            >>> cosm(X)
+            [0.54030230586814               0.0               0.0]
+            [             0.0  0.54030230586814               0.0]
+            [             0.0               0.0  0.54030230586814]
+            >>> X = hilbert(3)
+            >>> cosm(X)
+            [ 0.424403834569555  -0.316643413047167  -0.221474945949293]
+            [-0.316643413047167   0.820646708837824  -0.127183694770039]
+            [-0.221474945949293  -0.127183694770039   0.909236687217541]
+            >>> X = matrix([[1+j,-2],[0,-j]])
+            >>> cosm(X)
+            [(0.833730025131149 - 0.988897705762865j)  (1.07485840848393 - 0.17192140544213j)]
+            [                                     0.0               (1.54308063481524 + 0.0j)]
+        """
+        A = ctx.matrix(A)  # also accept nested lists, like expm/sqrtm/logm/powm
+        B = 0.5 * (ctx.expm(A*ctx.j) + ctx.expm(A*(-ctx.j)))
+        if not sum(A.apply(ctx.im).apply(abs)):  # input has no imaginary part: return a real result
+            B = B.apply(ctx.re)
+        return B
+
+    def sinm(ctx, A):
+        r"""
+        Gives the sine of a square matrix `A`, i.e. `(\exp(iA)-\exp(-iA))/(2i)`.
+
+        Examples::
+
+            >>> from mpmath import *
+            >>> mp.dps = 15; mp.pretty = True
+            >>> X = eye(3)
+            >>> sinm(X)
+            [0.841470984807897                0.0                0.0]
+            [              0.0  0.841470984807897                0.0]
+            [              0.0                0.0  0.841470984807897]
+            >>> X = hilbert(3)
+            >>> sinm(X)
+            [0.711608512150994  0.339783913247439  0.220742837314741]
+            [0.339783913247439  0.244113865695532  0.187231271174372]
+            [0.220742837314741  0.187231271174372  0.155816730769635]
+            >>> X = matrix([[1+j,-2],[0,-j]])
+            >>> sinm(X)
+            [(1.29845758141598 + 0.634963914784736j)  (-1.96751511930922 + 0.314700021761367j)]
+            [                                    0.0                  (0.0 - 1.1752011936438j)]
+        """
+        A = ctx.matrix(A)  # also accept nested lists, like expm/sqrtm/logm/powm
+        B = (-0.5j) * (ctx.expm(A*ctx.j) - ctx.expm(A*(-ctx.j)))
+        if not sum(A.apply(ctx.im).apply(abs)):  # input has no imaginary part: return a real result
+            B = B.apply(ctx.re)
+        return B
+
+    def _sqrtm_rot(ctx, A, _may_rotate):
+        # Fallback for a non-converging iteration: take the square root of
+        # the matrix scaled by a complex factor, then divide the factor out.
+        rot = ctx.j**0.3
+        return ctx.sqrtm(rot*A, _may_rotate) / ctx.sqrt(rot)
+
+    def sqrtm(ctx, A, _may_rotate=2):
+        r"""
+        Computes a square root of the square matrix `A`, i.e. returns
+        a matrix `B = A^{1/2}` such that `B^2 = A`. The square root
+        of a matrix, if it exists, is not unique.
+
+        **Examples**
+
+        Square roots of some simple matrices::
+
+            >>> from mpmath import *
+            >>> mp.dps = 15; mp.pretty = True
+            >>> sqrtm([[1,0], [0,1]])
+            [1.0  0.0]
+            [0.0  1.0]
+            >>> sqrtm([[0,0], [0,0]])
+            [0.0  0.0]
+            [0.0  0.0]
+            >>> sqrtm([[2,0],[0,1]])
+            [1.4142135623731  0.0]
+            [            0.0  1.0]
+            >>> sqrtm([[1,1],[1,0]])
+            [ (0.920442065259926 - 0.21728689675164j)  (0.568864481005783 + 0.351577584254143j)]
+            [(0.568864481005783 + 0.351577584254143j)  (0.351577584254143 - 0.568864481005783j)]
+            >>> sqrtm([[1,0],[0,1]])
+            [1.0  0.0]
+            [0.0  1.0]
+            >>> sqrtm([[-1,0],[0,1]])
+            [(0.0 - 1.0j)           0.0]
+            [         0.0  (1.0 + 0.0j)]
+            >>> sqrtm([[j,0],[0,j]])
+            [(0.707106781186547 + 0.707106781186547j)                                       0.0]
+            [                                     0.0  (0.707106781186547 + 0.707106781186547j)]
+
+        A square root of a rotation matrix, giving the corresponding
+        half-angle rotation matrix::
+
+            >>> t1 = 0.75
+            >>> t2 = t1 * 0.5
+            >>> A1 = matrix([[cos(t1), -sin(t1)], [sin(t1), cos(t1)]])
+            >>> A2 = matrix([[cos(t2), -sin(t2)], [sin(t2), cos(t2)]])
+            >>> sqrtm(A1)
+            [0.930507621912314  -0.366272529086048]
+            [0.366272529086048   0.930507621912314]
+            >>> A2
+            [0.930507621912314  -0.366272529086048]
+            [0.366272529086048   0.930507621912314]
+
+        The identity `(A^2)^{1/2} = A` does not necessarily hold::
+
+            >>> A = matrix([[4,1,4],[7,8,9],[10,2,11]])
+            >>> sqrtm(A**2)
+            [ 4.0  1.0   4.0]
+            [ 7.0  8.0   9.0]
+            [10.0  2.0  11.0]
+            >>> sqrtm(A)**2
+            [ 4.0  1.0   4.0]
+            [ 7.0  8.0   9.0]
+            [10.0  2.0  11.0]
+            >>> A = matrix([[-4,1,4],[7,-8,9],[10,2,11]])
+            >>> sqrtm(A**2)
+            [  7.43715112194995  -0.324127569985474   1.8481718827526]
+            [-0.251549715716942    9.32699765900402  2.48221180985147]
+            [  4.11609388833616   0.775751877098258   13.017955697342]
+            >>> chop(sqrtm(A)**2)
+            [-4.0   1.0   4.0]
+            [ 7.0  -8.0   9.0]
+            [10.0   2.0  11.0]
+
+        For some matrices, a square root does not exist::
+
+            >>> sqrtm([[0,1], [0,0]])
+            Traceback (most recent call last):
+              ...
+            ZeroDivisionError: matrix is numerically singular
+
+        Two examples from the documentation for Matlab's ``sqrtm``::
+
+            >>> mp.dps = 15; mp.pretty = True
+            >>> sqrtm([[7,10],[15,22]])
+            [1.56669890360128  1.74077655955698]
+            [2.61116483933547  4.17786374293675]
+            >>>
+            >>> X = matrix(\
+            ...   [[5,-4,1,0,0],
+            ...   [-4,6,-4,1,0],
+            ...   [1,-4,6,-4,1],
+            ...   [0,1,-4,6,-4],
+            ...   [0,0,1,-4,5]])
+            >>> Y = matrix(\
+            ...   [[2,-1,-0,-0,-0],
+            ...   [-1,2,-1,0,-0],
+            ...   [0,-1,2,-1,0],
+            ...   [-0,0,-1,2,-1],
+            ...   [-0,-0,-0,-1,2]])
+            >>> mnorm(sqrtm(X) - Y)
+            4.53155328326114e-19
+
+        """
+        A = ctx.matrix(A)
+        # Trivial case: the zero matrix is returned as its own square root
+        if A*0 == A:
+            return A
+        prec = ctx.prec  # restored in the finally clause below
+        if _may_rotate:  # remaining number of rotation fallbacks allowed
+            d = ctx.det(A)
+            if abs(ctx.im(d)) < 16*ctx.eps and ctx.re(d) < 0:  # determinant is (numerically) real and negative
+                return ctx._sqrtm_rot(A, _may_rotate-1)
+        try:
+            ctx.prec += 10  # guard bits for the iteration
+            tol = ctx.eps * 128
+            Y = A
+            Z = I = A**0  # NOTE: I is not referenced again in this method
+            k = 0
+            # Denman-Beavers iteration: Y is the square-root iterate, Z its companion
+            while 1:
+                Yprev = Y
+                try:
+                    Y, Z = 0.5*(Y+ctx.inverse(Z)), 0.5*(Z+ctx.inverse(Y))
+                except ZeroDivisionError:  # an iterate could not be inverted
+                    if _may_rotate:
+                        Y = ctx._sqrtm_rot(A, _may_rotate-1)
+                        break
+                    else:
+                        raise
+                mag1 = ctx.mnorm(Y-Yprev, 'inf')  # size of the last update
+                mag2 = ctx.mnorm(Y, 'inf')  # size of the current iterate
+                if mag1 <= mag2*tol:  # converged to relative tolerance
+                    break
+                if _may_rotate and k > 6 and not mag1 < mag2 * 0.001:  # still far from convergence: rotate instead
+                    return ctx._sqrtm_rot(A, _may_rotate-1)
+                k += 1
+                if k > ctx.prec:  # iteration cap tied to the working precision
+                    raise ctx.NoConvergence
+        finally:
+            ctx.prec = prec
+        Y *= 1  # performed after the caller's precision has been restored
+        return Y
+
+    def logm(ctx, A):
+        r"""
+        Returns a logarithm of the square matrix `A`, that is, a matrix
+        `B = \log(A)` satisfying `\exp(B) = A`. When a logarithm
+        exists it is not unique.
+
+        **Examples**
+
+        Logarithms of some simple matrices::
+
+            >>> from mpmath import *
+            >>> mp.dps = 15; mp.pretty = True
+            >>> X = eye(3)
+            >>> logm(X)
+            [0.0  0.0  0.0]
+            [0.0  0.0  0.0]
+            [0.0  0.0  0.0]
+            >>> logm(2*X)
+            [0.693147180559945                0.0                0.0]
+            [              0.0  0.693147180559945                0.0]
+            [              0.0                0.0  0.693147180559945]
+            >>> logm(expm(X))
+            [1.0  0.0  0.0]
+            [0.0  1.0  0.0]
+            [0.0  0.0  1.0]
+
+        A logarithm of a complex matrix::
+
+            >>> X = matrix([[2+j, 1, 3], [1-j, 1-2*j, 1], [-4, -5, j]])
+            >>> B = logm(X)
+            >>> nprint(B)
+            [ (0.808757 + 0.107759j)    (2.20752 + 0.202762j)   (1.07376 - 0.773874j)]
+            [ (0.905709 - 0.107795j)  (0.0287395 - 0.824993j)  (0.111619 + 0.514272j)]
+            [(-0.930151 + 0.399512j)   (-2.06266 - 0.674397j)  (0.791552 + 0.519839j)]
+            >>> chop(expm(B))
+            [(2.0 + 1.0j)           1.0           3.0]
+            [(1.0 - 1.0j)  (1.0 - 2.0j)           1.0]
+            [        -4.0          -5.0  (0.0 + 1.0j)]
+
+        A matrix `X` close to the identity matrix, for which
+        `\log(\exp(X)) = \exp(\log(X)) = X` holds::
+
+            >>> X = eye(3) + hilbert(3)/4
+            >>> X
+            [              1.25             0.125  0.0833333333333333]
+            [             0.125  1.08333333333333              0.0625]
+            [0.0833333333333333            0.0625                1.05]
+            >>> logm(expm(X))
+            [              1.25             0.125  0.0833333333333333]
+            [             0.125  1.08333333333333              0.0625]
+            [0.0833333333333333            0.0625                1.05]
+            >>> expm(logm(X))
+            [              1.25             0.125  0.0833333333333333]
+            [             0.125  1.08333333333333              0.0625]
+            [0.0833333333333333            0.0625                1.05]
+
+        A logarithm of a rotation matrix, giving back the angle of
+        the rotation::
+
+            >>> t = 3.7
+            >>> A = matrix([[cos(t),sin(t)],[-sin(t),cos(t)]])
+            >>> chop(logm(A))
+            [             0.0  -2.58318530717959]
+            [2.58318530717959                0.0]
+            >>> (2*pi-t)
+            2.58318530717959
+
+        For some matrices, a logarithm does not exist::
+
+            >>> logm([[1,0], [0,0]])
+            Traceback (most recent call last):
+              ...
+            ZeroDivisionError: matrix is numerically singular
+
+        Logarithm of a matrix with large entries::
+
+            >>> logm(hilbert(3) * 10**20).apply(re)
+            [ 45.5597513593433  1.27721006042799  0.317662687717978]
+            [ 1.27721006042799  42.5222778973542   2.24003708791604]
+            [0.317662687717978  2.24003708791604    42.395212822267]
+
+        """
+        A = ctx.matrix(A)
+        saved_prec = ctx.prec
+        try:
+            ctx.prec += 10
+            tol = ctx.eps * 128
+            identity = A**0
+            root = A
+            halvings = 0
+            while True:  # take square roots until close to the identity
+                root = ctx.sqrtm(root)
+                halvings += 1
+                if ctx.mnorm(root-identity, 'inf') < 0.125:
+                    break
+            power = X = root-identity
+            series = X*0
+            k = 1
+            while True:  # alternating series X - X^2/2 + X^3/3 - ...
+                if k % 2:
+                    series += power / k
+                else:
+                    series -= power / k
+                power *= X
+                if ctx.mnorm(power, 'inf') < tol:
+                    break
+                k += 1
+                if k > ctx.prec:
+                    raise ctx.NoConvergence
+        finally:
+            ctx.prec = saved_prec
+        series *= 2**halvings
+        return series
+
+    def powm(ctx, A, r):
+        r"""
+        Computes `A^r = \exp(r \log A)` for a square matrix `A` and complex
+        number `r`.
+
+        **Examples**
+
+        Powers and inverse powers of a matrix::
+
+            >>> from mpmath import *
+            >>> mp.dps = 15; mp.pretty = True
+            >>> A = matrix([[4,1,4],[7,8,9],[10,2,11]])
+            >>> powm(A, 2)
+            [ 63.0  20.0   69.0]
+            [174.0  89.0  199.0]
+            [164.0  48.0  179.0]
+            >>> chop(powm(powm(A, 4), 1/4.))
+            [ 4.0  1.0   4.0]
+            [ 7.0  8.0   9.0]
+            [10.0  2.0  11.0]
+            >>> powm(extraprec(20)(powm)(A, -4), -1/4.)
+            [ 4.0  1.0   4.0]
+            [ 7.0  8.0   9.0]
+            [10.0  2.0  11.0]
+            >>> chop(powm(powm(A, 1+0.5j), 1/(1+0.5j)))
+            [ 4.0  1.0   4.0]
+            [ 7.0  8.0   9.0]
+            [10.0  2.0  11.0]
+            >>> powm(extraprec(5)(powm)(A, -1.5), -1/(1.5))
+            [ 4.0  1.0   4.0]
+            [ 7.0  8.0   9.0]
+            [10.0  2.0  11.0]
+
+        A Fibonacci-generating matrix::
+
+            >>> powm([[1,1],[1,0]], 10)
+            [89.0  55.0]
+            [55.0  34.0]
+            >>> fib(10)
+            55.0
+            >>> powm([[1,1],[1,0]], 6.5)
+            [(16.5166626964253 - 0.0121089837381789j)  (10.2078589271083 + 0.0195927472575932j)]
+            [(10.2078589271083 + 0.0195927472575932j)  (6.30880376931698 - 0.0317017309957721j)]
+            >>> (phi**6.5 - (1-phi)**6.5)/sqrt(5)
+            (10.2078589271083 - 0.0195927472575932j)
+            >>> powm([[1,1],[1,0]], 6.2)
+            [ (14.3076953002666 - 0.008222855781077j)  (8.81733464837593 + 0.0133048601383712j)]
+            [(8.81733464837593 + 0.0133048601383712j)  (5.49036065189071 - 0.0215277159194482j)]
+            >>> (phi**6.2 - (1-phi)**6.2)/sqrt(5)
+            (8.81733464837593 - 0.0133048601383712j)
+
+        """
+        A = ctx.matrix(A)
+        r = ctx.convert(r)
+        prec = ctx.prec
+        try:
+            ctx.prec += 10
+            # isint() also accepts complex r with zero imaginary part; int() needs the real part
+            if ctx.isint(r):  # integer exponent: plain matrix power
+                v = A ** int(ctx.re(r))
+            elif ctx.isint(r*2):  # half-integer exponent: power of the square root
+                v = ctx.sqrtm(A) ** int(ctx.re(r*2))
+            else:  # general exponent: exp(r*log(A))
+                v = ctx.expm(r*ctx.logm(A))
+        finally:
+            ctx.prec = prec
+        v *= 1
+        return v
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/eigen.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/eigen.py
new file mode 100644
index 0000000000000000000000000000000000000000..885d604203195b695183329acc637de91aeaf5ea
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/eigen.py
@@ -0,0 +1,877 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+##################################################################################################
+#     module for the eigenvalue problem
+#       Copyright 2013 Timo Hartmann (thartmann15 at gmail.com)
+#
+# todo:
+#  - implement balancing
+#  - aggressive early deflation
+#
+##################################################################################################
+
+"""
+The eigenvalue problem
+----------------------
+
+This file contains routines for the eigenvalue problem.
+
+high level routines:
+
+  hessenberg : reduction of a real or complex square matrix to upper Hessenberg form
+  schur : reduction of a real or complex square matrix to upper Schur form
+  eig : eigenvalues and eigenvectors of a real or complex square matrix
+
+low level routines:
+
+  hessenberg_reduce_0 : reduction of a real or complex square matrix to upper Hessenberg form
+  hessenberg_reduce_1 : auxiliary routine to hessenberg_reduce_0
+  qr_step : a single implicitly shifted QR step for an upper Hessenberg matrix
+  hessenberg_qr : Schur decomposition of an upper Hessenberg matrix
+  eig_tr_r : right eigenvectors of an upper triangular matrix
+  eig_tr_l : left  eigenvectors of an upper triangular matrix
+"""
+
+from ..libmp.backend import xrange
+
+class Eigen(object):  # empty holder class; defun() attaches the decorated routines to it
+    pass
+
+def defun(f):  # decorator: registers f as an attribute of Eigen under f.__name__
+    setattr(Eigen, f.__name__, f)
+    return f  # f itself is returned unchanged
+
+def hessenberg_reduce_0(ctx, A, T):
+    """
+    This routine computes the (upper) Hessenberg decomposition of a square matrix A.
+    Given A, a unitary matrix Q is calculated such that
+
+               Q' A Q = H              and             Q' Q = Q Q' = 1
+
+    where H is an upper Hessenberg matrix, meaning that it only contains zeros
+    below the first subdiagonal. Here ' denotes the hermitian transpose (i.e.
+    transposition and conjugation).
+
+    parameters:
+      A         (input/output) On input, A contains the square matrix A of
+                dimension (n,n). On output, A contains a compressed representation
+                of Q and H.
+      T         (output) An array of length n; T[i] receives the entry of the i-th
+                Householder vector that belongs to column i-1 (the rest stays in A).
+    """
+
+    # internally we work with Householder reflections from the right.
+    # let u be a row vector (i.e. u[i]=A[i,:i]). then
+    # Q is built up by reflectors of the type (1-v'v) where v is a suitable
+    # modification of u. these reflectors are applied to A from the right.
+    # because we work with reflectors from the right we have to start with
+    # the bottom row of A and work then upwards (this corresponds to
+    # some kind of RQ decomposition).
+    # the first part of the vectors v (i.e. A[i,:(i-1)]) are stored as row vectors
+    # in the lower left part of A (excluding the diagonal and subdiagonal).
+    # the last entry of v is stored in T.
+    # the upper right part of A (including diagonal and subdiagonal) becomes H.
+
+
+    n = A.rows
+    if n <= 2: return
+
+    for i in xrange(n-1, 1, -1):
+
+        # scale the vector (sum of |re| + |im| over the entries left of the diagonal in row i)
+
+        scale = 0
+        for k in xrange(0, i):
+            scale += abs(ctx.re(A[i,k])) + abs(ctx.im(A[i,k]))
+
+        scale_inv = 0
+        if scale != 0:
+            scale_inv = 1 / scale
+
+        if scale == 0 or ctx.isinf(scale_inv):
+            # sadly there are floating point numbers not equal to zero whose reciprocal is infinity
+            T[i] = 0
+            A[i,i-1] = 0
+            continue
+
+        # calculate parameters for the Householder transformation
+
+        H = 0  # accumulates the squared 2-norm of the scaled row
+        for k in xrange(0, i):
+            A[i,k] *= scale_inv
+            rr = ctx.re(A[i,k])
+            ii = ctx.im(A[i,k])
+            H += rr * rr + ii * ii
+
+        F = A[i,i-1]
+        f = abs(F)
+        G = ctx.sqrt(H)
+        A[i,i-1] = - G * scale  # new subdiagonal entry (phase applied below when F != 0)
+
+        if f == 0:
+            T[i] = G
+        else:
+            ff = F / f  # unit-modulus phase factor of F
+            T[i] = F + G * ff
+            A[i,i-1] *= ff
+
+        H += G * f
+        H = 1 / ctx.sqrt(H)  # from here on H is the normalization factor of the reflector
+
+        T[i] *= H
+        for k in xrange(0, i - 1):
+            A[i,k] *= H
+
+        for j in xrange(0, i):
+            # apply Householder transformation (from right)
+
+            G = ctx.conj(T[i]) * A[j,i-1]
+            for k in xrange(0, i-1):
+                G += ctx.conj(A[i,k]) * A[j,k]
+
+            A[j,i-1] -= G * T[i]
+            for k in xrange(0, i-1):
+                A[j,k] -= G * A[i,k]
+
+        for j in xrange(0, n):
+            # apply Householder transformation (from left)
+
+            G = T[i] * A[i-1,j]
+            for k in xrange(0, i-1):
+                G += A[i,k] * A[k,j]
+
+            A[i-1,j] -= G * ctx.conj(T[i])
+            for k in xrange(0, i-1):
+                A[k,j] -= G * ctx.conj(A[i,k])
+
+
+
+def hessenberg_reduce_1(ctx, A, T):
+    """
+    This routine forms the unitary matrix Q described in hessenberg_reduce_0.
+
+    parameters:
+      A    (input/output) On input, A is the same matrix as delivered by
+           hessenberg_reduce_0. On output, A is set to Q.
+
+      T    (input) On input, T is the same array as delivered by hessenberg_reduce_0.
+    """
+
+    n = A.rows
+
+    if n == 1:
+        A[0,0] = 1
+        return
+
+    A[0,0] = A[1,1] = 1  # start from the identity in the leading 2x2 block
+    A[0,1] = A[1,0] = 0
+
+    for i in xrange(2, n):
+        if T[i] != 0:  # T[i] == 0 was stored when no reflector was built for row i
+
+            for j in xrange(0, i):
+                G = T[i] * A[i-1,j]
+                for k in xrange(0, i-1):
+                    G += A[i,k] * A[k,j]
+
+                A[i-1,j] -= G * ctx.conj(T[i])
+                for k in xrange(0, i-1):
+                    A[k,j] -= G * ctx.conj(A[i,k])
+
+        A[i,i] = 1  # row and column i are overwritten with those of the identity
+        for j in xrange(0, i):
+            A[j,i] = A[i,j] = 0
+
+
+
+@defun
+def hessenberg(ctx, A, overwrite_a = False):
+    """
+    Reduce a square matrix A to upper Hessenberg form by a unitary similarity
+    transformation, i.e. find a unitary matrix Q with
+
+          Q' A Q = H                and               Q' Q = Q Q' = 1
+
+    such that H is an upper right Hessenberg matrix. The symbol ' stands for
+    the hermitian transpose (transposition combined with conjugation).
+
+    input:
+      A            : a real or complex square matrix
+      overwrite_a  : if true, A may be used as work space and overwritten,
+                     which may improve performance. if false, A is left intact.
+
+    output:
+      Q : an unitary matrix
+      H : an upper right Hessenberg matrix
+
+    example:
+      >>> from mpmath import mp
+      >>> A = mp.matrix([[3, -1, 2], [2, 5, -5], [-2, -3, 7]])
+      >>> Q, H = mp.hessenberg(A)
+      >>> mp.nprint(H, 3) # doctest:+SKIP
+      [  3.15  2.23  4.44]
+      [-0.769  4.85  3.05]
+      [   0.0  3.61   7.0]
+      >>> print(mp.chop(A - Q * H * Q.transpose_conj()))
+      [0.0  0.0  0.0]
+      [0.0  0.0  0.0]
+      [0.0  0.0  0.0]
+
+    return value:   (Q, H)
+    """
+
+    size = A.rows
+
+    if size == 1:
+        # a 1x1 matrix is already in Hessenberg form
+        return (ctx.matrix([[1]]), A)
+
+    H = A if overwrite_a else A.copy()
+    reflectors = ctx.matrix(size, 1)
+
+    hessenberg_reduce_0(ctx, H, reflectors)
+    Q = H.copy()
+    hessenberg_reduce_1(ctx, Q, reflectors)
+
+    # wipe the reflector data stored below the first subdiagonal
+    for col in xrange(size):
+        for row in xrange(col+2, size):
+            H[row,col] = 0
+
+    return Q, H
+
+
+###########################################################################
+
+
+def qr_step(ctx, n0, n1, A, Q, shift):
+    """
+    This subroutine executes a single implicitly shifted QR step applied to an
+    upper Hessenberg matrix A. Given A and shift as input, first a QR
+    decomposition is calculated:
+
+      Q R = A - shift * 1 .
+
+    The output is then the following matrix:
+
+      R Q + shift * 1
+
+    parameters:
+      n0, n1    (input) Two integers which specify the submatrix A[n0:n1,n0:n1]
+                on which this subroutine operates. The subdiagonal elements
+                to the left and below this submatrix must be deflated (i.e. zero).
+                The following restriction is imposed: n1>=n0+2
+      A         (input/output) On input, A is an upper Hessenberg matrix.
+                On output, A is replaced by "R Q + shift * 1"
+      Q         (input/output) The parameter Q is multiplied by the unitary matrix
+                Q arising from the QR decomposition. Q can also be false, in which
+                case the unitary matrix Q is not computed.
+      shift     (input) a complex number specifying the shift. ideally close to an
+                eigenvalue of the bottommost part of the submatrix A[n0:n1,n0:n1].
+
+    references:
+      Stoer, Bulirsch - Introduction to Numerical Analysis.
+      Kressner : Numerical Methods for General and Structured Eigenvalue Problems
+    """
+
+    # implicitly shifted and bulge chasing is explained at p.398/399 in "Stoer, Bulirsch - Introduction to Numerical Analysis"
+    # for bulge chasing see also "Watkins - The Matrix Eigenvalue Problem" sec.4.5,p.173
+
+    # the Givens rotation we used is determined as follows: let c,s be two complex
+    # numbers. then we have following relation:
+    #
+    #     v = sqrt(|c|^2 + |s|^2)
+    #
+    #     1/v [ c~  s~]  [c] = [v]
+    #         [-s   c ]  [s]   [0]
+    #
+    # the matrix on the left is our Givens rotation.
+
+    n = A.rows
+
+    # first step
+
+    # calculate givens rotation
+    c = A[n0  ,n0] - shift
+    s = A[n0+1,n0]
+
+    v = ctx.hypot(ctx.hypot(ctx.re(c), ctx.im(c)), ctx.hypot(ctx.re(s), ctx.im(s)))
+
+    if v == 0:
+        # both entries vanish: use the identity rotation
+        v = 1
+        c = 1
+        s = 0
+    else:
+        c /= v
+        s /= v
+
+    cc = ctx.conj(c)
+    cs = ctx.conj(s)
+
+    for k in xrange(n0, n):
+        # apply givens rotation from the left
+        x = A[n0  ,k]
+        y = A[n0+1,k]
+        A[n0  ,k] = cc * x + cs * y
+        A[n0+1,k] = c * y - s * x
+
+    for k in xrange(min(n1, n0+3)):
+        # apply givens rotation from the right
+        x = A[k,n0  ]
+        y = A[k,n0+1]
+        A[k,n0  ] = c * x + s * y
+        A[k,n0+1] = cc * y - cs * x
+
+    if not isinstance(Q, bool):  # a bool passed as Q means: do not accumulate the transformation
+        for k in xrange(n):
+            # eigenvectors
+            x = Q[k,n0  ]
+            y = Q[k,n0+1]
+            Q[k,n0  ] = c * x + s * y
+            Q[k,n0+1] = cc * y - cs * x
+
+    # chase the bulge
+
+    for j in xrange(n0, n1 - 2):
+        # calculate givens rotation
+
+        c = A[j+1,j]
+        s = A[j+2,j]
+
+        v = ctx.hypot(ctx.hypot(ctx.re(c), ctx.im(c)), ctx.hypot(ctx.re(s), ctx.im(s)))
+
+        if v == 0:
+            A[j+1,j] = 0
+            v = 1
+            c = 1
+            s = 0
+        else:
+            A[j+1,j] = v
+            c /= v
+            s /= v
+
+        A[j+2,j] = 0  # the rotation annihilates the entry two rows below the diagonal
+
+        cc = ctx.conj(c)
+        cs = ctx.conj(s)
+
+        for k in xrange(j+1, n):
+            # apply givens rotation from the left
+            x = A[j+1,k]
+            y = A[j+2,k]
+            A[j+1,k] = cc * x + cs * y
+            A[j+2,k] = c * y - s * x
+
+        for k in xrange(0, min(n1, j+4)):
+            # apply givens rotation from the right
+            x = A[k,j+1]
+            y = A[k,j+2]
+            A[k,j+1] = c * x + s * y
+            A[k,j+2] = cc * y - cs * x
+
+        if not isinstance(Q, bool):  # same convention as above: skip when Q is a bool
+            for k in xrange(0, n):
+                # eigenvectors
+                x = Q[k,j+1]
+                y = Q[k,j+2]
+                Q[k,j+1] = c * x + s * y
+                Q[k,j+2] = cc * y - cs * x
+
+
+
+def hessenberg_qr(ctx, A, Q):
+    """
+    This routine computes the Schur decomposition of an upper Hessenberg matrix A.
+    Given A, a unitary matrix Q is determined such that
+
+          Q' A Q = R                   and                  Q' Q = Q Q' = 1
+
+    where R is an upper right triangular matrix. Here ' denotes the hermitian
+    transpose (i.e. transposition and conjugation).
+
+    parameters:
+      A         (input/output) On input, A contains an upper Hessenberg matrix.
+                On output, A is replaced by the upper right triangular matrix R.
+
+      Q         (input/output) The parameter Q is multiplied by the unitary
+                matrix Q arising from the Schur decomposition. Q can also be
+                false, in which case the unitary matrix Q is not computed.
+
+    return value: None (A and Q are modified in place). If A is entirely
+                  zero on and above its first subdiagonal, the routine
+                  returns immediately without touching A or Q.
+
+    raises:
+      RuntimeError  if more than ctx.dps * 4 QR steps are taken in a row
+                    without finding a deflation.
+    """
+
+    n = A.rows
+
+    # root of the sum of squared magnitudes of all entries on or above the
+    # first subdiagonal, divided by n; used as the reference scale below
+    norm = 0
+    for x in xrange(n):
+        for y in xrange(min(x+2, n)):
+            norm += ctx.re(A[y,x]) ** 2 + ctx.im(A[y,x]) ** 2
+    norm = ctx.sqrt(norm) / n
+
+    if norm == 0:
+        return
+
+    n0 = 0
+    n1 = n
+
+    # deflation tolerance and the cap on consecutive non-deflating steps
+    eps = ctx.eps / (100 * n)
+    maxits = ctx.dps * 4
+
+    # its counts steps since the last deflation; totalits is only incremented
+    its = totalits = 0
+
+    while 1:
+        # kressner p.32 algo 3
+        # the active submatrix is A[n0:n1,n0:n1]
+
+        k = n0
+
+        # scan the subdiagonal of the active block for a negligible entry
+        while k + 1 < n1:
+            s = abs(ctx.re(A[k,k])) + abs(ctx.im(A[k,k])) + abs(ctx.re(A[k+1,k+1])) + abs(ctx.im(A[k+1,k+1]))
+            if s < eps * norm:
+                # neighbouring diagonal entries are tiny: compare against the
+                # global scale instead
+                s = norm
+            if abs(A[k+1,k]) < eps * s:
+                break
+            k += 1
+
+        if k + 1 < n1:
+            # deflation found at position (k+1, k)
+
+            A[k+1,k] = 0
+            n0 = k + 1
+
+            its = 0
+
+            if n0 + 1 >= n1:
+                # block of size at most two has converged
+                n0 = 0
+                n1 = k + 1
+                if n1 < 2:
+                    # QR algorithm has converged
+                    return
+        else:
+            # no deflation: choose a shift and perform one QR step
+            if (its % 30) == 10:
+                # exceptional shift
+                shift = A[n1-1,n1-2]
+            elif (its % 30) == 20:
+                # exceptional shift
+                shift = abs(A[n1-1,n1-2])
+            elif (its % 30) == 29:
+                # exceptional shift
+                shift = norm
+            else:
+                #    A = [ a b ]       det(x-A)=x*x-x*tr(A)+det(A)
+                #        [ c d ]
+                #
+                # eigenvalues bad:   (tr(A)+sqrt((tr(A))**2-4*det(A)))/2
+                #     bad because of cancellation if |c| is small and |a-d| is small, too.
+                #
+                # eigenvalues good:     (a+d+sqrt((a-d)**2+4*b*c))/2
+
+                t =  A[n1-2,n1-2] + A[n1-1,n1-1]
+                s = (A[n1-1,n1-1] - A[n1-2,n1-2]) ** 2 + 4 * A[n1-1,n1-2] * A[n1-2,n1-1]
+                if ctx.re(s) > 0:
+                    s = ctx.sqrt(s)
+                else:
+                    s = ctx.sqrt(-s) * 1j
+                a = (t + s) / 2
+                b = (t - s) / 2
+                # use the eigenvalue of the trailing 2x2 block that is closer
+                # to the last diagonal entry
+                if abs(A[n1-1,n1-1] - a) > abs(A[n1-1,n1-1] - b):
+                    shift = b
+                else:
+                    shift = a
+
+            its += 1
+            totalits += 1
+
+            qr_step(ctx, n0, n1, A, Q, shift)
+
+            if its > maxits:
+                raise RuntimeError("qr: failed to converge after %d steps" % its)
+
+
+@defun
+def schur(ctx, A, overwrite_a = False):
+    """
+    This routine computes the Schur decomposition of a square matrix A.
+    Given A, an unitary matrix Q is determined such that
+
+          Q' A Q = R                and               Q' Q = Q Q' = 1
+
+    where R is an upper right triangular matrix. Here ' denotes the
+    hermitian transpose (i.e. transposition and conjugation).
+
+    input:
+      A            : a real or complex square matrix
+      overwrite_a  : if true, allows modification of A which may improve
+                     performance. if false, A is not modified.
+
+    output:
+      Q : an unitary matrix
+      R : an upper right triangular matrix
+
+    return value:   (Q, R)
+
+    example:
+      >>> from mpmath import mp
+      >>> A = mp.matrix([[3, -1, 2], [2, 5, -5], [-2, -3, 7]])
+      >>> Q, R = mp.schur(A)
+      >>> mp.nprint(R, 3) # doctest:+SKIP
+      [2.0  0.417  -2.53]
+      [0.0    4.0  -4.74]
+      [0.0    0.0    9.0]
+      >>> print(mp.chop(A - Q * R * Q.transpose_conj()))
+      [0.0  0.0  0.0]
+      [0.0  0.0  0.0]
+      [0.0  0.0  0.0]
+
+    warning: The Schur decomposition is not unique.
+    """
+
+    n = A.rows
+
+    if n == 1:
+        return (ctx.matrix([[1]]), A)
+
+    if not overwrite_a:
+        A = A.copy()
+
+    T = ctx.matrix(n, 1)
+
+    hessenberg_reduce_0(ctx, A, T)
+    Q = A.copy()
+    hessenberg_reduce_1(ctx, Q, T)
+
+    for x in xrange(n):
+        for y in xrange(x + 2, n):
+            A[y,x] = 0
+
+    hessenberg_qr(ctx, A, Q)
+
+    return Q, A
+
+
+def eig_tr_r(ctx, A):
+    """
+    This routine calculates the right eigenvectors of an upper right triangular matrix.
+
+    input:
+      A      an upper right triangular matrix
+
+    output:
+      ER     a matrix whose columns form the right eigenvectors of A
+
+    return value: ER
+    """
+
+    # this subroutine is inspired by the lapack routines ctrevc.f,clatrs.f
+
+    n = A.rows
+
+    # start from the identity; column i is filled in above the diagonal by
+    # back substitution
+    ER = ctx.eye(n)
+
+    eps = ctx.eps
+
+    unfl = ctx.ldexp(ctx.one, -ctx.prec * 30)
+    # since mpmath effectively has no limits on the exponent, we simply scale doubles up
+    # original double has prec*20
+
+    # smlnum: absolute floor for the divisor; simin: growth threshold that
+    # triggers a rescale of the partially built column
+    smlnum = unfl * (n / eps)
+    simin = 1 / ctx.sqrt(eps)
+
+    rmax = 1
+
+    for i in xrange(1, n):
+        s = A[i,i]
+
+        smin = max(eps * abs(s), smlnum)
+
+        for j in xrange(i - 1, -1, -1):
+
+            r = 0
+            for k in xrange(j + 1, i + 1):
+                r += A[j,k] * ER[k,i]
+
+            # difference of diagonal entries; replaced by smin when too small
+            # so the division below stays bounded
+            t = A[j,j] - s
+            if abs(t) < smin:
+                t = smin
+
+            r = -r / t
+            ER[j,i] = r
+
+            rmax = max(rmax, abs(r))
+            if rmax > simin:
+                # entries grew too large: rescale what has been computed so far
+                for k in xrange(j, i+1):
+                    ER[k,i] /= rmax
+                rmax = 1
+
+        # NOTE(review): rmax is not reset after this final rescale, so it is
+        # carried into the next column; this appears to affect only the
+        # scaling of the eigenvectors - confirm whether that is intended
+        if rmax != 1:
+            for k in xrange(0, i + 1):
+                ER[k,i] /= rmax
+
+    return ER
+
+def eig_tr_l(ctx, A):
+    """
+    This routine calculates the left eigenvectors of an upper right triangular matrix.
+
+    input:
+      A      an upper right triangular matrix
+
+    output:
+      EL     a matrix whose rows form the left eigenvectors of A
+
+    return value:  EL
+    """
+
+    n = A.rows
+
+    # start from the identity; row i is filled in to the right of the
+    # diagonal by forward substitution
+    EL = ctx.eye(n)
+
+    eps = ctx.eps
+
+    unfl = ctx.ldexp(ctx.one, -ctx.prec * 30)
+    # since mpmath effectively has no limits on the exponent, we simply scale doubles up
+    # original double has prec*20
+
+    # smlnum: absolute floor for the divisor; simin: growth threshold that
+    # triggers a rescale of the partially built row
+    smlnum = unfl * (n / eps)
+    simin = 1 / ctx.sqrt(eps)
+
+    rmax = 1
+
+    for i in xrange(0, n - 1):
+        s = A[i,i]
+
+        smin = max(eps * abs(s), smlnum)
+
+        for j in xrange(i + 1, n):
+
+            r = 0
+            for k in xrange(i, j):
+                r += EL[i,k] * A[k,j]
+
+            # difference of diagonal entries; replaced by smin when too small
+            # so the division below stays bounded
+            t = A[j,j] - s
+            if abs(t) < smin:
+                t = smin
+
+            r = -r / t
+            EL[i,j] = r
+
+            rmax = max(rmax, abs(r))
+            if rmax > simin:
+                # entries grew too large: rescale what has been computed so far
+                for k in xrange(i, j + 1):
+                    EL[i,k] /= rmax
+                rmax = 1
+
+        # NOTE(review): rmax is not reset after this final rescale, so it is
+        # carried into the next row; this appears to affect only the scaling
+        # of the eigenvectors - confirm whether that is intended
+        if rmax != 1:
+            for k in xrange(i, n):
+                EL[i,k] /= rmax
+
+    return EL
+
+@defun
+def eig(ctx, A, left = False, right = True, overwrite_a = False):
+    """
+    This routine computes the eigenvalues and optionally the left and right
+    eigenvectors of a square matrix A. Given A, a vector E and matrices ER
+    and EL are calculated such that
+
+                        A ER[:,i] =         E[i] ER[:,i]
+                EL[i,:] A         = EL[i,:] E[i]
+
+    E contains the eigenvalues of A. The columns of ER contain the right eigenvectors
+    of A whereas the rows of EL contain the left eigenvectors.
+
+
+    input:
+      A           : a real or complex square matrix of shape (n, n)
+      left        : if true, the left eigenvectors are calculated.
+      right       : if true, the right eigenvectors are calculated.
+      overwrite_a : if true, allows modification of A which may improve
+                    performance. if false, A is not modified.
+
+    output:
+      E    : a list of length n containing the eigenvalues of A.
+      ER   : a matrix whose columns contain the right eigenvectors of A.
+      EL   : a matrix whose rows contain the left eigenvectors of A.
+
+    return values:
+       E            if left and right are both false.
+      (E, ER)       if right is true and left is false.
+      (E, EL)       if left is true and right is false.
+      (E, EL, ER)   if left and right are true.
+
+
+    examples:
+      >>> from mpmath import mp
+      >>> A = mp.matrix([[3, -1, 2], [2, 5, -5], [-2, -3, 7]])
+      >>> E, ER = mp.eig(A)
+      >>> print(mp.chop(A * ER[:,0] - E[0] * ER[:,0]))
+      [0.0]
+      [0.0]
+      [0.0]
+
+      >>> E, EL, ER = mp.eig(A,left = True, right = True)
+      >>> E, EL, ER = mp.eig_sort(E, EL, ER)
+      >>> mp.nprint(E)
+      [2.0, 4.0, 9.0]
+      >>> print(mp.chop(A * ER[:,0] - E[0] * ER[:,0]))
+      [0.0]
+      [0.0]
+      [0.0]
+      >>> print(mp.chop( EL[0,:] * A - EL[0,:] * E[0]))
+      [0.0  0.0  0.0]
+
+    warning:
+     - If there are multiple eigenvalues, the eigenvectors do not necessarily
+       span the whole vectorspace, i.e. ER and EL may have not full rank.
+       Furthermore in that case the eigenvectors are numerical ill-conditioned.
+     - In the general case the eigenvalues have no natural order.
+
+    see also:
+      - eigh (or eigsy, eighe) for the symmetric eigenvalue problem.
+      - eig_sort for sorting of eigenvalues and eigenvectors
+    """
+
+    n = A.rows
+
+    if n == 1:
+        if left and (not right):
+            return ([A[0]], ctx.matrix([[1]]))
+
+        if right and (not left):
+            return ([A[0]], ctx.matrix([[1]]))
+
+        return ([A[0]], ctx.matrix([[1]]), ctx.matrix([[1]]))
+
+    if not overwrite_a:
+        A = A.copy()
+
+    T = ctx.zeros(n, 1)
+
+    hessenberg_reduce_0(ctx, A, T)
+
+    if left or right:
+        Q = A.copy()
+        hessenberg_reduce_1(ctx, Q, T)
+    else:
+        Q = False
+
+    for x in xrange(n):
+        for y in xrange(x + 2, n):
+            A[y,x] = 0
+
+    hessenberg_qr(ctx, A, Q)
+
+    E = [0 for i in xrange(n)]
+    for i in xrange(n):
+        E[i] = A[i,i]
+
+    if not (left or right):
+        return E
+
+    if left:
+        EL = eig_tr_l(ctx, A)
+        EL = EL * Q.transpose_conj()
+
+    if right:
+        ER = eig_tr_r(ctx, A)
+        ER = Q * ER
+
+    if left and (not right):
+        return (E, EL)
+
+    if right and (not left):
+        return (E, ER)
+
+    return (E, EL, ER)
+
+@defun
+def eig_sort(ctx, E, EL = False, ER = False, f = "real"):
+    """
+    This routine sorts the eigenvalues and eigenvectors delivered by ``eig``.
+
+    parameters:
+      E  : the eigenvalues as delivered by eig
+      EL : the left  eigenvectors as delivered by eig, or false
+      ER : the right eigenvectors as delivered by eig, or false
+      f  : either a string ("real" sort by increasing real part, "imag" sort by
+           increasing imag part, "abs" sort by absolute value) or a function
+           mapping complexs to the reals, i.e. ``f = lambda x: -mp.re(x) ``
+           would sort the eigenvalues by decreasing real part.
+
+    return values:
+       E            if EL and ER are both false.
+      (E, ER)       if ER is not false and left is false.
+      (E, EL)       if EL is not false and right is false.
+      (E, EL, ER)   if EL and ER are not false.
+
+    example:
+      >>> from mpmath import mp
+      >>> A = mp.matrix([[3, -1, 2], [2, 5, -5], [-2, -3, 7]])
+      >>> E, EL, ER = mp.eig(A,left = True, right = True)
+      >>> E, EL, ER = mp.eig_sort(E, EL, ER)
+      >>> mp.nprint(E)
+      [2.0, 4.0, 9.0]
+      >>> E, EL, ER = mp.eig_sort(E, EL, ER,f = lambda x: -mp.re(x))
+      >>> mp.nprint(E)
+      [9.0, 4.0, 2.0]
+      >>> print(mp.chop(A * ER[:,0] - E[0] * ER[:,0]))
+      [0.0]
+      [0.0]
+      [0.0]
+      >>> print(mp.chop( EL[0,:] * A - EL[0,:] * E[0]))
+      [0.0  0.0  0.0]
+    """
+
+    if isinstance(f, str):
+        if f == "real":
+            f = ctx.re
+        elif f == "imag":
+            f = ctx.im
+        elif f == "abs":
+            f = abs
+        else:
+            raise RuntimeError("unknown function %s" % f)
+
+    n = len(E)
+
+    # Sort eigenvalues (bubble-sort)
+
+    for i in xrange(n):
+        imax = i
+        s = f(E[i])         # s is the current maximal element
+
+        for j in xrange(i + 1, n):
+            c = f(E[j])
+            if c < s:
+                s = c
+                imax = j
+
+        if imax != i:
+            # swap eigenvalues
+
+            z = E[i]
+            E[i] = E[imax]
+            E[imax] = z
+
+            if not isinstance(EL, bool):
+                for j in xrange(n):
+                    z = EL[i,j]
+                    EL[i,j] = EL[imax,j]
+                    EL[imax,j] = z
+
+            if not isinstance(ER, bool):
+                for j in xrange(n):
+                    z = ER[j,i]
+                    ER[j,i] = ER[j,imax]
+                    ER[j,imax] = z
+
+    if isinstance(EL, bool) and isinstance(ER, bool):
+        return E
+
+    if isinstance(EL, bool) and not(isinstance(ER, bool)):
+        return (E, ER)
+
+    if isinstance(ER, bool) and not(isinstance(EL, bool)):
+        return (E, EL)
+
+    return (E, EL, ER)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/linalg.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/linalg.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2fe643e809822e3d05a52b73c965edb622f9af9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/linalg.py
@@ -0,0 +1,790 @@
+"""
+Linear algebra
+--------------
+
+Linear equations
+................
+
+Basic linear algebra is implemented; you can for example solve the linear
+equation system::
+
+      x + 2*y = -10
+    3*x + 4*y =  10
+
+using ``lu_solve``::
+
+    >>> from mpmath import *
+    >>> mp.pretty = False
+    >>> A = matrix([[1, 2], [3, 4]])
+    >>> b = matrix([-10, 10])
+    >>> x = lu_solve(A, b)
+    >>> x
+    matrix(
+    [['30.0'],
+     ['-20.0']])
+
+If you don't trust the result, use ``residual`` to calculate the residual ||A*x-b||::
+
+    >>> residual(A, x, b)
+    matrix(
+    [['3.46944695195361e-18'],
+     ['3.46944695195361e-18']])
+    >>> str(eps)
+    '2.22044604925031e-16'
+
+As you can see, the solution is quite accurate. The error is caused by the
+inaccuracy of the internal floating point arithmetic. Though, it's even smaller
+than the current machine epsilon, which basically means you can trust the
+result.
+
+If you need more speed, use NumPy, or ``fp.lu_solve`` for a floating-point computation.
+
+    >>> fp.lu_solve(A, b)   # doctest: +ELLIPSIS
+    matrix(...)
+
+``lu_solve`` accepts overdetermined systems. It is usually not possible to solve
+such systems, so the residual is minimized instead. Internally this is done
+using Cholesky decomposition to compute a least squares approximation. This means
+that ``lu_solve`` will square the errors. If you can't afford this, use
+``qr_solve`` instead. It is twice as slow but more accurate, and it calculates
+the residual automatically.
+
+
+Matrix factorization
+....................
+
+The function ``lu`` computes an explicit LU factorization of a matrix::
+
+    >>> P, L, U = lu(matrix([[0,2,3],[4,5,6],[7,8,9]]))
+    >>> print(P)
+    [0.0  0.0  1.0]
+    [1.0  0.0  0.0]
+    [0.0  1.0  0.0]
+    >>> print(L)
+    [              1.0                0.0  0.0]
+    [              0.0                1.0  0.0]
+    [0.571428571428571  0.214285714285714  1.0]
+    >>> print(U)
+    [7.0  8.0                9.0]
+    [0.0  2.0                3.0]
+    [0.0  0.0  0.214285714285714]
+    >>> print(P.T*L*U)
+    [0.0  2.0  3.0]
+    [4.0  5.0  6.0]
+    [7.0  8.0  9.0]
+
+Interval matrices
+-----------------
+
+Matrices may contain interval elements. This allows one to perform
+basic linear algebra operations such as matrix multiplication
+and equation solving with rigorous error bounds::
+
+    >>> a = iv.matrix([['0.1','0.3','1.0'],
+    ...             ['7.1','5.5','4.8'],
+    ...             ['3.2','4.4','5.6']])
+    >>>
+    >>> b = iv.matrix(['4','0.6','0.5'])
+    >>> c = iv.lu_solve(a, b)
+    >>> print(c)
+    [   [5.2582327113062568605927528666, 5.25823271130625686059275702219]]
+    [[-13.1550493962678375411635581388, -13.1550493962678375411635540152]]
+    [  [7.42069154774972557628979076189, 7.42069154774972557628979190734]]
+    >>> print(a*c)
+    [  [3.99999999999999999999999844904, 4.00000000000000000000000155096]]
+    [[0.599999999999999999999968898009, 0.600000000000000000000031763736]]
+    [[0.499999999999999999999979320485, 0.500000000000000000000020679515]]
+"""
+
+# TODO:
+# *implement high-level qr()
+# *test unitvector
+# *iterative solving
+
+from copy import copy
+
+from ..libmp.backend import xrange
+
+class LinearAlgebraMethods(object):
+
+    def LU_decomp(ctx, A, overwrite=False, use_cache=True):
+        """
+        LU-factorization of a n*n matrix using the Gauss algorithm.
+        Returns L and U in one matrix and the pivot indices.
+
+        Use overwrite to specify whether A will be overwritten with L and U.
+        """
+        if not A.rows == A.cols:
+            raise ValueError('need n*n matrix')
+        # get from cache if possible
+        if use_cache and isinstance(A, ctx.matrix) and A._LU:
+            return A._LU
+        if not overwrite:
+            orig = A
+            A = A.copy()
+        tol = ctx.absmin(ctx.mnorm(A,1) * ctx.eps) # each pivot element has to be bigger
+        n = A.rows
+        p = [None]*(n - 1)
+        for j in xrange(n - 1):
+            # pivoting, choose max(abs(reciprocal row sum)*abs(pivot element))
+            biggest = 0
+            for k in xrange(j, n):
+                s = ctx.fsum([ctx.absmin(A[k,l]) for l in xrange(j, n)])
+                if ctx.absmin(s) <= tol:
+                    raise ZeroDivisionError('matrix is numerically singular')
+                current = 1/s * ctx.absmin(A[k,j])
+                if current > biggest: # TODO: what if equal?
+                    biggest = current
+                    p[j] = k
+            # swap rows according to p
+            ctx.swap_row(A, j, p[j])
+            if ctx.absmin(A[j,j]) <= tol:
+                raise ZeroDivisionError('matrix is numerically singular')
+            # calculate elimination factors and add rows
+            for i in xrange(j + 1, n):
+                A[i,j] /= A[j,j]
+                for k in xrange(j + 1, n):
+                    A[i,k] -= A[i,j]*A[j,k]
+        if ctx.absmin(A[n - 1,n - 1]) <= tol:
+            raise ZeroDivisionError('matrix is numerically singular')
+        # cache decomposition
+        if not overwrite and isinstance(orig, ctx.matrix):
+            orig._LU = (A, p)
+        return A, p
+
+    def L_solve(ctx, L, b, p=None):
+        """
+        Solve the lower part of a LU factorized matrix for y.
+        """
+        if L.rows != L.cols:
+            raise RuntimeError("need n*n matrix")
+        n = L.rows
+        if len(b) != n:
+            raise ValueError("Value should be equal to n")
+        b = copy(b)
+        if p: # swap b according to p
+            for k in xrange(0, len(p)):
+                ctx.swap_row(b, k, p[k])
+        # solve
+        for i in xrange(1, n):
+            for j in xrange(i):
+                b[i] -= L[i,j] * b[j]
+        return b
+
+    def U_solve(ctx, U, y):
+        """
+        Solve the upper part of a LU factorized matrix for x.
+        """
+        if U.rows != U.cols:
+            raise RuntimeError("need n*n matrix")
+        n = U.rows
+        if len(y) != n:
+            raise ValueError("Value should be equal to n")
+        x = copy(y)
+        for i in xrange(n - 1, -1, -1):
+            for j in xrange(i + 1, n):
+                x[i] -= U[i,j] * x[j]
+            x[i] /= U[i,i]
+        return x
+
+    def lu_solve(ctx, A, b, **kwargs):
+        """
+        Ax = b => x
+
+        Solve a determined or overdetermined linear equations system.
+        Fast LU decomposition is used, which is less accurate than QR decomposition
+        (especially for overdetermined systems), but it's twice as efficient.
+        Use qr_solve if you want more precision or have to solve a very ill-
+        conditioned system.
+
+        If you specify real=True, it does not check for overdeterminded complex
+        systems.
+        """
+        prec = ctx.prec
+        try:
+            ctx.prec += 10
+            # do not overwrite A nor b
+            A, b = ctx.matrix(A, **kwargs).copy(), ctx.matrix(b, **kwargs).copy()
+            if A.rows < A.cols:
+                raise ValueError('cannot solve underdetermined system')
+            if A.rows > A.cols:
+                # use least-squares method if overdetermined
+                # (this increases errors)
+                AH = A.H
+                A = AH * A
+                b = AH * b
+                if (kwargs.get('real', False) or
+                    not sum(type(i) is ctx.mpc for i in A)):
+                    # TODO: necessary to check also b?
+                    x = ctx.cholesky_solve(A, b)
+                else:
+                    x = ctx.lu_solve(A, b)
+            else:
+                # LU factorization
+                A, p = ctx.LU_decomp(A)
+                b = ctx.L_solve(A, b, p)
+                x = ctx.U_solve(A, b)
+        finally:
+            ctx.prec = prec
+        return x
+
+    def improve_solution(ctx, A, x, b, maxsteps=1):
+        """
+        Improve a solution to a linear equation system iteratively.
+
+        This re-uses the LU decomposition and is thus cheap.
+        Usually 3 up to 4 iterations are giving the maximal improvement.
+        """
+        if A.rows != A.cols:
+            raise RuntimeError("need n*n matrix") # TODO: really?
+        for _ in xrange(maxsteps):
+            r = ctx.residual(A, x, b)
+            if ctx.norm(r, 2) < 10*ctx.eps:
+                break
+            # this uses cached LU decomposition and is thus cheap
+            dx = ctx.lu_solve(A, -r)
+            x += dx
+        return x
+
+    def lu(ctx, A):
+        """
+        A -> P, L, U
+
+        LU factorisation of a square matrix A. L is the lower, U the upper part.
+        P is the permutation matrix indicating the row swaps.
+
+        P*A = L*U
+
+        If you need efficiency, use the low-level method LU_decomp instead, it's
+        much more memory efficient.
+        """
+        # get factorization
+        A, p = ctx.LU_decomp(A)
+        n = A.rows
+        L = ctx.matrix(n)
+        U = ctx.matrix(n)
+        for i in xrange(n):
+            for j in xrange(n):
+                if i > j:
+                    L[i,j] = A[i,j]
+                elif i == j:
+                    L[i,j] = 1
+                    U[i,j] = A[i,j]
+                else:
+                    U[i,j] = A[i,j]
+        # calculate permutation matrix
+        P = ctx.eye(n)
+        for k in xrange(len(p)):
+            ctx.swap_row(P, k, p[k])
+        return P, L, U
+
+    def unitvector(ctx, n, i):
+        """
+        Return the i-th n-dimensional unit vector.
+        """
+        assert 0 < i <= n, 'this unit vector does not exist'
+        return [ctx.zero]*(i-1) + [ctx.one] + [ctx.zero]*(n-i)
+
+    def inverse(ctx, A, **kwargs):
+        """
+        Calculate the inverse of a matrix.
+
+        If you want to solve an equation system Ax = b, it's recommended to use
+        solve(A, b) instead, it's about 3 times more efficient.
+        """
+        prec = ctx.prec
+        try:
+            ctx.prec += 10
+            # do not overwrite A
+            A = ctx.matrix(A, **kwargs).copy()
+            n = A.rows
+            # get LU factorisation
+            A, p = ctx.LU_decomp(A)
+            cols = []
+            # calculate unit vectors and solve corresponding system to get columns
+            for i in xrange(1, n + 1):
+                e = ctx.unitvector(n, i)
+                y = ctx.L_solve(A, e, p)
+                cols.append(ctx.U_solve(A, y))
+            # convert columns to matrix
+            inv = []
+            for i in xrange(n):
+                row = []
+                for j in xrange(n):
+                    row.append(cols[j][i])
+                inv.append(row)
+            result = ctx.matrix(inv, **kwargs)
+        finally:
+            ctx.prec = prec
+        return result
+
+    def householder(ctx, A):
+        """
+        (A|b) -> H, p, x, res
+
+        (A|b) is the coefficient matrix extended by the right hand side of an
+        optionally overdetermined linear equation system (b is the last column).
+        H and p contain all information about the transformation matrices.
+        x is the solution, res the residual.
+
+        The input matrix is modified in place and returned as H.
+
+        Raises TypeError if A is not a ctx.matrix, RuntimeError if the system
+        has fewer rows than unknowns, and ValueError if a column is found to
+        be numerically zero.
+        """
+        if not isinstance(A, ctx.matrix):
+            raise TypeError("A should be a type of ctx.matrix")
+        m = A.rows
+        n = A.cols
+        # n - 1 unknowns (the last column is the right hand side)
+        if m < n - 1:
+            raise RuntimeError("Columns should not be less than rows")
+        # calculate Householder matrix
+        p = []
+        for j in xrange(0, n - 1):
+            # squared norm of column j from the diagonal downwards
+            s = ctx.fsum(abs(A[i,j])**2 for i in xrange(j, m))
+            if not abs(s) > ctx.eps:
+                raise ValueError('matrix is numerically singular')
+            # p[j] becomes the diagonal entry of R; its sign is opposite to
+            # the sign of re(A[j,j])
+            p.append(-ctx.sign(ctx.re(A[j,j])) * ctx.sqrt(s))
+            kappa = ctx.one / (s - p[j] * A[j,j])
+            A[j,j] -= p[j]
+            # apply the reflection to the remaining columns (including b)
+            for k in xrange(j+1, n):
+                y = ctx.fsum(ctx.conj(A[i,j]) * A[i,k] for i in xrange(j, m)) * kappa
+                for i in xrange(j, m):
+                    A[i,k] -= A[i,j] * y
+        # solve Rx = c1
+        x = [A[i,n - 1] for i in xrange(n - 1)]
+        for i in xrange(n - 2, -1, -1):
+            x[i] -= ctx.fsum(A[i,j] * x[j] for j in xrange(i + 1, n - 1))
+            x[i] /= p[i]
+        # calculate residual
+        if not m == n - 1:
+            # the trailing entries of the transformed right hand side,
+            # listed from the last row upwards
+            r = [A[m-1-i, n-1] for i in xrange(m - n + 1)]
+        else:
+            # determined system, residual should be 0
+            r = [0]*m # maybe a bad idea, changing r[i] will change all elements
+        return A, p, x, r
+
+    #def qr(ctx, A):
+    #    """
+    #    A -> Q, R
+    #
+    #    QR factorisation of a square matrix A using Householder decomposition.
+    #    Q is orthogonal, this leads to very few numerical errors.
+    #
+    #    A = Q*R
+    #    """
+    #    H, p, x, res = householder(A)
+    # TODO: implement this
+
+    def residual(ctx, A, x, b, **kwargs):
+        """
+        Calculate the residual of a solution to a linear equation system.
+
+        r = A*x - b for A*x = b
+        """
+        oldprec = ctx.prec
+        try:
+            ctx.prec *= 2
+            A, x, b = ctx.matrix(A, **kwargs), ctx.matrix(x, **kwargs), ctx.matrix(b, **kwargs)
+            return A*x - b
+        finally:
+            ctx.prec = oldprec
+
+    def qr_solve(ctx, A, b, norm=None, **kwargs):
+        """
+        Ax = b => x, ||Ax - b||
+
+        Solve a determined or overdetermined linear equations system and
+        calculate the norm of the residual (error).
+        QR decomposition using Householder factorization is applied, which gives very
+        accurate results even for ill-conditioned matrices. qr_solve is twice as
+        efficient.
+        """
+        if norm is None:
+            norm = ctx.norm
+        prec = ctx.prec
+        try:
+            ctx.prec += 10
+            # do not overwrite A nor b
+            A, b = ctx.matrix(A, **kwargs).copy(), ctx.matrix(b, **kwargs).copy()
+            if A.rows < A.cols:
+                raise ValueError('cannot solve underdetermined system')
+            H, p, x, r = ctx.householder(ctx.extend(A, b))
+            res = ctx.norm(r)
+            # calculate residual "manually" for determined systems
+            if res == 0:
+                res = ctx.norm(ctx.residual(A, x, b))
+            return ctx.matrix(x, **kwargs), res
+        finally:
+            ctx.prec = prec
+
+    def cholesky(ctx, A, tol=None):
+        r"""
+        Cholesky decomposition of a symmetric positive-definite matrix `A`.
+        Returns a lower triangular matrix `L` such that `A = L \times L^T`.
+        More generally, for a complex Hermitian positive-definite matrix,
+        a Cholesky decomposition satisfying `A = L \times L^H` is returned.
+
+        The Cholesky decomposition can be used to solve linear equation
+        systems twice as efficiently as LU decomposition, or to
+        test whether `A` is positive-definite.
+
+        The optional parameter ``tol`` determines the tolerance for
+        verifying positive-definiteness. It defaults to ``ctx.eps``.
+
+        A ``RuntimeError`` is raised if `A` is not a ``ctx.matrix``, and a
+        ``ValueError`` if `A` is not square, has a diagonal entry with a
+        non-negligible imaginary part, or is not positive-definite.
+
+        **Examples**
+
+        Cholesky decomposition of a positive-definite symmetric matrix::
+
+            >>> from mpmath import *
+            >>> mp.dps = 25; mp.pretty = True
+            >>> A = eye(3) + hilbert(3)
+            >>> nprint(A)
+            [     2.0      0.5  0.333333]
+            [     0.5  1.33333      0.25]
+            [0.333333     0.25       1.2]
+            >>> L = cholesky(A)
+            >>> nprint(L)
+            [ 1.41421      0.0      0.0]
+            [0.353553  1.09924      0.0]
+            [0.235702  0.15162  1.05899]
+            >>> chop(A - L*L.T)
+            [0.0  0.0  0.0]
+            [0.0  0.0  0.0]
+            [0.0  0.0  0.0]
+
+        Cholesky decomposition of a Hermitian matrix::
+
+            >>> A = eye(3) + matrix([[0,0.25j,-0.5j],[-0.25j,0,0],[0.5j,0,0]])
+            >>> L = cholesky(A)
+            >>> nprint(L)
+            [          1.0                0.0                0.0]
+            [(0.0 - 0.25j)  (0.968246 + 0.0j)                0.0]
+            [ (0.0 + 0.5j)  (0.129099 + 0.0j)  (0.856349 + 0.0j)]
+            >>> chop(A - L*L.H)
+            [0.0  0.0  0.0]
+            [0.0  0.0  0.0]
+            [0.0  0.0  0.0]
+
+        Attempted Cholesky decomposition of a matrix that is not positive
+        definite::
+
+            >>> A = -eye(3) + hilbert(3)
+            >>> L = cholesky(A)
+            Traceback (most recent call last):
+              ...
+            ValueError: matrix is not positive-definite
+
+        **References**
+
+        1. [Wikipedia]_ http://en.wikipedia.org/wiki/Cholesky_decomposition
+
+        """
+        if not isinstance(A, ctx.matrix):
+            raise RuntimeError("A should be a type of ctx.matrix")
+        if not A.rows == A.cols:
+            raise ValueError('need n*n matrix')
+        if tol is None:
+            tol = +ctx.eps
+        n = A.rows
+        L = ctx.matrix(n)
+        # build L one column at a time
+        for j in xrange(n):
+            # the diagonal of a Hermitian matrix must be real
+            c = ctx.re(A[j,j])
+            if abs(c-A[j,j]) > tol:
+                raise ValueError('matrix is not Hermitian')
+            # diagonal entry minus the squared magnitudes already placed in row j
+            s = c - ctx.fsum((L[j,k] for k in xrange(j)),
+                absolute=True, squared=True)
+            if s < tol:
+                raise ValueError('matrix is not positive-definite')
+            L[j,j] = ctx.sqrt(s)
+            # fill column j; the loop starts at i = j, so L[j,j] is
+            # assigned once more here
+            for i in xrange(j, n):
+                it1 = (L[i,k] for k in xrange(j))
+                it2 = (L[j,k] for k in xrange(j))
+                t = ctx.fdot(it1, it2, conjugate=True)
+                L[i,j] = (A[i,j] - t) / L[j,j]
+        return L
+
+    def cholesky_solve(ctx, A, b, **kwargs):
+        """
+        Ax = b => x
+
+        Solve a symmetric positive-definite linear equation system.
+        This is twice as efficient as lu_solve.
+
+        Typical use cases:
+        * A.T*A
+        * Hessian matrix
+        * differential equations
+
+        ``A`` and ``b`` are converted with ``ctx.matrix(..., **kwargs)`` and
+        copied, so the caller's objects are not modified.  The computation
+        runs with the working precision raised by 10 bits; the original
+        precision is restored on exit, also when an exception is raised.
+
+        Raises ValueError if ``A`` is not square or if ``len(b)`` differs
+        from the number of rows of the Cholesky factor.  Errors raised by
+        ``ctx.cholesky`` propagate unchanged.
+        """
+        prec = ctx.prec
+        try:
+            ctx.prec += 10
+            # do not overwrite A nor b
+            A, b = ctx.matrix(A, **kwargs).copy(), ctx.matrix(b, **kwargs).copy()
+            if A.rows !=  A.cols:
+                raise ValueError('can only solve determined system')
+            # Cholesky factorization
+            L = ctx.cholesky(A)
+            # solve
+            n = L.rows
+            if len(b) != n:
+                raise ValueError("Value should be equal to n")
+            # forward substitution L*y = b, overwriting b with y in place;
+            # entry i only uses the already-updated entries b[0..i-1]
+            for i in xrange(n):
+                b[i] -= ctx.fsum(L[i,j] * b[j] for j in xrange(i))
+                b[i] /= L[i,i]
+            # back substitution with the plain transpose of L
+            # NOTE(review): L.T is not conjugated, so this presumably targets
+            # real symmetric input only -- confirm for complex Hermitian A.
+            x = ctx.U_solve(L.T, b)
+            return x
+        finally:
+            ctx.prec = prec
+
+    def det(ctx, A):
+        """
+        Calculate the determinant of a matrix.
+
+        The input is converted to a matrix and copied before it is handed to
+        ``ctx.LU_decomp``.  The determinant is the product of the diagonal of
+        the returned factor, negated once for every position where the
+        permutation vector differs from its own index.  If the decomposition
+        raises ZeroDivisionError the integer 0 is returned.
+        """
+        saved_prec = ctx.prec
+        try:
+            # work on a copy so the caller's matrix is left untouched
+            M = ctx.matrix(A).copy()
+            try:
+                R, p = ctx.LU_decomp(M)
+            except ZeroDivisionError:
+                return 0
+            # every recorded row swap flips the sign of the determinant
+            swaps = sum(1 for i, e in enumerate(p) if i != e)
+            z = -1 if swaps % 2 else 1
+            for i in xrange(M.rows):
+                z *= R[i,i]
+            return z
+        finally:
+            ctx.prec = saved_prec
+
+    def cond(ctx, A, norm=None):
+        """
+        Calculate the condition number of a matrix using a specified matrix norm.
+
+        The condition number estimates how sensitive a matrix is to errors:
+        for an ill-conditioned coefficient matrix, small input errors change
+        the solution of the system dramatically.
+
+        For ill-conditioned matrices it is recommended to use qr_solve()
+        instead of lu_solve().  This does not help with input errors; it only
+        avoids adding further errors.
+
+        ``norm`` is a callable taking a matrix; when omitted, the matrix
+        1-norm ``ctx.mnorm(x, 1)`` is used.
+
+        Definition:    cond(A) = ||A|| * ||A**-1||
+        """
+        if norm is None:
+            def norm(x):
+                return ctx.mnorm(x,1)
+        norm_of_A = norm(A)
+        return norm_of_A * norm(ctx.inverse(A))
+
+    def lu_solve_mat(ctx, a, b):
+        """
+        Solve a * x = b  where a and b are matrices.
+
+        Each column of ``b`` is solved separately with ``ctx.lu_solve`` and
+        written into the matching column of the result, which is created
+        with ``a.rows`` rows and ``b.cols`` columns.
+        """
+        solution = ctx.matrix(a.rows, b.cols)
+        for col in range(b.cols):
+            x = ctx.lu_solve(a, b.column(col))
+            for row in range(len(x)):
+                solution[row, col] = x[row]
+        return solution
+
+    def qr(ctx, A, mode = 'full', edps = 10):
+        """
+        Compute a QR factorization $A = QR$ where
+        A is an m x n matrix of real or complex numbers where m >= n
+
+        mode has following meanings:
+        (1) mode = 'raw' returns two matrixes (A, tau) in the
+            internal format used by LAPACK
+        (2) mode = 'skinny' returns the leading n columns of Q
+            and n rows of R
+        (3) Any other value returns the leading m columns of Q
+            and m rows of R
+
+        edps is the increase in mp precision used for calculations
+
+        The input matrix is copied before it is factored, so the caller's
+        matrix is not modified.  The arguments are validated with ``assert``
+        statements (``A`` must be a ``ctx.matrix``, ``m >= n >= 0`` and
+        ``edps >= 0``), so a violation raises AssertionError.
+
+        **Examples**
+
+            >>> from mpmath import *
+            >>> mp.dps = 15
+            >>> mp.pretty = True
+            >>> A = matrix([[1, 2], [3, 4], [1, 1]])
+            >>> Q, R = qr(A)
+            >>> Q
+            [-0.301511344577764   0.861640436855329   0.408248290463863]
+            [-0.904534033733291  -0.123091490979333  -0.408248290463863]
+            [-0.301511344577764  -0.492365963917331   0.816496580927726]
+            >>> R
+            [-3.3166247903554  -4.52267016866645]
+            [             0.0  0.738548945875996]
+            [             0.0                0.0]
+            >>> Q * R
+            [1.0  2.0]
+            [3.0  4.0]
+            [1.0  1.0]
+            >>> chop(Q.T * Q)
+            [1.0  0.0  0.0]
+            [0.0  1.0  0.0]
+            [0.0  0.0  1.0]
+            >>> B = matrix([[1+0j, 2-3j], [3+j, 4+5j]])
+            >>> Q, R = qr(B)
+            >>> nprint(Q)
+            [     (-0.301511 + 0.0j)   (0.0695795 - 0.95092j)]
+            [(-0.904534 - 0.301511j)  (-0.115966 + 0.278318j)]
+            >>> nprint(R)
+            [(-3.31662 + 0.0j)  (-5.72872 - 2.41209j)]
+            [              0.0       (3.91965 + 0.0j)]
+            >>> Q * R
+            [(1.0 + 0.0j)  (2.0 - 3.0j)]
+            [(3.0 + 1.0j)  (4.0 + 5.0j)]
+            >>> chop(Q.T * Q.conjugate())
+            [1.0  0.0]
+            [0.0  1.0]
+
+        """
+
+        # check values before continuing
+        # NOTE(review): assert statements are removed when Python runs with
+        # -O, so these checks are skipped in optimized mode.
+        assert isinstance(A, ctx.matrix)
+        m = A.rows
+        n = A.cols
+        assert n >= 0
+        assert m >= n
+        assert edps >= 0
+
+        # check for complex data type
+        # (the complex code path is taken as soon as one entry is an mpc)
+        cmplx = any(type(x) is ctx.mpc for x in A)
+
+        # temporarily increase the precision and initialize
+        with ctx.extradps(edps):
+            # tau holds one scalar factor per column of A
+            tau = ctx.matrix(n,1)
+            # work on a copy; the factorization overwrites its input
+            A = A.copy()
+
+            # ---------------
+            # FACTOR MATRIX A
+            # ---------------
+            # NOTE(review): the per-column update below looks like the
+            # Householder reflector construction used by LAPACK (cf. the
+            # 'raw' mode description) -- confirm against the LAPACK docs.
+            if cmplx:
+                one = ctx.mpc('1.0', '0.0')
+                zero = ctx.mpc('0.0', '0.0')
+                rzero = ctx.mpf('0.0')
+
+                # main loop to factor A (complex)
+                for j in xrange(0, n):
+                    alpha = A[j,j]
+                    alphr = ctx.re(alpha)
+                    alphi = ctx.im(alpha)
+
+                    # xnorm: 2-norm of the part of column j below the diagonal
+                    if (m-j) >= 2:
+                        xnorm = ctx.fsum( A[i,j]*ctx.conj(A[i,j]) for i in xrange(j+1, m) )
+                        xnorm = ctx.re( ctx.sqrt(xnorm) )
+                    else:
+                        xnorm = rzero
+
+                    # nothing to eliminate and a real diagonal entry:
+                    # record a zero factor and leave the column unchanged
+                    if (xnorm == rzero) and (alphi == rzero):
+                        tau[j] = zero
+                        continue
+
+                    # beta gets the opposite sign of the real part of alpha
+                    if alphr < rzero:
+                        beta = ctx.sqrt(alphr**2 + alphi**2 + xnorm**2)
+                    else:
+                        beta = -ctx.sqrt(alphr**2 + alphi**2 + xnorm**2)
+
+                    tau[j] = ctx.mpc( (beta - alphr) / beta, -alphi / beta )
+                    t = -ctx.conj(tau[j])
+                    za = one / (alpha - beta)
+
+                    # scale the sub-diagonal part of column j
+                    for i in xrange(j+1, m):
+                        A[i,j] *= za
+
+                    # temporarily put 1 on the diagonal so that column j can
+                    # be used as a full vector while updating columns k > j
+                    A[j,j] = one
+                    for k in xrange(j+1, n):
+                        y = ctx.fsum(A[i,j] * ctx.conj(A[i,k]) for i in xrange(j, m))
+                        temp = t * ctx.conj(y)
+                        for i in xrange(j, m):
+                            A[i,k] += A[i,j] * temp
+
+                    # the diagonal finally stores beta
+                    A[j,j] = ctx.mpc(beta, '0.0')
+            else:
+                one = ctx.mpf('1.0')
+                zero = ctx.mpf('0.0')
+
+                # main loop to factor A (real)
+                for j in xrange(0, n):
+                    alpha = A[j,j]
+
+                    # xnorm: 2-norm of the part of column j below the diagonal
+                    # (a single remaining entry only needs abs())
+                    if (m-j) > 2:
+                        xnorm = ctx.fsum( (A[i,j])**2 for i in xrange(j+1, m) )
+                        xnorm = ctx.sqrt(xnorm)
+                    elif (m-j) == 2:
+                        xnorm = abs( A[m-1,j] )
+                    else:
+                        xnorm = zero
+
+                    # nothing to eliminate: record a zero factor and move on
+                    if xnorm == zero:
+                        tau[j] = zero
+                        continue
+
+                    # beta gets the opposite sign of alpha
+                    if alpha < zero:
+                        beta = ctx.sqrt(alpha**2 + xnorm**2)
+                    else:
+                        beta = -ctx.sqrt(alpha**2 + xnorm**2)
+
+                    tau[j] = (beta - alpha) / beta
+                    t = -tau[j]
+                    da = one / (alpha - beta)
+
+                    # scale the sub-diagonal part of column j
+                    for i in xrange(j+1, m):
+                        A[i,j] *= da
+
+                    # temporarily put 1 on the diagonal so that column j can
+                    # be used as a full vector while updating columns k > j
+                    A[j,j] = one
+                    for k in xrange(j+1, n):
+                        y = ctx.fsum( A[i,j] * A[i,k] for i in xrange(j, m) )
+                        temp = t * y
+                        for i in xrange(j,m):
+                            A[i,k] += A[i,j] * temp
+
+                    # the diagonal finally stores beta
+                    A[j,j] = beta
+
+            # return factorization in same internal format as LAPACK
+            if (mode == 'raw') or (mode == 'RAW'):
+                return A, tau
+
+            # ----------------------------------
+            # FORM Q USING BACKWARD ACCUMULATION
+            # ----------------------------------
+
+            # form R before the values are overwritten
+            # (R is the factored matrix with everything below the diagonal
+            # set to zero)
+            R = A.copy()
+            for j in xrange(0, n):
+                for i in xrange(j+1, m):
+                    R[i,j] = zero
+
+            # set the value of p (number of columns of Q to return)
+            p = m
+            if (mode == 'skinny') or (mode == 'SKINNY'):
+                p = n
+
+            # add columns to A if needed and initialize
+            # (diagonal set to one, entries above the diagonal set to zero;
+            # the entries below the diagonal are kept)
+            A.cols += (p-n)
+            for j in xrange(0, p):
+                A[j,j] = one
+                for i in xrange(0, j):
+                    A[i,j] = zero
+
+            # main loop to form Q
+            # (columns are processed from the last factored one to the first)
+            for j in xrange(n-1, -1, -1):
+                t = -tau[j]
+                A[j,j] += t
+
+                for k in xrange(j+1, p):
+                    if cmplx:
+                        y = ctx.fsum(A[i,j] * ctx.conj(A[i,k]) for i in xrange(j+1, m))
+                        temp = t * ctx.conj(y)
+                    else:
+                        y = ctx.fsum(A[i,j] * A[i,k] for i in xrange(j+1, m))
+                        temp = t * y
+                    A[j,k] = temp
+                    for i in xrange(j+1, m):
+                        A[i,k] += A[i,j] * temp
+
+                for i in xrange(j+1, m):
+                    A[i, j] *= t
+
+            # A now holds Q; R is cut down to its leading p rows
+            return A, R[0:p,0:n]
+
+        # ------------------
+        # END OF FUNCTION QR
+        # ------------------
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/matrices.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/matrices.py
new file mode 100644
index 0000000000000000000000000000000000000000..a97d5a9ca7e173195386dc7cb60860a826ab6a97
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/matrices.py
@@ -0,0 +1,1005 @@
+from ..libmp.backend import xrange
+import warnings
+
+# TODO: interpret list as vectors (for multiplication)
+
+rowsep = '\n'
+colsep = '  '
+
+class _matrix(object):
+    """
+    Numerical matrix.
+
+    Specify the dimensions or the data as a nested list.
+    Elements default to zero.
+    Use a flat list to create a column vector easily.
+
+    The datatype of the context (mpf for mp, mpi for iv, and float for fp) is used to store the data.
+
+    Creating matrices
+    -----------------
+
+    Matrices in mpmath are implemented using dictionaries. Only non-zero values
+    are stored, so it is cheap to represent sparse matrices.
+
+    The most basic way to create one is to use the ``matrix`` class directly.
+    You can create an empty matrix specifying the dimensions:
+
+        >>> from mpmath import *
+        >>> mp.dps = 15
+        >>> matrix(2)
+        matrix(
+        [['0.0', '0.0'],
+         ['0.0', '0.0']])
+        >>> matrix(2, 3)
+        matrix(
+        [['0.0', '0.0', '0.0'],
+         ['0.0', '0.0', '0.0']])
+
+    Calling ``matrix`` with one dimension will create a square matrix.
+
+    To access the dimensions of a matrix, use the ``rows`` or ``cols`` attributes:
+
+        >>> A = matrix(3, 2)
+        >>> A
+        matrix(
+        [['0.0', '0.0'],
+         ['0.0', '0.0'],
+         ['0.0', '0.0']])
+        >>> A.rows
+        3
+        >>> A.cols
+        2
+
+    You can also change the dimension of an existing matrix. This will set the
+    new elements to 0. If the new dimension is smaller than before, the
+    concerning elements are discarded:
+
+        >>> A.rows = 2
+        >>> A
+        matrix(
+        [['0.0', '0.0'],
+         ['0.0', '0.0']])
+
+    Internally ``mpmathify`` is used every time an element is set. This
+    is done using the syntax A[row,column], counting from 0:
+
+        >>> A = matrix(2)
+        >>> A[1,1] = 1 + 1j
+        >>> A
+        matrix(
+        [['0.0', '0.0'],
+         ['0.0', mpc(real='1.0', imag='1.0')]])
+
+    A more comfortable way to create a matrix lets you use nested lists:
+
+        >>> matrix([[1, 2], [3, 4]])
+        matrix(
+        [['1.0', '2.0'],
+         ['3.0', '4.0']])
+
+    Convenient advanced functions are available for creating various standard
+    matrices, see ``zeros``, ``ones``, ``diag``, ``eye``, ``randmatrix`` and
+    ``hilbert``.
+
+    Vectors
+    .......
+
+    Vectors may also be represented by the ``matrix`` class (with rows = 1 or cols = 1).
+    For vectors there are some things which make life easier. A column vector can
+    be created using a flat list, a row vector using an almost flat nested list::
+
+        >>> matrix([1, 2, 3])
+        matrix(
+        [['1.0'],
+         ['2.0'],
+         ['3.0']])
+        >>> matrix([[1, 2, 3]])
+        matrix(
+        [['1.0', '2.0', '3.0']])
+
+    Optionally vectors can be accessed like lists, using only a single index::
+
+        >>> x = matrix([1, 2, 3])
+        >>> x[1]
+        mpf('2.0')
+        >>> x[1,0]
+        mpf('2.0')
+
+    Other
+    .....
+
+    Like you probably expected, matrices can be printed::
+
+        >>> print(randmatrix(3)) # doctest:+SKIP
+        [ 0.782963853573023  0.802057689719883  0.427895717335467]
+        [0.0541876859348597  0.708243266653103  0.615134039977379]
+        [ 0.856151514955773  0.544759264818486  0.686210904770947]
+
+    Use ``nstr`` or ``nprint`` to specify the number of digits to print::
+
+        >>> nprint(randmatrix(5), 3) # doctest:+SKIP
+        [2.07e-1  1.66e-1  5.06e-1  1.89e-1  8.29e-1]
+        [6.62e-1  6.55e-1  4.47e-1  4.82e-1  2.06e-2]
+        [4.33e-1  7.75e-1  6.93e-2  2.86e-1  5.71e-1]
+        [1.01e-1  2.53e-1  6.13e-1  3.32e-1  2.59e-1]
+        [1.56e-1  7.27e-2  6.05e-1  6.67e-2  2.79e-1]
+
+    As matrices are mutable, you will need to copy them sometimes::
+
+        >>> A = matrix(2)
+        >>> A
+        matrix(
+        [['0.0', '0.0'],
+         ['0.0', '0.0']])
+        >>> B = A.copy()
+        >>> B[0,0] = 1
+        >>> B
+        matrix(
+        [['1.0', '0.0'],
+         ['0.0', '0.0']])
+        >>> A
+        matrix(
+        [['0.0', '0.0'],
+         ['0.0', '0.0']])
+
+    Finally, it is possible to convert a matrix to a nested list. This is very useful,
+    as most Python libraries involving matrices or arrays (namely NumPy or SymPy)
+    support this format::
+
+        >>> B.tolist()
+        [[mpf('1.0'), mpf('0.0')], [mpf('0.0'), mpf('0.0')]]
+
+
+    Matrix operations
+    -----------------
+
+    You can add and subtract matrices of compatible dimensions::
+
+        >>> A = matrix([[1, 2], [3, 4]])
+        >>> B = matrix([[-2, 4], [5, 9]])
+        >>> A + B
+        matrix(
+        [['-1.0', '6.0'],
+         ['8.0', '13.0']])
+        >>> A - B
+        matrix(
+        [['3.0', '-2.0'],
+         ['-2.0', '-5.0']])
+        >>> A + ones(3) # doctest:+ELLIPSIS
+        Traceback (most recent call last):
+          ...
+        ValueError: incompatible dimensions for addition
+
+    It is possible to multiply or add matrices and scalars. In the latter case the
+    operation will be done element-wise::
+
+        >>> A * 2
+        matrix(
+        [['2.0', '4.0'],
+         ['6.0', '8.0']])
+        >>> A / 4
+        matrix(
+        [['0.25', '0.5'],
+         ['0.75', '1.0']])
+        >>> A - 1
+        matrix(
+        [['0.0', '1.0'],
+         ['2.0', '3.0']])
+
+    Of course you can perform matrix multiplication, if the dimensions are
+    compatible, using ``@`` (for Python >= 3.5) or ``*``. For clarity, ``@`` is
+    recommended (`PEP 465 <https://www.python.org/dev/peps/pep-0465/>`_), because
+    the meaning of ``*`` is different in many other Python libraries such as NumPy.
+
+        >>> A @ B # doctest:+SKIP
+        matrix(
+        [['8.0', '22.0'],
+         ['14.0', '48.0']])
+        >>> A * B # same as A @ B
+        matrix(
+        [['8.0', '22.0'],
+         ['14.0', '48.0']])
+        >>> matrix([[1, 2, 3]]) * matrix([[-6], [7], [-2]])
+        matrix(
+        [['2.0']])
+
+    ..
+        COMMENT: TODO: the above "doctest:+SKIP" may be removed as soon as we
+        have dropped support for Python 3.4 and below.
+
+    You can raise powers of square matrices::
+
+        >>> A**2
+        matrix(
+        [['7.0', '10.0'],
+         ['15.0', '22.0']])
+
+    Negative powers will calculate the inverse::
+
+        >>> A**-1
+        matrix(
+        [['-2.0', '1.0'],
+         ['1.5', '-0.5']])
+        >>> A * A**-1
+        matrix(
+        [['1.0', '1.0842021724855e-19'],
+         ['-2.16840434497101e-19', '1.0']])
+
+
+
+    Matrix transposition is straightforward::
+
+        >>> A = ones(2, 3)
+        >>> A
+        matrix(
+        [['1.0', '1.0', '1.0'],
+         ['1.0', '1.0', '1.0']])
+        >>> A.T
+        matrix(
+        [['1.0', '1.0'],
+         ['1.0', '1.0'],
+         ['1.0', '1.0']])
+
+    Norms
+    .....
+
+    Sometimes you need to know how "large" a matrix or vector is. Due to their
+    multidimensional nature it's not possible to compare them, but there are
+    several functions to map a matrix or a vector to a positive real number, the
+    so called norms.
+
+    For vectors the p-norm is intended, usually the 1-, the 2- and the oo-norm are
+    used.
+
+        >>> x = matrix([-10, 2, 100])
+        >>> norm(x, 1)
+        mpf('112.0')
+        >>> norm(x, 2)
+        mpf('100.5186549850325')
+        >>> norm(x, inf)
+        mpf('100.0')
+
+    Please note that the 2-norm is the most used one, though it is more expensive
+    to calculate than the 1- or oo-norm.
+
+    It is possible to generalize some vector norms to matrix norm::
+
+        >>> A = matrix([[1, -1000], [100, 50]])
+        >>> mnorm(A, 1)
+        mpf('1050.0')
+        >>> mnorm(A, inf)
+        mpf('1001.0')
+        >>> mnorm(A, 'F')
+        mpf('1006.2310867787777')
+
+    The last norm (the "Frobenius-norm") is an approximation for the 2-norm, which
+    is hard to calculate and not available. The Frobenius-norm lacks some
+    mathematical properties you might expect from a norm.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Build a matrix from the first positional argument, which may be:
+
+        * a nested list/tuple -- one inner sequence per row;
+        * a flat list/tuple -- a column vector (``len(v)`` rows, 1 column);
+        * an int -- an all-zero square matrix, or ``rows x cols`` when a
+          second int is given (TypeError if the second argument is no int);
+        * another matrix -- copied element by element;
+        * any object with a ``tolist()`` method -- converted through it.
+
+        Anything else raises TypeError.  The only keyword that is looked at
+        is ``force_type``, which merely triggers a warning.
+
+        NOTE(review): ``args[0]`` and ``args[0][0]`` are read unguarded, so
+        calling without arguments or with an empty list raises IndexError.
+        """
+        # sparse storage: maps (row, column) -> non-zero value
+        self.__data = {}
+        # LU decomposition cache, this is useful when solving the same system
+        # multiple times, when calculating the inverse and when calculating the
+        # determinant
+        self._LU = None
+        if "force_type" in kwargs:
+            warnings.warn("The force_type argument was removed, it did not work"
+                " properly anyway. If you want to force floating-point or"
+                " interval computations, use the respective methods from `fp`"
+                " or `mp` instead, e.g., `fp.matrix()` or `iv.matrix()`."
+                " If you want to truncate values to integer, use .apply(int) instead.")
+        if isinstance(args[0], (list, tuple)):
+            if isinstance(args[0][0], (list, tuple)):
+                # interpret nested list as matrix
+                A = args[0]
+                self.__rows = len(A)
+                # the column count is taken from the first row only
+                self.__cols = len(A[0])
+                for i, row in enumerate(A):
+                    for j, a in enumerate(row):
+                        # note: this will call __setitem__ which will call self.ctx.convert() to convert the datatype.
+                        self[i, j] = a
+            else:
+                # interpret flat list as column vector
+                v = args[0]
+                self.__rows = len(v)
+                self.__cols = 1
+                for i, e in enumerate(v):
+                    self[i, 0] = e
+        elif isinstance(args[0], int):
+            # create empty matrix of given dimensions
+            if len(args) == 1:
+                self.__rows = self.__cols = args[0]
+            else:
+                if not isinstance(args[1], int):
+                    raise TypeError("expected int")
+                self.__rows = args[0]
+                self.__cols = args[1]
+        elif isinstance(args[0], _matrix):
+            # copy another matrix entry by entry (values are re-converted
+            # by __setitem__)
+            A = args[0]
+            self.__rows = A._matrix__rows
+            self.__cols = A._matrix__cols
+            for i in xrange(A.__rows):
+                for j in xrange(A.__cols):
+                    self[i, j] = A[i, j]
+        elif hasattr(args[0], 'tolist'):
+            # build a temporary matrix from the nested list and adopt its
+            # storage and dimensions
+            A = self.ctx.matrix(args[0].tolist())
+            self.__data = A._matrix__data
+            self.__rows = A._matrix__rows
+            self.__cols = A._matrix__cols
+        else:
+            raise TypeError('could not interpret given arguments')
+
+    def apply(self, f):
+        """
+        Build a new matrix of the same shape whose entry (i, j) is
+        ``f(self[i, j])``.  ``self`` is not modified.
+        """
+        result = self.ctx.matrix(self.__rows, self.__cols)
+        for row in xrange(self.__rows):
+            for col in xrange(self.__cols):
+                entry = self[row, col]
+                result[row, col] = f(entry)
+        return result
+
+    def __nstr__(self, n=None, **kwargs):
+        """
+        Render the matrix as text: one bracketed line per row, with the
+        entries right-aligned per column.
+
+        With a truthy ``n`` every entry is formatted by
+        ``self.ctx.nstr(entry, n, **kwargs)``; otherwise plain ``str`` is used.
+        """
+        if n:
+            def fmt(entry):
+                return self.ctx.nstr(entry, n, **kwargs)
+        else:
+            fmt = str
+        n_rows, n_cols = self.rows, self.cols
+        # Table of the string representation of every entry
+        cells = [[fmt(self[i,j]) for j in range(n_cols)] for i in range(n_rows)]
+        # Widest string of each column, used to line the columns up
+        widths = [0] * n_cols
+        for line in cells:
+            for j, text in enumerate(line):
+                widths[j] = max(len(text), widths[j])
+        # Pad every entry to its column width and assemble the rows
+        rendered = []
+        for line in cells:
+            padded = [text.rjust(widths[j]) for j, text in enumerate(line)]
+            rendered.append("[" + colsep.join(padded) + "]")
+        return rowsep.join(rendered)
+
+    def __str__(self):
+        """Return the aligned text form produced by ``__nstr__()`` with its default arguments."""
+        return self.__nstr__()
+
+    def _toliststr(self, avoid_type=False):
+        """
+        Create a list string from a matrix.
+
+        The result looks like a nested list literal, one inner list per row,
+        with the rows separated by ``',\\n '``.
+
+        If avoid_type: avoid multiple 'mpf's, i.e. entries that are instances
+        of ``self.ctx.mpf`` are written as their quoted ``str()`` instead of
+        their ``repr()``.
+
+        Matrices with zero rows or zero columns produce well-formed output
+        (``'[]'`` and ``'[[], ...]'``).
+        """
+        # XXX: should be something like self.ctx._types
+        typ = self.ctx.mpf
+
+        def fmt(value):
+            # quoted plain string for the context's mpf type, repr otherwise
+            if avoid_type and isinstance(value, typ):
+                return "'" + str(value) + "'"
+            return repr(value)
+
+        # join() instead of trimming trailing separators, so that empty rows
+        # and empty matrices do not lose their brackets
+        lines = []
+        for i in range(self.__rows):
+            entries = [fmt(self[i,j]) for j in range(self.__cols)]
+            lines.append('[' + ', '.join(entries) + ']')
+        return '[' + ',\n '.join(lines) + ']'
+
+    def tolist(self):
+        """
+        Return the entries as a list of rows, each row being a list of the
+        values ``self[i, j]``.
+        """
+        nested = []
+        for i in range(self.__rows):
+            row = []
+            for j in range(self.__cols):
+                row.append(self[i,j])
+            nested.append(row)
+        return nested
+
+    def __repr__(self):
+        """
+        Return the aligned text form when ``self.ctx.pretty`` is set;
+        otherwise return ``matrix(`` followed by the nested list string.
+        """
+        if not self.ctx.pretty:
+            return 'matrix(\n' + self._toliststr(avoid_type=True) + ')'
+        return self.__str__()
+
+    def __get_element(self, key):
+        '''
+        Fast lookup of the entry stored under the (i, j) tuple ``key``.
+            Private and unsafe by design:
+                1. ``key`` is not validated; it must already be an integer tuple (i,j)
+                2. no bounds check is done
+            Entries that are not stored read as ``self.ctx.zero``.
+        '''
+        try:
+            return self.__data[key]
+        except KeyError:
+            return self.ctx.zero
+
+    def __set_element(self, key, value):
+        '''
+        Fast store of ``value`` under the (i, j) tuple ``key``.
+            Private and unsafe by design:
+                1. ``key`` is not validated; it must already be an integer tuple (i,j)
+                2. no bounds check is done
+                3. ``value`` is stored as given, without type conversion
+                4. the LU cache is not reset
+            Falsy values are not stored; an existing entry is dropped instead.
+        '''
+        if not value:
+            # zeros are represented by the absence of the key
+            self.__data.pop(key, None)
+            return
+        self.__data[key] = value
+
+
+    def __getitem__(self, key):
+        '''
+            Item access for the matrix class, with slice support.
+
+            ``A[i, j]`` returns a single entry (``self.ctx.zero`` when nothing
+            is stored there).  If at least one index is a slice, a new matrix
+            holding the selected entries is returned, e.g.
+                B = A[:,2:6]
+            A single int or slice is accepted for vectors (one row or one
+            column) only; otherwise IndexError is raised.
+
+            Raises IndexError when a slice has a negative start or a stop
+            beyond the accepted bound, or when a single-entry index is
+            greater than or equal to the matrix dimension.
+        '''
+        # Convert vector to matrix indexing
+        if isinstance(key, int) or isinstance(key,slice):
+            # only sufficient for vectors
+            if self.__rows == 1:
+                key = (0, key)
+            elif self.__cols == 1:
+                key = (key, 0)
+            else:
+                raise IndexError('insufficient indices for matrix')
+
+        if isinstance(key[0],slice) or isinstance(key[1],slice):
+
+            #Rows
+            if isinstance(key[0],slice):
+                #Check bounds
+                # NOTE(review): the test accepts stop == rows + 1, which looks
+                # like an off-by-one; slice.indices() clamps it afterwards.
+                if (key[0].start is None or key[0].start >= 0) and \
+                    (key[0].stop is None or key[0].stop <= self.__rows+1):
+                    # Generate indices
+                    rows = xrange(*key[0].indices(self.__rows))
+                else:
+                    raise IndexError('Row index out of bounds')
+            else:
+                # Single row
+                # NOTE(review): this index is not range-checked; out-of-range
+                # rows read as zeros via __get_element.
+                rows = [key[0]]
+
+            # Columns
+            if isinstance(key[1],slice):
+                # Check bounds
+                # NOTE(review): same stop <= cols + 1 test as for the rows.
+                if (key[1].start is None or key[1].start >= 0) and \
+                    (key[1].stop is None or key[1].stop <= self.__cols+1):
+                    # Generate indices
+                    columns = xrange(*key[1].indices(self.__cols))
+                else:
+                    raise IndexError('Column index out of bounds')
+
+            else:
+                # Single column
+                # NOTE(review): not range-checked, see the single-row case.
+                columns = [key[1]]
+
+            # Create matrix slice
+            m = self.ctx.matrix(len(rows),len(columns))
+
+            # Assign elements to the output matrix
+            # (uses the unchecked fast accessors; values are copied as stored)
+            for i,x in enumerate(rows):
+                for j,y in enumerate(columns):
+                    m.__set_element((i,j),self.__get_element((x,y)))
+
+            return m
+
+        else:
+            # single element extraction
+            # NOTE(review): only the upper bound is checked; a negative index
+            # is not rejected and reads as zero unless such a key is stored.
+            if key[0] >= self.__rows or key[1] >= self.__cols:
+                raise IndexError('matrix index out of range')
+            if key in self.__data:
+                return self.__data[key]
+            else:
+                return self.ctx.zero
+
+    def __setitem__(self, key, value):
+        # Item assignment for the matrix class, with slice support.
+        # It allows the following assignments:
+        #  scalar to a slice of the matrix
+        # A[:,2:6] = 2.5
+        #  submatrix to matrix (the value matrix must have exactly the shape
+        #  selected by the slice, otherwise ValueError is raised)
+        # A[3,:] = B   where B has one row and as many columns as A
+        #  single entry
+        # A[i,j] = x   (x is converted with self.ctx.convert)
+        # Any assignment drops a cached LU decomposition.
+        # Convert vector to matrix indexing
+        if isinstance(key, int) or isinstance(key,slice):
+            # only sufficient for vectors
+            if self.__rows == 1:
+                key = (0, key)
+            elif self.__cols == 1:
+                key = (key, 0)
+            else:
+                raise IndexError('insufficient indices for matrix')
+        # Slice indexing
+        if isinstance(key[0],slice) or isinstance(key[1],slice):
+            # Rows
+            if isinstance(key[0],slice):
+                # Check bounds
+                # NOTE(review): the test accepts stop == rows + 1, which looks
+                # like an off-by-one; slice.indices() clamps it afterwards.
+                if (key[0].start is None or key[0].start >= 0) and \
+                    (key[0].stop is None or key[0].stop <= self.__rows+1):
+                    # generate row indices
+                    rows = xrange(*key[0].indices(self.__rows))
+                else:
+                    raise IndexError('Row index out of bounds')
+            else:
+                # Single row
+                # NOTE(review): this index is not range-checked before the
+                # unchecked __set_element calls below.
+                rows = [key[0]]
+            # Columns
+            if isinstance(key[1],slice):
+                # Check bounds
+                # NOTE(review): same stop <= cols + 1 test as for the rows.
+                if (key[1].start is None or key[1].start >= 0) and \
+                    (key[1].stop is None or key[1].stop <= self.__cols+1):
+                    # Generate column indices
+                    columns = xrange(*key[1].indices(self.__cols))
+                else:
+                    raise IndexError('Column index out of bounds')
+            else:
+                # Single column
+                # NOTE(review): not range-checked, see the single-row case.
+                columns = [key[1]]
+            # Assign slice from a matrix
+            if isinstance(value,self.ctx.matrix):
+                # Assign elements to matrix if input and output dimensions match
+                # (values are copied as stored, without conversion)
+                if len(rows) == value.rows and len(columns) == value.cols:
+                    for i,x in enumerate(rows):
+                        for j,y in enumerate(columns):
+                            self.__set_element((x,y), value.__get_element((i,j)))
+                else:
+                    raise ValueError('Dimensions do not match')
+            else:
+                # Assign slice with scalars
+                # (the scalar is converted once and stored in every position)
+                value = self.ctx.convert(value)
+                for i in rows:
+                    for j in columns:
+                        self.__set_element((i,j), value)
+        else:
+            # Single element assignment
+            # Check bounds
+            # NOTE(review): only the upper bound is checked; a negative index
+            # is stored under its negative key.
+            if key[0] >= self.__rows or key[1] >= self.__cols:
+                raise IndexError('matrix index out of range')
+            # Convert and store value
+            value = self.ctx.convert(value)
+            if value: # only store non-zeros
+                self.__data[key] = value
+            elif key in self.__data:
+                del self.__data[key]
+
+        # the matrix changed, so a cached LU decomposition is no longer valid
+        if self._LU:
+            self._LU = None
+        return
+
+    def __iter__(self):
+        """Yield every entry ``self[i, j]``, row by row."""
+        for row in xrange(self.__rows):
+            for col in xrange(self.__cols):
+                entry = self[row, col]
+                yield entry
+
+    def __mul__(self, other):
+        """
+        Multiply by a matrix (matrix product) or by anything else, which is
+        treated as a scalar and multiplied into every entry.
+
+        Raises ValueError if ``other`` is a matrix whose row count differs
+        from the column count of ``self``.
+        """
+        if not isinstance(other, self.ctx.matrix):
+            # try scalar multiplication
+            scaled = self.ctx.matrix(self.__rows, self.__cols)
+            for i in xrange(self.__rows):
+                for j in xrange(self.__cols):
+                    scaled[i, j] = other * self[i, j]
+            return scaled
+        # dot multiplication
+        if self.__cols != other.__rows:
+            raise ValueError('dimensions not compatible for multiplication')
+        product = self.ctx.matrix(self.__rows, other.__cols)
+        # read the sparse storage directly; missing keys count as zero
+        lhs_zero = self.ctx.zero
+        lhs_get = self.__data.get
+        rhs_zero = other.ctx.zero
+        rhs_get = other.__data.get
+        for i in xrange(self.__rows):
+            for j in xrange(other.__cols):
+                pairs = ((lhs_get((i,k), lhs_zero), rhs_get((k,j), rhs_zero))
+                         for k in xrange(other.__rows))
+                product[i, j] = self.ctx.fdot(pairs)
+        return product
+
+    def __matmul__(self, other):
+        """Implement ``self @ other`` by delegating to ``__mul__``."""
+        return self.__mul__(other)
+
+    def __rmul__(self, other):
+        """
+        Implement ``other * self`` for a non-matrix left operand.
+
+        The operand is assumed to be a scalar, hence commutative, so the
+        work is done by ``__mul__``.  A matrix operand raises TypeError.
+        """
+        if not isinstance(other, self.ctx.matrix):
+            return self.__mul__(other)
+        raise TypeError("other should not be type of ctx.matrix")
+
+    def __pow__(self, other):
+        """
+        Raise a square matrix to an integer power by repeated squaring.
+
+        Exponent 0 gives ``self.ctx.eye(rows)``; for a negative exponent the
+        positive power is computed first and then passed to
+        ``self.ctx.inverse``.
+
+        Raises ValueError for a non-int exponent or a non-square matrix.
+        """
+        # avoid cyclic import problems
+        #from linalg import inverse
+        if not isinstance(other, int):
+            raise ValueError('only integer exponents are supported')
+        if self.__rows != self.__cols:
+            raise ValueError('only powers of square matrices are defined')
+        if other == 0:
+            return self.ctx.eye(self.__rows)
+        invert = other < 0
+        remaining = -other if invert else other
+        # result starts as the scalar 1 and becomes a matrix with the first
+        # multiplication
+        result = 1
+        base = self.copy()
+        while remaining != 0:
+            if remaining % 2 == 1:
+                result = result * base
+            base = base*base
+            remaining = remaining // 2
+        if invert:
+            result = self.ctx.inverse(result)
+        return result
+
+    def __div__(self, other):
+        """
+        Divide every entry by the scalar ``other`` and return the result as
+        a new matrix.
+
+        Raises TypeError if ``other`` is a matrix.  The check is an explicit
+        raise, matching ``__rmul__``, because an ``assert`` would be removed
+        when Python runs with -O.
+        """
+        # assume other is scalar and do element-wise division
+        if isinstance(other, self.ctx.matrix):
+            raise TypeError("other should not be type of ctx.matrix")
+        new = self.ctx.matrix(self.__rows, self.__cols)
+        for i in xrange(self.__rows):
+            for j in xrange(self.__cols):
+                new[i,j] = self[i,j] / other
+        return new
+
+    __truediv__ = __div__
+
+    def __add__(self, other):
+        """
+        Add a matrix of the same shape entry by entry, or add anything else
+        (assumed to be a scalar) to every entry.  Returns a new matrix.
+
+        Raises ValueError if ``other`` is a matrix of a different shape.
+        """
+        other_is_matrix = isinstance(other, self.ctx.matrix)
+        if other_is_matrix:
+            if self.__rows != other.__rows or self.__cols != other.__cols:
+                raise ValueError('incompatible dimensions for addition')
+        total = self.ctx.matrix(self.__rows, self.__cols)
+        for i in xrange(self.__rows):
+            for j in xrange(self.__cols):
+                if other_is_matrix:
+                    total[i,j] = self[i,j] + other[i,j]
+                else:
+                    # assume other is scalar; the sum is accumulated onto
+                    # the zero entry of the fresh matrix
+                    total[i,j] += self[i,j] + other
+        return total
+
+    def __radd__(self, other):
+        """Implement ``other + self`` by delegating to ``__add__``."""
+        total = self.__add__(other)
+        return total
+
+    def __sub__(self, other):
+        """
+        Return ``self - other``, computed as ``self + other * (-1)``.
+
+        Raises ValueError if *other* is a matrix of different dimensions.
+        """
+        if isinstance(other, self.ctx.matrix):
+            same_shape = (self.__rows == other.__rows
+                          and self.__cols == other.__cols)
+            if not same_shape:
+                raise ValueError('incompatible dimensions for subtraction')
+        negated = other * (-1)
+        return self.__add__(negated)
+
+    def __pos__(self):
+        """
+        Implement ``+M``: a copy of M, rounded to current working precision.
+        """
+        # multiplying by one builds a new matrix element by element
+        return 1 * self
+
+    def __neg__(self):
+        """Implement ``-M`` as the scalar product ``(-1) * M``."""
+        minus_one = -1
+        return minus_one * self
+
+    def __rsub__(self, other):
+        """Implement ``other - self`` as ``(-self) + other``."""
+        negated = -self
+        return negated + other
+
+    def __eq__(self, other):
+        """
+        Return True if *other* is a matrix with the same dimensions and the
+        same stored entries.
+
+        For any operand that is not a ``_matrix`` instance NotImplemented is
+        returned, so that ``M == 5`` or ``M in some_list`` evaluate normally
+        instead of raising AttributeError on the missing private attributes.
+        """
+        if not isinstance(other, _matrix):
+            return NotImplemented
+        return self.__rows == other.__rows and self.__cols == other.__cols \
+               and self.__data == other.__data
+
+    def __len__(self):
+        """
+        Return the number of columns for a single-row matrix and the number
+        of rows otherwise (for general matrices this is done like numpy).
+        """
+        if self.rows == 1:
+            return self.cols
+        # column vectors and general matrices both report their row count
+        return self.rows
+
+    def __getrows(self):
+        """Return the current number of rows."""
+        return self.__rows
+
+    def __setrows(self, value):
+        """
+        Set the number of rows to *value*, dropping every stored entry whose
+        row index is no longer in range.
+        """
+        # collect the keys first so the mapping is not mutated while iterating
+        stale = [key for key in self.__data if key[0] >= value]
+        for key in stale:
+            del self.__data[key]
+        self.__rows = value
+
+    rows = property(__getrows, __setrows, doc='number of rows')
+
+    def __getcols(self):
+        """Return the current number of columns."""
+        return self.__cols
+
+    def __setcols(self, value):
+        """
+        Set the number of columns to *value*, dropping every stored entry
+        whose column index is no longer in range.
+        """
+        # collect the keys first so the mapping is not mutated while iterating
+        stale = [key for key in self.__data if key[1] >= value]
+        for key in stale:
+            del self.__data[key]
+        self.__cols = value
+
+    cols = property(__getcols, __setcols, doc='number of columns')
+
+    def transpose(self):
+        """Return a new matrix whose entry [j,i] equals ``self[i,j]``."""
+        n_rows, n_cols = self.__rows, self.__cols
+        flipped = self.ctx.matrix(n_cols, n_rows)
+        for r in xrange(n_rows):
+            for c in xrange(n_cols):
+                flipped[c,r] = self[r,c]
+        return flipped
+
+    # ``M.T`` is attribute-style access to the transpose
+    T = property(transpose)
+
+    def conjugate(self):
+        """Return the result of applying ``ctx.conj`` through ``self.apply``."""
+        conj = self.ctx.conj
+        return self.apply(conj)
+
+    def transpose_conj(self):
+        """Return the transpose of the conjugated matrix."""
+        conjugated = self.conjugate()
+        return conjugated.transpose()
+
+    # ``M.H`` is attribute-style access to the conjugate transpose
+    H = property(transpose_conj)
+
+    def copy(self):
+        """
+        Return a new matrix of the same dimensions holding a shallow copy of
+        this matrix's entry mapping.
+        """
+        duplicate = self.ctx.matrix(self.__rows, self.__cols)
+        entries = self.__data.copy()
+        duplicate.__data = entries
+        return duplicate
+
+    # lets ``copy.copy(M)`` use the same implementation
+    __copy__ = copy
+
+    def column(self, n):
+        """Return column *n* as a new matrix with ``self.rows`` rows and 1 column."""
+        height = self.rows
+        vec = self.ctx.matrix(height, 1)
+        for r in range(height):
+            vec[r] = self[r,n]
+        return vec
+
+class MatrixMethods(object):
+    """
+    Mixin that gives a context a ``matrix`` type plus matrix constructors
+    (eye, diag, zeros, ones, hilbert, randmatrix), small helpers (swap_row,
+    extend) and the norm / mnorm functions.
+    """
+
+    def __init__(ctx):
+        """Create a per-context subclass of ``_matrix`` bound to *ctx*."""
+        # XXX: subclass
+        ctx.matrix = type('matrix', (_matrix,), {})
+        ctx.matrix.ctx = ctx
+        ctx.matrix.convert = ctx.convert
+
+    def eye(ctx, n, **kwargs):
+        """
+        Create square identity matrix n x n.
+        """
+        A = ctx.matrix(n, **kwargs)
+        for i in xrange(n):
+            A[i,i] = 1
+        return A
+
+    def diag(ctx, diagonal, **kwargs):
+        """
+        Create square diagonal matrix using given list.
+
+        Example:
+        >>> from mpmath import diag, mp
+        >>> mp.pretty = False
+        >>> diag([1, 2, 3])
+        matrix(
+        [['1.0', '0.0', '0.0'],
+         ['0.0', '2.0', '0.0'],
+         ['0.0', '0.0', '3.0']])
+        """
+        A = ctx.matrix(len(diagonal), **kwargs)
+        for i in xrange(len(diagonal)):
+            A[i,i] = diagonal[i]
+        return A
+
+    def zeros(ctx, *args, **kwargs):
+        """
+        Create matrix m x n filled with zeros.
+        One given dimension will create square matrix n x n.
+
+        Raises TypeError unless exactly one or two dimensions are given.
+
+        Example:
+        >>> from mpmath import zeros, mp
+        >>> mp.pretty = False
+        >>> zeros(2)
+        matrix(
+        [['0.0', '0.0'],
+         ['0.0', '0.0']])
+        """
+        if len(args) == 1:
+            m = n = args[0]
+        elif len(args) == 2:
+            m = args[0]
+            n = args[1]
+        else:
+            raise TypeError('zeros expected at most 2 arguments, got %i' % len(args))
+        A = ctx.matrix(m, n, **kwargs)
+        for i in xrange(m):
+            for j in xrange(n):
+                A[i,j] = 0
+        return A
+
+    def ones(ctx, *args, **kwargs):
+        """
+        Create matrix m x n filled with ones.
+        One given dimension will create square matrix n x n.
+
+        Raises TypeError unless exactly one or two dimensions are given.
+
+        Example:
+        >>> from mpmath import ones, mp
+        >>> mp.pretty = False
+        >>> ones(2)
+        matrix(
+        [['1.0', '1.0'],
+         ['1.0', '1.0']])
+        """
+        if len(args) == 1:
+            m = n = args[0]
+        elif len(args) == 2:
+            m = args[0]
+            n = args[1]
+        else:
+            raise TypeError('ones expected at most 2 arguments, got %i' % len(args))
+        A = ctx.matrix(m, n, **kwargs)
+        for i in xrange(m):
+            for j in xrange(n):
+                A[i,j] = 1
+        return A
+
+    def hilbert(ctx, m, n=None):
+        """
+        Create (pseudo) hilbert matrix m x n.
+        One given dimension will create hilbert matrix n x n.
+
+        The matrix is very ill-conditioned and symmetric, positive definite if
+        square.
+        """
+        if n is None:
+            n = m
+        A = ctx.matrix(m, n)
+        for i in xrange(m):
+            for j in xrange(n):
+                A[i,j] = ctx.one / (i + j + 1)
+        return A
+
+    def randmatrix(ctx, m, n=None, min=0, max=1, **kwargs):
+        """
+        Create a random m x n matrix.
+
+        All values are >= min and <max.
+        n defaults to m when it is omitted (or None); an explicit ``n=0`` is
+        respected and requests a matrix with no columns.
+
+        Example:
+        >>> from mpmath import randmatrix
+        >>> randmatrix(2) # doctest:+SKIP
+        matrix(
+        [['0.53491598236191806', '0.57195669543302752'],
+         ['0.85589992269513615', '0.82444367501382143']])
+        """
+        # test against None, as hilbert() does: ``not n`` would also replace
+        # an explicitly requested n == 0 by m
+        if n is None:
+            n = m
+        A = ctx.matrix(m, n, **kwargs)
+        for i in xrange(m):
+            for j in xrange(n):
+                A[i,j] = ctx.rand() * (max - min) + min
+        return A
+
+    def swap_row(ctx, A, i, j):
+        """
+        Swap row i with row j.
+
+        *A* is modified in place and may be a ``ctx.matrix`` or a list;
+        any other type raises TypeError.
+        """
+        if i == j:
+            return
+        if isinstance(A, ctx.matrix):
+            for k in xrange(A.cols):
+                A[i,k], A[j,k] = A[j,k], A[i,k]
+        elif isinstance(A, list):
+            A[i], A[j] = A[j], A[i]
+        else:
+            raise TypeError('could not interpret type')
+
+    def extend(ctx, A, b):
+        """
+        Extend matrix A with column b and return result.
+
+        A itself is left unchanged; the new column is appended to a copy.
+        Raises TypeError if A is not a ``ctx.matrix`` and ValueError if
+        ``len(b)`` differs from the number of rows of A.
+        """
+        if not isinstance(A, ctx.matrix):
+            raise TypeError("A should be a type of ctx.matrix")
+        if A.rows != len(b):
+            raise ValueError("Value should be equal to len(b)")
+        A = A.copy()
+        A.cols += 1
+        for i in xrange(A.rows):
+            A[i, A.cols-1] = b[i]
+        return A
+
+    def norm(ctx, x, p=2):
+        r"""
+        Gives the entrywise `p`-norm of an iterable *x*, i.e. the vector norm
+        `\left(\sum_k |x_k|^p\right)^{1/p}`, for any given `1 \le p \le \infty`.
+
+        Special cases:
+
+        If *x* is not iterable, this just returns ``absmax(x)``.
+
+        ``p=1`` gives the sum of absolute values.
+
+        ``p=2`` is the standard Euclidean vector norm.
+
+        ``p=inf`` gives the magnitude of the largest element.
+
+        For *x* a matrix, ``p=2`` is the Frobenius norm.
+        For operator matrix norms, use :func:`~mpmath.mnorm` instead.
+
+        You can use the string 'inf' as well as float('inf') or mpf('inf')
+        to specify the infinity norm.
+
+        **Examples**
+
+            >>> from mpmath import *
+            >>> mp.dps = 15; mp.pretty = False
+            >>> x = matrix([-10, 2, 100])
+            >>> norm(x, 1)
+            mpf('112.0')
+            >>> norm(x, 2)
+            mpf('100.5186549850325')
+            >>> norm(x, inf)
+            mpf('100.0')
+
+        """
+        try:
+            iter(x)
+        except TypeError:
+            return ctx.absmax(x)
+        if type(p) is not int:
+            p = ctx.convert(p)
+        if p == ctx.inf:
+            return max(ctx.absmax(i) for i in x)
+        elif p == 1:
+            return ctx.fsum(x, absolute=1)
+        elif p == 2:
+            return ctx.sqrt(ctx.fsum(x, absolute=1, squared=1))
+        elif p > 1:
+            return ctx.nthroot(ctx.fsum(abs(i)**p for i in x), p)
+        else:
+            raise ValueError('p has to be >= 1')
+
+    def mnorm(ctx, A, p=1):
+        r"""
+        Gives the matrix (operator) `p`-norm of A. Currently ``p=1`` and ``p=inf``
+        are supported:
+
+        ``p=1`` gives the 1-norm (maximal column sum)
+
+        ``p=inf`` gives the `\infty`-norm (maximal row sum).
+        You can use the string 'inf' as well as float('inf') or mpf('inf')
+
+        ``p=2`` (not implemented) for a square matrix is the usual spectral
+        matrix norm, i.e. the largest singular value.
+
+        ``p='f'`` (or 'F', 'fro', 'Frobenius', 'frobenius') gives the
+        Frobenius norm, which is the elementwise 2-norm. The Frobenius norm is an
+        approximation of the spectral norm and satisfies
+
+        .. math ::
+
+            \frac{1}{\sqrt{\mathrm{rank}(A)}} \|A\|_F \le \|A\|_2 \le \|A\|_F
+
+        The Frobenius norm lacks some mathematical properties that might
+        be expected of a norm.
+
+        For general elementwise `p`-norms, use :func:`~mpmath.norm` instead.
+
+        **Examples**
+
+            >>> from mpmath import *
+            >>> mp.dps = 15; mp.pretty = False
+            >>> A = matrix([[1, -1000], [100, 50]])
+            >>> mnorm(A, 1)
+            mpf('1050.0')
+            >>> mnorm(A, inf)
+            mpf('1001.0')
+            >>> mnorm(A, 'F')
+            mpf('1006.2310867787777')
+
+        """
+        A = ctx.matrix(A)
+        if type(p) is not int:
+            # any case-insensitive prefix of 'frobenius' selects the
+            # Frobenius norm
+            if type(p) is str and 'frobenius'.startswith(p.lower()):
+                return ctx.norm(A, 2)
+            p = ctx.convert(p)
+        m, n = A.rows, A.cols
+        if p == 1:
+            return max(ctx.fsum((A[i,j] for i in xrange(m)), absolute=1) for j in xrange(n))
+        elif p == ctx.inf:
+            return max(ctx.fsum((A[i,j] for j in xrange(n)), absolute=1) for i in xrange(m))
+        else:
+            raise NotImplementedError("matrix p-norm for arbitrary p")
+
+if __name__ == '__main__':
+    # Executed as a script: run the doctests embedded in this module.
+    from doctest import testmod
+    testmod()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..526ca48e4bc936b00dc167e834fc25cccf3538b3
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8ff30982e43b5f3841a27f842465ae57888dbde
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasLt.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasLt.h
new file mode 100644
index 0000000000000000000000000000000000000000..36b1210242495c6d4e0fa26e62583832a65004f9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasLt.h
@@ -0,0 +1,1853 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#pragma once
+
+/* CUBLASAPI: unless the includer has already defined it, expands to
+ * `__host__ __device__` when __CUDACC__ is defined and to nothing otherwise.
+ */
+#ifndef CUBLASAPI
+#ifdef __CUDACC__
+#define CUBLASAPI __host__ __device__
+#else
+#define CUBLASAPI
+#endif
+#endif
+
+#include <cublas_api.h>
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/** Opaque structure holding CUBLASLT context
+ *
+ * The handle is a pointer to the incomplete type struct cublasLtContext.
+ */
+typedef struct cublasLtContext* cublasLtHandle_t;
+
+/* Handle lifetime: cublasLtCreate() takes a pointer to a handle, cublasLtDestroy() takes the handle by value.
+ * NOTE(review): presumably Create allocates the context and Destroy releases it -- confirm against the cuBLASLt
+ * API reference.
+ */
+cublasStatus_t CUBLASWINAPI cublasLtCreate(cublasLtHandle_t* lightHandle);
+
+cublasStatus_t CUBLASWINAPI cublasLtDestroy(cublasLtHandle_t lightHandle);
+
+/* Both functions map a cublasStatus_t to a C string.
+ * NOTE(review): presumably the enumerator name and a human-readable description respectively -- confirm.
+ */
+const char* CUBLASWINAPI cublasLtGetStatusName(cublasStatus_t status);
+
+const char* CUBLASWINAPI cublasLtGetStatusString(cublasStatus_t status);
+
+/* Version queries; both take no arguments and return a size_t. */
+size_t CUBLASWINAPI cublasLtGetVersion(void);
+
+size_t CUBLASWINAPI cublasLtGetCudartVersion(void);
+
+/* Writes the requested library property through `value`. */
+cublasStatus_t CUBLASWINAPI cublasLtGetProperty(libraryPropertyType type, int* value);
+
+/* Getter/setter pair for the heuristics cache capacity.
+ * NOTE(review): the unit of `capacity` is not stated in this header -- confirm against the API reference.
+ */
+cublasStatus_t CUBLASWINAPI cublasLtHeuristicsCacheGetCapacity(size_t* capacity);
+cublasStatus_t CUBLASWINAPI cublasLtHeuristicsCacheSetCapacity(size_t capacity);
+
+/** Semi-opaque descriptor for matrix memory layout
+ *
+ * Storage is 8 x uint64_t (64 bytes); the contents are not described by this header.
+ */
+typedef struct {
+  uint64_t data[8];
+} cublasLtMatrixLayoutOpaque_t;
+
+/** Opaque descriptor for matrix memory layout
+ */
+typedef cublasLtMatrixLayoutOpaque_t* cublasLtMatrixLayout_t;
+
+/** Semi-opaque algorithm descriptor (to avoid complicated alloc/free schemes)
+ *
+ * This structure can be trivially serialized and later restored for use with the same version of cuBLAS library to save
+ * on selecting the right configuration again.
+ *
+ * Storage is 8 x uint64_t (64 bytes). Unlike the other descriptors in this group there is no pointer typedef for it;
+ * cublasLtMatmul() takes a `const cublasLtMatmulAlgo_t*`.
+ */
+typedef struct {
+  uint64_t data[8];
+} cublasLtMatmulAlgo_t;
+
+/** Semi-opaque descriptor for cublasLtMatmul() operation details
+ *
+ * Storage is 23 x uint64_t (184 bytes).
+ */
+typedef struct {
+  uint64_t data[23];
+} cublasLtMatmulDescOpaque_t;
+
+/** Opaque descriptor for cublasLtMatmul() operation details
+ */
+typedef cublasLtMatmulDescOpaque_t* cublasLtMatmulDesc_t;
+
+/** Semi-opaque descriptor for cublasLtMatrixTransform() operation details
+ *
+ * Storage is 8 x uint64_t (64 bytes).
+ */
+typedef struct {
+  uint64_t data[8];
+} cublasLtMatrixTransformDescOpaque_t;
+
+/** Opaque descriptor for cublasLtMatrixTransform() operation details
+ */
+typedef cublasLtMatrixTransformDescOpaque_t* cublasLtMatrixTransformDesc_t;
+
+/** Semi-opaque descriptor for cublasLtMatmulPreference() operation details
+ *
+ * Storage is 10 x uint64_t (80 bytes).
+ */
+typedef struct {
+  uint64_t data[10];
+} cublasLtMatmulPreferenceOpaque_t;
+
+/** Opaque descriptor for cublasLtMatmulAlgoGetHeuristic() configuration
+ */
+typedef cublasLtMatmulPreferenceOpaque_t* cublasLtMatmulPreference_t;
+
+/** Tile size (in C/D matrix Rows x Cols)
+ *
+ * General order of tile IDs is sorted by size first and by first dimension second.
+ *
+ * IDs 26 and above do not follow that ordering (e.g. 128x96 = 33 comes after 128x192 = 32).
+ * NOTE(review): presumably they were appended later so that existing IDs keep their values -- confirm.
+ *
+ * CUBLASLT_MATMUL_TILE_END has no explicit value, so it is one past the last tile ID (34).
+ */
+typedef enum {
+  CUBLASLT_MATMUL_TILE_UNDEFINED = 0,
+  CUBLASLT_MATMUL_TILE_8x8 = 1,
+  CUBLASLT_MATMUL_TILE_8x16 = 2,
+  CUBLASLT_MATMUL_TILE_16x8 = 3,
+  CUBLASLT_MATMUL_TILE_8x32 = 4,
+  CUBLASLT_MATMUL_TILE_16x16 = 5,
+  CUBLASLT_MATMUL_TILE_32x8 = 6,
+  CUBLASLT_MATMUL_TILE_8x64 = 7,
+  CUBLASLT_MATMUL_TILE_16x32 = 8,
+  CUBLASLT_MATMUL_TILE_32x16 = 9,
+  CUBLASLT_MATMUL_TILE_64x8 = 10,
+  CUBLASLT_MATMUL_TILE_32x32 = 11,
+  CUBLASLT_MATMUL_TILE_32x64 = 12,
+  CUBLASLT_MATMUL_TILE_64x32 = 13,
+  CUBLASLT_MATMUL_TILE_32x128 = 14,
+  CUBLASLT_MATMUL_TILE_64x64 = 15,
+  CUBLASLT_MATMUL_TILE_128x32 = 16,
+  CUBLASLT_MATMUL_TILE_64x128 = 17,
+  CUBLASLT_MATMUL_TILE_128x64 = 18,
+  CUBLASLT_MATMUL_TILE_64x256 = 19,
+  CUBLASLT_MATMUL_TILE_128x128 = 20,
+  CUBLASLT_MATMUL_TILE_256x64 = 21,
+  CUBLASLT_MATMUL_TILE_64x512 = 22,
+  CUBLASLT_MATMUL_TILE_128x256 = 23,
+  CUBLASLT_MATMUL_TILE_256x128 = 24,
+  CUBLASLT_MATMUL_TILE_512x64 = 25,
+  CUBLASLT_MATMUL_TILE_64x96 = 26,
+  CUBLASLT_MATMUL_TILE_96x64 = 27,
+  CUBLASLT_MATMUL_TILE_96x128 = 28,
+  CUBLASLT_MATMUL_TILE_128x160 = 29,
+  CUBLASLT_MATMUL_TILE_160x128 = 30,
+  CUBLASLT_MATMUL_TILE_192x128 = 31,
+  CUBLASLT_MATMUL_TILE_128x192 = 32,
+  CUBLASLT_MATMUL_TILE_128x96 = 33,
+  CUBLASLT_MATMUL_TILE_END
+} cublasLtMatmulTile_t;
+
+/** Size and number of stages in which elements are read into shared memory
+ *
+ * General order of stages IDs is sorted by stage size first and by number of stages second.
+ *
+ * Enumerator names have the form <stage size>x<number of stages>; IDs 25 and above (including the xAUTO
+ * entries) do not follow the ordering described above.
+ *
+ * CUBLASLT_MATMUL_STAGES_END has no explicit value, so it is one past the last stages ID (37).
+ */
+typedef enum {
+  CUBLASLT_MATMUL_STAGES_UNDEFINED = 0,
+  CUBLASLT_MATMUL_STAGES_16x1 = 1,
+  CUBLASLT_MATMUL_STAGES_16x2 = 2,
+  CUBLASLT_MATMUL_STAGES_16x3 = 3,
+  CUBLASLT_MATMUL_STAGES_16x4 = 4,
+  CUBLASLT_MATMUL_STAGES_16x5 = 5,
+  CUBLASLT_MATMUL_STAGES_16x6 = 6,
+  CUBLASLT_MATMUL_STAGES_32x1 = 7,
+  CUBLASLT_MATMUL_STAGES_32x2 = 8,
+  CUBLASLT_MATMUL_STAGES_32x3 = 9,
+  CUBLASLT_MATMUL_STAGES_32x4 = 10,
+  CUBLASLT_MATMUL_STAGES_32x5 = 11,
+  CUBLASLT_MATMUL_STAGES_32x6 = 12,
+  CUBLASLT_MATMUL_STAGES_64x1 = 13,
+  CUBLASLT_MATMUL_STAGES_64x2 = 14,
+  CUBLASLT_MATMUL_STAGES_64x3 = 15,
+  CUBLASLT_MATMUL_STAGES_64x4 = 16,
+  CUBLASLT_MATMUL_STAGES_64x5 = 17,
+  CUBLASLT_MATMUL_STAGES_64x6 = 18,
+  CUBLASLT_MATMUL_STAGES_128x1 = 19,
+  CUBLASLT_MATMUL_STAGES_128x2 = 20,
+  CUBLASLT_MATMUL_STAGES_128x3 = 21,
+  CUBLASLT_MATMUL_STAGES_128x4 = 22,
+  CUBLASLT_MATMUL_STAGES_128x5 = 23,
+  CUBLASLT_MATMUL_STAGES_128x6 = 24,
+  CUBLASLT_MATMUL_STAGES_32x10 = 25,
+  CUBLASLT_MATMUL_STAGES_8x4 = 26,
+  CUBLASLT_MATMUL_STAGES_16x10 = 27,
+  CUBLASLT_MATMUL_STAGES_8x5 = 28,
+  CUBLASLT_MATMUL_STAGES_16x80 = 29,
+  CUBLASLT_MATMUL_STAGES_64x80 = 30,
+  CUBLASLT_MATMUL_STAGES_8x3 = 31,
+  CUBLASLT_MATMUL_STAGES_8xAUTO = 32,
+  CUBLASLT_MATMUL_STAGES_16xAUTO = 33,
+  CUBLASLT_MATMUL_STAGES_32xAUTO = 34,
+  CUBLASLT_MATMUL_STAGES_64xAUTO = 35,
+  CUBLASLT_MATMUL_STAGES_128xAUTO = 36,
+  CUBLASLT_MATMUL_STAGES_END
+} cublasLtMatmulStages_t;
+
+/** Thread Block Cluster size
+ *
+ * Typically dimensioned similar to cublasLtMatmulTile_t, with the third coordinate unused at this time.
+ *
+ * Every shape listed here has 1 as its third coordinate. The value 1 is not assigned to any enumerator
+ * (AUTO is 0 and 1x1x1 is 2).
+ *
+ * CUBLASLT_CLUSTER_SHAPE_END has no explicit value, so it is one past the last shape ID (52).
+ */
+typedef enum {
+  /** Let library pick cluster shape automatically */
+  CUBLASLT_CLUSTER_SHAPE_AUTO = 0,
+  CUBLASLT_CLUSTER_SHAPE_1x1x1 = 2,
+  CUBLASLT_CLUSTER_SHAPE_2x1x1 = 3,
+  CUBLASLT_CLUSTER_SHAPE_4x1x1 = 4,
+  CUBLASLT_CLUSTER_SHAPE_1x2x1 = 5,
+  CUBLASLT_CLUSTER_SHAPE_2x2x1 = 6,
+  CUBLASLT_CLUSTER_SHAPE_4x2x1 = 7,
+  CUBLASLT_CLUSTER_SHAPE_1x4x1 = 8,
+  CUBLASLT_CLUSTER_SHAPE_2x4x1 = 9,
+  CUBLASLT_CLUSTER_SHAPE_4x4x1 = 10,
+  CUBLASLT_CLUSTER_SHAPE_8x1x1 = 11,
+  CUBLASLT_CLUSTER_SHAPE_1x8x1 = 12,
+  CUBLASLT_CLUSTER_SHAPE_8x2x1 = 13,
+  CUBLASLT_CLUSTER_SHAPE_2x8x1 = 14,
+  CUBLASLT_CLUSTER_SHAPE_16x1x1 = 15,
+  CUBLASLT_CLUSTER_SHAPE_1x16x1 = 16,
+  CUBLASLT_CLUSTER_SHAPE_3x1x1 = 17,
+  CUBLASLT_CLUSTER_SHAPE_5x1x1 = 18,
+  CUBLASLT_CLUSTER_SHAPE_6x1x1 = 19,
+  CUBLASLT_CLUSTER_SHAPE_7x1x1 = 20,
+  CUBLASLT_CLUSTER_SHAPE_9x1x1 = 21,
+  CUBLASLT_CLUSTER_SHAPE_10x1x1 = 22,
+  CUBLASLT_CLUSTER_SHAPE_11x1x1 = 23,
+  CUBLASLT_CLUSTER_SHAPE_12x1x1 = 24,
+  CUBLASLT_CLUSTER_SHAPE_13x1x1 = 25,
+  CUBLASLT_CLUSTER_SHAPE_14x1x1 = 26,
+  CUBLASLT_CLUSTER_SHAPE_15x1x1 = 27,
+  CUBLASLT_CLUSTER_SHAPE_3x2x1 = 28,
+  CUBLASLT_CLUSTER_SHAPE_5x2x1 = 29,
+  CUBLASLT_CLUSTER_SHAPE_6x2x1 = 30,
+  CUBLASLT_CLUSTER_SHAPE_7x2x1 = 31,
+  CUBLASLT_CLUSTER_SHAPE_1x3x1 = 32,
+  CUBLASLT_CLUSTER_SHAPE_2x3x1 = 33,
+  CUBLASLT_CLUSTER_SHAPE_3x3x1 = 34,
+  CUBLASLT_CLUSTER_SHAPE_4x3x1 = 35,
+  CUBLASLT_CLUSTER_SHAPE_5x3x1 = 36,
+  CUBLASLT_CLUSTER_SHAPE_3x4x1 = 37,
+  CUBLASLT_CLUSTER_SHAPE_1x5x1 = 38,
+  CUBLASLT_CLUSTER_SHAPE_2x5x1 = 39,
+  CUBLASLT_CLUSTER_SHAPE_3x5x1 = 40,
+  CUBLASLT_CLUSTER_SHAPE_1x6x1 = 41,
+  CUBLASLT_CLUSTER_SHAPE_2x6x1 = 42,
+  CUBLASLT_CLUSTER_SHAPE_1x7x1 = 43,
+  CUBLASLT_CLUSTER_SHAPE_2x7x1 = 44,
+  CUBLASLT_CLUSTER_SHAPE_1x9x1 = 45,
+  CUBLASLT_CLUSTER_SHAPE_1x10x1 = 46,
+  CUBLASLT_CLUSTER_SHAPE_1x11x1 = 47,
+  CUBLASLT_CLUSTER_SHAPE_1x12x1 = 48,
+  CUBLASLT_CLUSTER_SHAPE_1x13x1 = 49,
+  CUBLASLT_CLUSTER_SHAPE_1x14x1 = 50,
+  CUBLASLT_CLUSTER_SHAPE_1x15x1 = 51,
+  CUBLASLT_CLUSTER_SHAPE_END
+} cublasLtClusterShape_t;
+
+/** Inner size of the kernel
+ *
+ * Represents various aspects of internal kernel design, that don't impact CUDA grid size but may have other more subtle
+ * effects.
+ *
+ * CUBLASLT_MATMUL_INNER_SHAPE_END has no explicit value, so it is one past the last inner shape ID (5).
+ */
+typedef enum {
+  CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED = 0,
+  CUBLASLT_MATMUL_INNER_SHAPE_MMA884 = 1,
+  CUBLASLT_MATMUL_INNER_SHAPE_MMA1684 = 2,
+  CUBLASLT_MATMUL_INNER_SHAPE_MMA1688 = 3,
+  CUBLASLT_MATMUL_INNER_SHAPE_MMA16816 = 4,
+  CUBLASLT_MATMUL_INNER_SHAPE_END
+} cublasLtMatmulInnerShape_t;
+
+/** Pointer mode to use for alpha/beta
+ *
+ * HOST and DEVICE reuse the values of the corresponding CUBLAS_POINTER_MODE_* constants; the remaining modes are
+ * specific to cuBLASLt and use the explicit values 2 to 4.
+ */
+typedef enum {
+  /** matches CUBLAS_POINTER_MODE_HOST, pointer targets a single value host memory */
+  CUBLASLT_POINTER_MODE_HOST = CUBLAS_POINTER_MODE_HOST,
+  /** matches CUBLAS_POINTER_MODE_DEVICE, pointer targets a single value device memory */
+  CUBLASLT_POINTER_MODE_DEVICE = CUBLAS_POINTER_MODE_DEVICE,
+  /** pointer targets an array in device memory */
+  CUBLASLT_POINTER_MODE_DEVICE_VECTOR = 2,
+  /** alpha pointer targets an array in device memory, beta is zero. Note:
+     CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE is not supported, must be 0. */
+  CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO = 3,
+  /** alpha pointer targets an array in device memory, beta is a single value in host memory. */
+  CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST = 4,
+} cublasLtPointerMode_t;
+
+/** Mask to define and query pointer mode capability
+ *
+ * Every non-zero mask is a distinct single bit (1, 2, 4, 8, 16). For the pointer modes with explicit values in
+ * cublasLtPointerMode_t (2, 3, 4) the mask equals 1 << mode.
+ * NOTE(review): presumably the masks are meant to be combined with bitwise OR -- confirm against the API reference.
+ */
+typedef enum {
+  /** no initial filtering is performed when querying pointer mode capabilities, will use gemm pointer mode defined in
+     operation description **/
+  CUBLASLT_POINTER_MODE_MASK_NO_FILTERING = 0,
+  /** see CUBLASLT_POINTER_MODE_HOST */
+  CUBLASLT_POINTER_MODE_MASK_HOST = 1,
+  /** see CUBLASLT_POINTER_MODE_DEVICE */
+  CUBLASLT_POINTER_MODE_MASK_DEVICE = 2,
+  /** see CUBLASLT_POINTER_MODE_DEVICE_VECTOR */
+  CUBLASLT_POINTER_MODE_MASK_DEVICE_VECTOR = 4,
+  /** see CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO */
+  CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_ZERO = 8,
+  /** see CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST */
+  CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_HOST = 16,
+} cublasLtPointerModeMask_t;
+
+/** Implementation details that may affect numerical behavior of algorithms.
+ *
+ * Layout of the 64-bit flag word (cublasLtNumericalImplFlags_t):
+ *   bits  0-7   operation type (FMA, HMMA, IMMA, DMMA), covered by OP_TYPE_MASK
+ *   bits  8-15  accumulator type, covered by ACCUMULATOR_TYPE_MASK
+ *   bits 16-23  input type, covered by OP_INPUT_TYPE_MASK
+ *   bit  32     GAUSSIAN
+ */
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_FMA (0x01ull << 0)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_HMMA (0x02ull << 0)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_IMMA (0x04ull << 0)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_DMMA (0x08ull << 0)
+/* 0xfe: every operation-type bit except bit 0 (FMA) */
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_TENSOR_OP_MASK (0xfeull << 0)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_OP_TYPE_MASK (0xffull << 0)
+
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_16F (0x01ull << 8)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_32F (0x02ull << 8)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_64F (0x04ull << 8)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_32I (0x08ull << 8)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_TYPE_MASK (0xffull << 8)
+
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_16F (0x01ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_16BF (0x02ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_TF32 (0x04ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_32F (0x08ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_64F (0x10ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8I (0x20ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8F_E4M3 (0x40ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8F_E5M2 (0x80ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_OP_INPUT_TYPE_MASK (0xffull << 16)
+
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_GAUSSIAN (0x01ull << 32)
+/* Integer type wide enough to hold any combination of the flags above. */
+typedef uint64_t cublasLtNumericalImplFlags_t;
+
+/** Execute matrix multiplication (D = alpha * op(A) * op(B) + beta * C).
+ *
+ * A, B and C are read-only inputs (const void*) and D receives the result; each data pointer is paired with a
+ * cublasLtMatrixLayout_t descriptor (Adesc, Bdesc, Cdesc, Ddesc). alpha and beta may be host or device pointers.
+ * workspace is a caller-provided buffer of workspaceSizeInBytes bytes.
+ *
+ * \retval     CUBLAS_STATUS_NOT_INITIALIZED   if cuBLASLt handle has not been initialized
+ * \retval     CUBLAS_STATUS_INVALID_VALUE     if parameters are in conflict or in an impossible configuration; e.g.
+ *                                             when workspaceSizeInBytes is less than workspace required by configured
+ *                                             algo
+ * \retval     CUBLAS_STATUS_NOT_SUPPORTED     if current implementation on selected device doesn't support configured
+ *                                             operation
+ * \retval     CUBLAS_STATUS_ARCH_MISMATCH     if configured operation cannot be run using selected device
+ * \retval     CUBLAS_STATUS_EXECUTION_FAILED  if cuda reported execution error from the device
+ * \retval     CUBLAS_STATUS_SUCCESS           if the operation completed successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmul(cublasLtHandle_t lightHandle,
+                                           cublasLtMatmulDesc_t computeDesc,
+                                           const void* alpha, /* host or device pointer */
+                                           const void* A,
+                                           cublasLtMatrixLayout_t Adesc,
+                                           const void* B,
+                                           cublasLtMatrixLayout_t Bdesc,
+                                           const void* beta, /* host or device pointer */
+                                           const void* C,
+                                           cublasLtMatrixLayout_t Cdesc,
+                                           void* D,
+                                           cublasLtMatrixLayout_t Ddesc,
+                                           const cublasLtMatmulAlgo_t* algo,
+                                           void* workspace,
+                                           size_t workspaceSizeInBytes,
+                                           cudaStream_t stream);
+
+/** Matrix layout conversion helper (C = alpha * op(A) + beta * op(B))
+ *
+ * Can be used to change memory order of data or to scale and shift the values.
+ *
+ * A and B are read-only inputs (const void*) and C receives the result; each data pointer is paired with a
+ * cublasLtMatrixLayout_t descriptor (Adesc, Bdesc, Cdesc). alpha and beta may be host or device pointers.
+ *
+ * \retval     CUBLAS_STATUS_NOT_INITIALIZED   if cuBLASLt handle has not been initialized
+ * \retval     CUBLAS_STATUS_INVALID_VALUE     if parameters are in conflict or in an impossible configuration; e.g.
+ *                                             when A is not NULL, but Adesc is NULL
+ * \retval     CUBLAS_STATUS_NOT_SUPPORTED     if current implementation on selected device doesn't support configured
+ *                                             operation
+ * \retval     CUBLAS_STATUS_ARCH_MISMATCH     if configured operation cannot be run using selected device
+ * \retval     CUBLAS_STATUS_EXECUTION_FAILED  if cuda reported execution error from the device
+ * \retval     CUBLAS_STATUS_SUCCESS           if the operation completed successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixTransform(cublasLtHandle_t lightHandle,
+                                                    cublasLtMatrixTransformDesc_t transformDesc,
+                                                    const void* alpha, /* host or device pointer */
+                                                    const void* A,
+                                                    cublasLtMatrixLayout_t Adesc,
+                                                    const void* beta, /* host or device pointer */
+                                                    const void* B,
+                                                    cublasLtMatrixLayout_t Bdesc,
+                                                    void* C,
+                                                    cublasLtMatrixLayout_t Cdesc,
+                                                    cudaStream_t stream);
+
+/* ---------------------------------------------------------------------------------------*/
+/* Helper functions for cublasLtMatrixLayout_t */
+/* ---------------------------------------------------------------------------------------*/
+
+/** Enum for data ordering
+ *
+ * For every order the leading dimension (ld) is a stride measured in elements; what it spans is described per value.
+ */
+typedef enum {
+  /** Column-major
+   *
+   * Leading dimension is the stride (in elements) to the beginning of next column in memory.
+   */
+  CUBLASLT_ORDER_COL = 0,
+  /** Row major
+   *
+   * Leading dimension is the stride (in elements) to the beginning of next row in memory.
+   */
+  CUBLASLT_ORDER_ROW = 1,
+  /** Column-major ordered tiles of 32 columns.
+   *
+   * Leading dimension is the stride (in elements) to the beginning of next group of 32-columns. E.g. if matrix has 33
+   * columns and 2 rows, ld must be at least (32) * 2 = 64.
+   */
+  CUBLASLT_ORDER_COL32 = 2,
+  /** Column-major ordered tiles of composite tiles with total 32 columns and 8 rows, tile composed of interleaved
+   * inner tiles of 4 columns within 4 even or odd rows in an alternating pattern.
+   *
+   * Leading dimension is the stride (in elements) to the beginning of the first 32 column x 8 row tile for the next
+   * 32-wide group of columns. E.g. if matrix has 33 columns and 1 row, ld must be at least (32 * 8) * 1 = 256.
+   */
+  CUBLASLT_ORDER_COL4_4R2_8C = 3,
+  /** Column-major ordered tiles of composite tiles with total 32 columns and 32 rows.
+   * Element offset within the tile is calculated as (((row%8)/2*4+row/8)*2+row%2)*32+col.
+   *
+   * Leading dimension is the stride (in elements) to the beginning of the first 32 column x 32 row tile for the next
+   * 32-wide group of columns. E.g. if matrix has 33 columns and 1 row, ld must be at least (32*32)*1 = 1024.
+   */
+  CUBLASLT_ORDER_COL32_2R_4R4 = 4,
+
+} cublasLtOrder_t;
+
+/** Attributes of memory layout */
+typedef enum {
+  /** Data type, see cudaDataType.
+   *
+   * uint32_t
+   */
+  CUBLASLT_MATRIX_LAYOUT_TYPE = 0,
+
+  /** Memory order of the data, see cublasLtOrder_t.
+   *
+   * int32_t, default: CUBLASLT_ORDER_COL
+   */
+  CUBLASLT_MATRIX_LAYOUT_ORDER = 1,
+
+  /** Number of rows.
+   *
+   * Usually only values that can be expressed as int32_t are supported.
+   *
+   * uint64_t
+   */
+  CUBLASLT_MATRIX_LAYOUT_ROWS = 2,
+
+  /** Number of columns.
+   *
+   * Usually only values that can be expressed as int32_t are supported.
+   *
+   * uint64_t
+   */
+  CUBLASLT_MATRIX_LAYOUT_COLS = 3,
+
+  /** Matrix leading dimension.
+   *
+   * For CUBLASLT_ORDER_COL this is stride (in elements) of matrix column, for more details and documentation for
+   * other memory orders see documentation for cublasLtOrder_t values.
+   *
+   * Currently only non-negative values are supported, must be large enough so that matrix memory locations are not
+   * overlapping (e.g. greater than or equal to CUBLASLT_MATRIX_LAYOUT_ROWS in case of CUBLASLT_ORDER_COL).
+   *
+   * int64_t
+   */
+  CUBLASLT_MATRIX_LAYOUT_LD = 4,
+
+  /** Number of matmul operations to perform in the batch.
+   *
+   * See also CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT
+   *
+   * int32_t, default: 1
+   */
+  CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT = 5,
+
+  /** Stride (in elements) to the next matrix for strided batch operation.
+   *
+   * When matrix type is planar-complex (CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET != 0), batch stride
+   * is interpreted by cublasLtMatmul() in number of real valued sub-elements. E.g. for data of type CUDA_C_16F,
+   * offset of 1024B is encoded as a stride of value 512 (since each element of the real and imaginary matrices
+   * is a 2B (16bit) floating point type).
+   *
+   * NOTE: A bug in cublasLtMatrixTransform() causes it to interpret the batch stride for a planar-complex matrix
+   * as if it was specified in number of complex elements. Therefore an offset of 1024B must be encoded as stride
+   * value 256 when calling cublasLtMatrixTransform() (each complex element is 4B with real and imaginary values 2B
+   * each). This behavior is expected to be corrected in the next major cuBLAS version.
+   *
+   * int64_t, default: 0
+   */
+  CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET = 6,
+
+  /** Stride (in bytes) to the imaginary plane for planar complex layout.
+   *
+   * int64_t, default: 0 - 0 means that layout is regular (real and imaginary parts of complex numbers are interleaved
+   * in memory in each element)
+   */
+  CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET = 7,
+} cublasLtMatrixLayoutAttribute_t;
+
+/** Internal. Do not use directly; call cublasLtMatrixLayoutInit(), which supplies the descriptor size.
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutInit_internal(  //
+    cublasLtMatrixLayout_t matLayout,
+    size_t size,
+    cudaDataType type,
+    uint64_t rows,
+    uint64_t cols,
+    int64_t ld);
+
+/** Initialize matrix layout descriptor in pre-allocated space.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if size of the pre-allocated space is insufficient
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was initialized successfully
+ */
+static inline cublasStatus_t cublasLtMatrixLayoutInit(
+    cublasLtMatrixLayout_t matLayout, cudaDataType type, uint64_t rows, uint64_t cols, int64_t ld) {
+  return cublasLtMatrixLayoutInit_internal(matLayout, sizeof(*matLayout), type, rows, cols, ld);
+}
+
+/** Create new matrix layout descriptor.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if memory could not be allocated
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was created successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutCreate(  //
+    cublasLtMatrixLayout_t* matLayout,
+    cudaDataType type,
+    uint64_t rows,
+    uint64_t cols,
+    int64_t ld);
+
+/** Destroy matrix layout descriptor.
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS  if the operation was successful
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutDestroy(cublasLtMatrixLayout_t matLayout);
+
+/** Set matrix layout descriptor attribute.
+ *
+ * \param[in]  matLayout    The descriptor
+ * \param[in]  attr         The attribute
+ * \param[in]  buf          memory address containing the new value
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match the size of internal storage
+ *                                          for the selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutSetAttribute(  //
+    cublasLtMatrixLayout_t matLayout,
+    cublasLtMatrixLayoutAttribute_t attr,
+    const void* buf,
+    size_t sizeInBytes);
+
+/** Get matrix layout descriptor attribute.
+ *
+ * \param[in]  matLayout    The descriptor
+ * \param[in]  attr         The attribute
+ * \param[out] buf          memory address to which the attribute value is written
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
+ *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero
+ *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutGetAttribute(  //
+    cublasLtMatrixLayout_t matLayout,
+    cublasLtMatrixLayoutAttribute_t attr,
+    void* buf,
+    size_t sizeInBytes,
+    size_t* sizeWritten);
+
+/* ---------------------------------------------------------------------------------------*/
+/* Helper functions for cublasLtMatmulDesc_t */
+/* ---------------------------------------------------------------------------------------*/
+
+/** Matmul descriptor attributes to define details of the operation. */
+typedef enum {
+  /** Compute type, see cublasComputeType_t. Defines data type used for multiply and accumulate operations and
+   * the accumulator during matrix multiplication.
+   *
+   * int32_t
+   */
+  CUBLASLT_MATMUL_DESC_COMPUTE_TYPE = 0,
+
+  /** Scale type, see cudaDataType. Defines data type of alpha and beta. Accumulator and value from matrix C are
+   * typically converted to scale type before final scaling. Value is then converted from scale type to type of matrix
+   * D before being stored in memory.
+   *
+   * int32_t, default: same as CUBLASLT_MATMUL_DESC_COMPUTE_TYPE
+   */
+  CUBLASLT_MATMUL_DESC_SCALE_TYPE = 1,
+
+  /** Pointer mode of alpha and beta, see cublasLtPointerMode_t. When CUBLASLT_POINTER_MODE_DEVICE_VECTOR is in use,
+   * alpha/beta vector lengths must match number of output matrix rows.
+   *
+   * int32_t, default: CUBLASLT_POINTER_MODE_HOST
+   */
+  CUBLASLT_MATMUL_DESC_POINTER_MODE = 2,
+
+  /** Transform of matrix A, see cublasOperation_t.
+   *
+   * int32_t, default: CUBLAS_OP_N
+   */
+  CUBLASLT_MATMUL_DESC_TRANSA = 3,
+
+  /** Transform of matrix B, see cublasOperation_t.
+   *
+   * int32_t, default: CUBLAS_OP_N
+   */
+  CUBLASLT_MATMUL_DESC_TRANSB = 4,
+
+  /** Transform of matrix C, see cublasOperation_t.
+   *
+   * Currently only CUBLAS_OP_N is supported.
+   *
+   * int32_t, default: CUBLAS_OP_N
+   */
+  CUBLASLT_MATMUL_DESC_TRANSC = 5,
+
+  /** Matrix fill mode, see cublasFillMode_t.
+   *
+   * int32_t, default: CUBLAS_FILL_MODE_FULL
+   */
+  CUBLASLT_MATMUL_DESC_FILL_MODE = 6,
+
+  /** Epilogue function, see cublasLtEpilogue_t.
+   *
+   * uint32_t, default: CUBLASLT_EPILOGUE_DEFAULT
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE = 7,
+
+  /** Bias or bias gradient vector pointer in the device memory.
+   *
+   * Bias case. See CUBLASLT_EPILOGUE_BIAS.
+   * For bias data type see CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE.
+   *
+   * Bias vector length must match matrix D rows count.
+   *
+   * Bias gradient case. See CUBLASLT_EPILOGUE_DRELU_BGRAD and CUBLASLT_EPILOGUE_DGELU_BGRAD.
+   * Bias gradient vector elements are the same type as the output elements
+   * (Ctype) with the exception of IMMA kernels (see above).
+   *
+   * Routines that don't dereference this pointer, like cublasLtMatmulAlgoGetHeuristic()
+   * depend on its value to determine expected pointer alignment.
+   *
+   * Bias case: const void *, default: NULL
+   * Bias gradient case: void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_BIAS_POINTER = 8,
+
+  /** Batch stride for bias or bias gradient vector.
+   *
+   * Used together with CUBLASLT_MATMUL_DESC_BIAS_POINTER when matrix D's CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT > 1.
+   *
+   * int64_t, default: 0
+   */
+  CUBLASLT_MATMUL_DESC_BIAS_BATCH_STRIDE = 10,
+
+  /** Pointer for epilogue auxiliary buffer.
+   *
+   * - Output vector for ReLu bit-mask in forward pass when CUBLASLT_EPILOGUE_RELU_AUX
+   *   or CUBLASLT_EPILOGUE_RELU_AUX_BIAS epilogue is used.
+   * - Input vector for ReLu bit-mask in backward pass when
+   *   CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is used.
+   *
+   * - Output of GELU input matrix in forward pass when
+   *   CUBLASLT_EPILOGUE_GELU_AUX_BIAS epilogue is used.
+   * - Input of GELU input matrix for backward pass when
+   *   CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue is used.
+   *
+   * For aux data type see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE.
+   *
+   * Routines that don't dereference this pointer, like cublasLtMatmulAlgoGetHeuristic()
+   * depend on its value to determine expected pointer alignment.
+   *
+   * Requires setting CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD attribute.
+   *
+   * Forward pass: void *, default: NULL
+   * Backward pass: const void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER = 11,
+
+  /** Leading dimension for epilogue auxiliary buffer.
+   *
+   * - ReLu bit-mask matrix leading dimension in elements (i.e. bits)
+   *   when CUBLASLT_EPILOGUE_RELU_AUX, CUBLASLT_EPILOGUE_RELU_AUX_BIAS or CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is
+   * used. Must be divisible by 128 and be no less than the number of rows in the output matrix.
+   *
+   * - GELU input matrix leading dimension in elements
+   *   when CUBLASLT_EPILOGUE_GELU_AUX_BIAS or CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue used.
+   *   Must be divisible by 8 and be no less than the number of rows in the output matrix.
+   *
+   * int64_t, default: 0
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD = 12,
+
+  /** Batch stride for epilogue auxiliary buffer.
+   *
+   * - ReLu bit-mask matrix batch stride in elements (i.e. bits)
+   *   when CUBLASLT_EPILOGUE_RELU_AUX, CUBLASLT_EPILOGUE_RELU_AUX_BIAS or CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is
+   * used. Must be divisible by 128.
+   *
+   * - GELU input matrix batch stride in elements
+   *   when CUBLASLT_EPILOGUE_GELU_AUX_BIAS or CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue used.
+   *   Must be divisible by 8.
+   *
+   * int64_t, default: 0
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_BATCH_STRIDE = 13,
+
+  /** Batch stride for alpha vector.
+   *
+   * Used together with CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST when matrix D's
+   * CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT > 1. If CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO is set then
+   * CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE must be set to 0 as this mode doesn't support batched alpha vector.
+   *
+   * int64_t, default: 0
+   */
+  CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE = 14,
+
+  /** Number of SMs to target for parallel execution. Optimizes heuristics for execution on a different number of SMs
+   *  when user expects a concurrent stream to be using some of the device resources.
+   *
+   *  int32_t, default: 0 - use the number reported by the device.
+   */
+  CUBLASLT_MATMUL_DESC_SM_COUNT_TARGET = 15,
+
+  /** Device pointer to the scale factor value that converts data in matrix A to the compute data type range.
+   *
+   *  The scaling factor value must have the same type as the compute type.
+   *
+   *  If not specified, or set to NULL, the scaling factor is assumed to be 1.
+   *
+   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
+   *  will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  const void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_A_SCALE_POINTER = 17,
+
+  /** Device pointer to the scale factor value to convert data in matrix B to compute data type range.
+   *
+   *  The scaling factor value must have the same type as the compute type.
+   *
+   *  If not specified, or set to NULL, the scaling factor is assumed to be 1.
+   *
+   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
+   *  will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  const void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_B_SCALE_POINTER = 18,
+
+  /** Device pointer to the scale factor value to convert data in matrix C to compute data type range.
+   *
+   *  The scaling factor value must have the same type as the compute type.
+   *
+   *  If not specified, or set to NULL, the scaling factor is assumed to be 1.
+   *
+   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
+   *  will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  const void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_C_SCALE_POINTER = 19,
+
+  /** Device pointer to the scale factor value to convert data in matrix D to compute data type range.
+   *
+   *  The scaling factor value must have the same type as the compute type.
+   *
+   *  If not specified, or set to NULL, the scaling factor is assumed to be 1.
+   *
+   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
+   *  will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  const void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_D_SCALE_POINTER = 20,
+
+  /** Device pointer to the memory location that on completion will be set to the maximum of absolute values in the
+   *  output matrix.
+   *
+   *  The computed value has the same type as the compute type.
+   *
+   *  If not specified or set to NULL, the maximum absolute value is not computed. If set for an unsupported matrix
+   *  data, scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_AMAX_D_POINTER = 21,
+
+  /** Type of the data to be stored to the memory pointed to by CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   *
+   *  If unset, the data type defaults to the type of elements of the output matrix with some exceptions, see details
+   * below.
+   *
+   *  ReLu uses a bit-mask.
+   *
+   *  GELU input matrix elements type is the same as the type of elements of
+   *  the output matrix with some exceptions, see details below.
+   *
+   *  For fp8 kernels with output type CUDA_R_8F_E4M3 the aux data type can be CUDA_R_8F_E4M3 or CUDA_R_16F with some
+   *  restrictions.  See https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulDescAttributes_t for more details.
+   *
+   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
+   *  will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  int32_t based on cudaDataType, default: -1
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE = 22,
+
+  /** Device pointer to the scaling factor value to convert results from compute type data range to storage
+   *  data range in the auxiliary matrix that is set via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   *
+   *  The scaling factor value must have the same type as the compute type.
+   *
+   *  If not specified, or set to NULL, the scaling factor is assumed to be 1. If set for an unsupported matrix data,
+   *  scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_POINTER = 23,
+
+  /** Device pointer to the memory location that on completion will be set to the maximum of absolute values in the
+   *  buffer that is set via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   *
+   *  The computed value has the same type as the compute type.
+   *
+   *  If not specified or set to NULL, the maximum absolute value is not computed. If set for an unsupported matrix
+   *  data, scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_AMAX_POINTER = 24,
+
+  /** Flag for managing fp8 fast accumulation mode.
+   *  When enabled, problem execution might be faster but at the cost of lower accuracy because intermediate results
+   *  will not periodically be promoted to a higher precision.
+   *
+   *  int8_t, default: 0 - fast accumulation mode is disabled.
+   */
+  CUBLASLT_MATMUL_DESC_FAST_ACCUM = 25,
+
+  /** Type of bias or bias gradient vector in the device memory.
+   *
+   * Bias case: see CUBLASLT_EPILOGUE_BIAS.
+   *
+   * Bias vector elements are the same type as the elements of output matrix (Dtype) with the following exceptions:
+   * - IMMA kernels with computeType=CUDA_R_32I and Ctype=CUDA_R_8I where the bias vector elements
+   *   are the same type as alpha, beta (CUBLASLT_MATMUL_DESC_SCALE_TYPE=CUDA_R_32F)
+   * - fp8 kernels with an output type of CUDA_R_32F, CUDA_R_8F_E4M3 or CUDA_R_8F_E5M2, See
+   *   https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmul for details.
+   *
+   * int32_t based on cudaDataType, default: -1
+   */
+  CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE = 26,
+} cublasLtMatmulDescAttributes_t;
+
+/** Internal. Do not use directly; call cublasLtMatmulDescInit(), which supplies the descriptor size.
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulDescInit_internal(  //
+    cublasLtMatmulDesc_t matmulDesc,
+    size_t size,
+    cublasComputeType_t computeType,
+    cudaDataType_t scaleType);
+
+/** Initialize matmul operation descriptor in pre-allocated space.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if size of the pre-allocated space is insufficient
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was initialized successfully
+ */
+static inline cublasStatus_t cublasLtMatmulDescInit(  //
+    cublasLtMatmulDesc_t matmulDesc,
+    cublasComputeType_t computeType,
+    cudaDataType_t scaleType) {
+  return cublasLtMatmulDescInit_internal(matmulDesc, sizeof(*matmulDesc), computeType, scaleType);
+}
+
+/** Create new matmul operation descriptor.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if memory could not be allocated
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was created successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulDescCreate(cublasLtMatmulDesc_t* matmulDesc,
+                                                     cublasComputeType_t computeType,
+                                                     cudaDataType_t scaleType);
+
+/** Destroy matmul operation descriptor.
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS  if the operation was successful
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulDescDestroy(cublasLtMatmulDesc_t matmulDesc);
+
+/** Set matmul operation descriptor attribute.
+ *
+ * \param[in]  matmulDesc   The descriptor
+ * \param[in]  attr         The attribute
+ * \param[in]  buf          memory address containing the new value
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match the size of internal storage
+ *                                          for the selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulDescSetAttribute(  //
+    cublasLtMatmulDesc_t matmulDesc,
+    cublasLtMatmulDescAttributes_t attr,
+    const void* buf,
+    size_t sizeInBytes);
+
+/** Get matmul operation descriptor attribute.
+ *
+ * \param[in]  matmulDesc   The descriptor
+ * \param[in]  attr         The attribute
+ * \param[out] buf          memory address to which the attribute value is written
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
+ *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero
+ *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulDescGetAttribute(  //
+    cublasLtMatmulDesc_t matmulDesc,
+    cublasLtMatmulDescAttributes_t attr,
+    void* buf,
+    size_t sizeInBytes,
+    size_t* sizeWritten);
+
+/* ---------------------------------------------------------------------------------------*/
+/* Helper functions for cublasLtMatrixTransformDesc_t */
+/* ---------------------------------------------------------------------------------------*/
+
+/** Matrix transform descriptor attributes to define details of the operation.
+ */
+typedef enum {
+  /** Scale type, see cudaDataType. Inputs are converted to scale type for scaling and summation, and results are
+   * then converted to output type to store in memory.
+   *
+   * int32_t
+   */
+  CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
+
+  /** Pointer mode of alpha and beta, see cublasLtPointerMode_t.
+   *
+   * int32_t, default: CUBLASLT_POINTER_MODE_HOST
+   */
+  CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
+
+  /** Transform of matrix A, see cublasOperation_t.
+   *
+   * int32_t, default: CUBLAS_OP_N
+   */
+  CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA,
+
+  /** Transform of matrix B, see cublasOperation_t.
+   *
+   * int32_t, default: CUBLAS_OP_N
+   */
+  CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSB,
+} cublasLtMatrixTransformDescAttributes_t;
+
+/** Internal. Do not use directly; call cublasLtMatrixTransformDescInit(), which supplies the descriptor size.
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescInit_internal(cublasLtMatrixTransformDesc_t transformDesc,
+                                                                     size_t size,
+                                                                     cudaDataType scaleType);
+
+/** Initialize matrix transform operation descriptor in pre-allocated space.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if size of the pre-allocated space is insufficient
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was initialized successfully
+ */
+static inline cublasStatus_t cublasLtMatrixTransformDescInit(cublasLtMatrixTransformDesc_t transformDesc,
+                                                             cudaDataType scaleType) {
+  return cublasLtMatrixTransformDescInit_internal(transformDesc, sizeof(*transformDesc), scaleType);
+}
+
+/** Create new matrix transform operation descriptor.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if memory could not be allocated
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was created successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescCreate(cublasLtMatrixTransformDesc_t* transformDesc,
+                                                              cudaDataType scaleType);
+
+/** Destroy matrix transform operation descriptor.
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS  if the operation was successful
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescDestroy(cublasLtMatrixTransformDesc_t transformDesc);
+
+/** Set matrix transform operation descriptor attribute.
+ *
+ * \param[in]  transformDesc  The descriptor
+ * \param[in]  attr           The attribute
+ * \param[in]  buf            memory address containing the new value
+ * \param[in]  sizeInBytes    size of buf buffer for verification (in bytes)
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match the size of internal storage
+ *                                          for the selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescSetAttribute(  //
+    cublasLtMatrixTransformDesc_t transformDesc,
+    cublasLtMatrixTransformDescAttributes_t attr,
+    const void* buf,
+    size_t sizeInBytes);
+
+/** Get matrix transform operation descriptor attribute.
+ *
+ * \param[in]  transformDesc  The descriptor
+ * \param[in]  attr           The attribute
+ * \param[out] buf            memory address to which the attribute value is written
+ * \param[in]  sizeInBytes    size of buf buffer for verification (in bytes)
+ * \param[out] sizeWritten    only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number
+ * of bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero
+ *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescGetAttribute(  //
+    cublasLtMatrixTransformDesc_t transformDesc,
+    cublasLtMatrixTransformDescAttributes_t attr,
+    void* buf,
+    size_t sizeInBytes,
+    size_t* sizeWritten);
+
+/** For computation with complex numbers, this enum controls whether the Gauss complexity reduction algorithm may be applied
+ */
+typedef enum {
+  CUBLASLT_3M_MODE_DISALLOWED = 0,
+  CUBLASLT_3M_MODE_ALLOWED = 1,
+} cublasLt3mMode_t;
+
+/** Reduction scheme for portions of the dot-product calculated in parallel (a.k.a. "split-K").
+ */
+typedef enum {
+  /** No reduction scheme, dot-product shall be performed in one sequence.
+   */
+  CUBLASLT_REDUCTION_SCHEME_NONE = 0,
+
+  /** Reduction is performed "in place" - using the output buffer (and output data type) and counters (in workspace) to
+   * guarantee the sequentiality.
+   */
+  CUBLASLT_REDUCTION_SCHEME_INPLACE = 1,
+
+  /** Intermediate results are stored in compute type in the workspace and reduced in a separate step.
+   */
+  CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE = 2,
+
+  /** Intermediate results are stored in output type in the workspace and reduced in a separate step.
+   */
+  CUBLASLT_REDUCTION_SCHEME_OUTPUT_TYPE = 4,
+
+  CUBLASLT_REDUCTION_SCHEME_MASK = 0x7, /* bitwise OR of INPLACE (1), COMPUTE_TYPE (2) and OUTPUT_TYPE (4) */
+} cublasLtReductionScheme_t;
+
+/** Postprocessing options for the epilogue
+ */
+typedef enum {
+  /** No special postprocessing, just scale and quantize results if necessary.
+   */
+  CUBLASLT_EPILOGUE_DEFAULT = 1,
+
+  /** ReLu, apply ReLu point-wise transform to the results (x:=max(x, 0)).
+   */
+  CUBLASLT_EPILOGUE_RELU = 2,
+
+  /** ReLu, apply ReLu point-wise transform to the results (x:=max(x, 0)).
+   *
+   * This epilogue mode produces an extra output, a ReLu bit-mask matrix,
+   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_RELU_AUX = (CUBLASLT_EPILOGUE_RELU | 128),
+
+  /** Bias, apply (broadcasted) Bias from bias vector. Bias vector length must match matrix D rows, it must be packed
+   * (stride between vector elements is 1). Bias vector is broadcasted to all columns and added before applying final
+   * postprocessing.
+   */
+  CUBLASLT_EPILOGUE_BIAS = 4,
+
+  /** ReLu and Bias, apply Bias and then ReLu transform
+   */
+  CUBLASLT_EPILOGUE_RELU_BIAS = (CUBLASLT_EPILOGUE_RELU | CUBLASLT_EPILOGUE_BIAS),
+
+  /** ReLu and Bias, apply Bias and then ReLu transform
+   *
+   * This epilogue mode produces an extra output, a ReLu bit-mask matrix,
+   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_RELU_AUX_BIAS = (CUBLASLT_EPILOGUE_RELU_AUX | CUBLASLT_EPILOGUE_BIAS),
+
+  /** ReLu gradient. Apply ReLu gradient to matmul output. Store ReLu gradient in the output matrix.
+   *
+   * This epilogue mode requires an extra input,
+   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_DRELU = 8 | 128,
+
+  /** ReLu and Bias gradients. Apply independently ReLu and Bias gradient to
+   * matmul output. Store ReLu gradient in the output matrix, and Bias gradient
+   * in the auxiliary output (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
+   *
+   * This epilogue mode requires an extra input,
+   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_DRELU_BGRAD = CUBLASLT_EPILOGUE_DRELU | 16,
+
+  /** GELU, apply GELU point-wise transform to the results (x:=GELU(x)).
+   */
+  CUBLASLT_EPILOGUE_GELU = 32,
+
+  /** GELU, apply GELU point-wise transform to the results (x:=GELU(x)).
+   *
+   * This epilogue mode outputs GELU input as a separate matrix (useful for training).
+   * See CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_GELU_AUX = (CUBLASLT_EPILOGUE_GELU | 128),
+
+  /** GELU and Bias, apply Bias and then GELU transform
+   */
+  CUBLASLT_EPILOGUE_GELU_BIAS = (CUBLASLT_EPILOGUE_GELU | CUBLASLT_EPILOGUE_BIAS),
+
+  /** GELU and Bias, apply Bias and then GELU transform
+   *
+   * This epilogue mode outputs GELU input as a separate matrix (useful for training).
+   * See CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_GELU_AUX_BIAS = (CUBLASLT_EPILOGUE_GELU_AUX | CUBLASLT_EPILOGUE_BIAS),
+
+  /** GELU gradient. Apply GELU gradient to matmul output. Store GELU gradient in the output matrix.
+   *
+   * This epilogue mode requires an extra input,
+   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_DGELU = 64 | 128,
+
+  /** GELU and Bias gradients. Apply independently GELU and Bias gradient to
+   * matmul output. Store GELU gradient in the output matrix, and Bias gradient
+   * in the auxiliary output (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
+   *
+   * This epilogue mode requires an extra input,
+   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_DGELU_BGRAD = CUBLASLT_EPILOGUE_DGELU | 16,
+
+  /** Bias gradient based on the input matrix A.
+   *
+   * The bias size corresponds to the number of rows of the matrix D.
+   * The reduction happens over the GEMM's "k" dimension.
+   *
+   * Stores Bias gradient in the auxiliary output
+   * (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
+   */
+  CUBLASLT_EPILOGUE_BGRADA = 256,
+
+  /** Bias gradient based on the input matrix B.
+   *
+   * The bias size corresponds to the number of columns of the matrix D.
+   * The reduction happens over the GEMM's "k" dimension.
+   *
+   * Stores Bias gradient in the auxiliary output
+   * (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
+   */
+  CUBLASLT_EPILOGUE_BGRADB = 512,
+} cublasLtEpilogue_t;
+
+/** Matmul heuristic search mode
+ */
+typedef enum {
+  /** Ask heuristics for the best algo for the given use case
+   */
+  CUBLASLT_SEARCH_BEST_FIT = 0,
+  /** Only try to find the best config for a preconfigured algo id
+   */
+  CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID = 1,
+  /** Reserved for future use
+   */
+  CUBLASLT_SEARCH_RESERVED_02 = 2,
+  /** Reserved for future use
+   */
+  CUBLASLT_SEARCH_RESERVED_03 = 3,
+  /** Reserved for future use
+   */
+  CUBLASLT_SEARCH_RESERVED_04 = 4,
+  /** Reserved for future use
+   */
+  CUBLASLT_SEARCH_RESERVED_05 = 5,
+} cublasLtMatmulSearch_t;
+
+/** Algo search preference to fine tune the heuristic function.
+ *
+ * Attributes are written with cublasLtMatmulPreferenceSetAttribute() and read back with
+ * cublasLtMatmulPreferenceGetAttribute(). Each entry below lists the value type and default of the attribute.
+ */
+typedef enum {
+  /** Search mode, see cublasLtMatmulSearch_t.
+   *
+   * uint32_t, default: CUBLASLT_SEARCH_BEST_FIT
+   */
+  CUBLASLT_MATMUL_PREF_SEARCH_MODE = 0,
+
+  /** Maximum allowed workspace size in bytes.
+   *
+   * uint64_t, default: 0 - no workspace allowed
+   */
+  CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES = 1,
+
+  /** Math mode mask, see cublasMath_t.
+   *
+   * Only algorithms with CUBLASLT_ALGO_CAP_MATHMODE_IMPL that is not masked out by this attribute are allowed.
+   *
+   * uint32_t, default: 1 (allows both default and tensor op math)
+   * DEPRECATED, will be removed in a future release, see cublasLtNumericalImplFlags_t for replacement
+   */
+  CUBLASLT_MATMUL_PREF_MATH_MODE_MASK = 2,
+
+  /** Reduction scheme mask, see cublasLtReductionScheme_t. Filters heuristic result to only include algo configs that
+   * use one of the required modes.
+   *
+   * E.g. mask value of 0x03 will allow only INPLACE and COMPUTE_TYPE reduction schemes.
+   *
+   * uint32_t, default: CUBLASLT_REDUCTION_SCHEME_MASK (allows all reduction schemes)
+   */
+  CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK = 3,
+
+  /** Gaussian mode mask, see cublasLt3mMode_t.
+   *
+   * Only algorithms with CUBLASLT_ALGO_CAP_GAUSSIAN_IMPL that is not masked out by this attribute are allowed.
+   *
+   * uint32_t, default: CUBLASLT_3M_MODE_ALLOWED (allows both gaussian and non-gaussian algorithms)
+   * DEPRECATED, will be removed in a future release, see cublasLtNumericalImplFlags_t for replacement
+   */
+  CUBLASLT_MATMUL_PREF_GAUSSIAN_MODE_MASK = 4,
+
+  /** Minimum buffer alignment for matrix A (in bytes).
+   *
+   * Selecting a smaller value will exclude algorithms that cannot work with matrix A that is not as strictly aligned
+   * as they need.
+   *
+   * uint32_t, default: 256
+   */
+  CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES = 5,
+
+  /** Minimum buffer alignment for matrix B (in bytes).
+   *
+   * Selecting a smaller value will exclude algorithms that cannot work with matrix B that is not as strictly aligned
+   * as they need.
+   *
+   * uint32_t, default: 256
+   */
+  CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES = 6,
+
+  /** Minimum buffer alignment for matrix C (in bytes).
+   *
+   * Selecting a smaller value will exclude algorithms that cannot work with matrix C that is not as strictly aligned
+   * as they need.
+   *
+   * uint32_t, default: 256
+   */
+  CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES = 7,
+
+  /** Minimum buffer alignment for matrix D (in bytes).
+   *
+   * Selecting a smaller value will exclude algorithms that cannot work with matrix D that is not as strictly aligned
+   * as they need.
+   *
+   * uint32_t, default: 256
+   */
+  CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES = 8,
+
+  /** Maximum wave count.
+   *
+   * See cublasLtMatmulHeuristicResult_t::wavesCount.
+   *
+   * Selecting a non-zero value will exclude algorithms that report device utilization higher than specified.
+   *
+   * float, default: 0.0f
+   */
+  CUBLASLT_MATMUL_PREF_MAX_WAVES_COUNT = 9,
+
+  /** Pointer mode mask, see cublasLtPointerModeMask_t. Filters heuristic result to only include algorithms that support
+   * all required modes.
+   *
+   * uint32_t, default: (CUBLASLT_POINTER_MODE_MASK_HOST | CUBLASLT_POINTER_MODE_MASK_DEVICE) (only allows algorithms
+   * that support both regular host and device pointers)
+   */
+  CUBLASLT_MATMUL_PREF_POINTER_MODE_MASK = 10,
+
+  /** Epilogue selector mask, see cublasLtEpilogue_t. Filters heuristic result to only include algorithms that support
+   * all required operations.
+   *
+   * uint32_t, default: CUBLASLT_EPILOGUE_DEFAULT (only allows algorithms that support default epilogue)
+   */
+  CUBLASLT_MATMUL_PREF_EPILOGUE_MASK = 11,
+
+  /** Numerical implementation details mask, see cublasLtNumericalImplFlags_t. Filters heuristic result to only include
+   * algorithms that use the allowed implementations.
+   *
+   * uint64_t, default: uint64_t(-1) (allow everything)
+   */
+  CUBLASLT_MATMUL_PREF_IMPL_MASK = 12,
+
+  /** Number of SMs to target for parallel execution. Optimizes heuristics for execution on a different number of SMs
+   * when user expects a concurrent stream to be using some of the device resources.
+   *
+   * Overrides the SM count target set in the matrix multiplication descriptor (see cublasLtMatmulDescAttributes_t).
+   *
+   * int32_t, default: 0 - use the number reported by the device.
+   * DEPRECATED, will be removed in a future release, see cublasLtMatmulDescAttributes_t for replacement
+   */
+  CUBLASLT_MATMUL_PREF_SM_COUNT_TARGET = 13,
+} cublasLtMatmulPreferenceAttributes_t;
+
+/** Internal. Do not use directly.
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceInit_internal(cublasLtMatmulPreference_t pref, size_t size);
+
+/** Initialize matmul heuristic search preference descriptor in pre-allocated space.
+ *
+ * Thin wrapper that forwards to cublasLtMatmulPreferenceInit_internal(), supplying sizeof(*pref) as the size of the
+ * pre-allocated space.
+ *
+ * \param[in]  pref  the pre-allocated descriptor to initialize
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if size of the pre-allocated space is insufficient
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was created successfully
+ */
+static inline cublasStatus_t cublasLtMatmulPreferenceInit(cublasLtMatmulPreference_t pref) {
+  /* Size of the storage the caller's handle points at, as seen by this header. */
+  const size_t prefStorageSize = sizeof(*pref);
+  return cublasLtMatmulPreferenceInit_internal(pref, prefStorageSize);
+}
+
+/** Create new matmul heuristic search preference descriptor.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if memory could not be allocated
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was created successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceCreate(cublasLtMatmulPreference_t* pref);
+
+/** Destroy matmul heuristic search preference descriptor.
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS  if operation was successful
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceDestroy(cublasLtMatmulPreference_t pref);
+
+/** Set matmul heuristic search preference descriptor attribute.
+ *
+ * \param[in]  pref         The descriptor
+ * \param[in]  attr         The attribute
+ * \param[in]  buf          memory address containing the new value
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceSetAttribute(  //
+    cublasLtMatmulPreference_t pref,
+    cublasLtMatmulPreferenceAttributes_t attr,
+    const void* buf,
+    size_t sizeInBytes);
+
+/** Get matmul heuristic search preference descriptor attribute.
+ *
+ * \param[in]  pref         The descriptor
+ * \param[in]  attr         The attribute
+ * \param[out] buf          memory address to write the attribute value to
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
+ *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
+ *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceGetAttribute(  //
+    cublasLtMatmulPreference_t pref,
+    cublasLtMatmulPreferenceAttributes_t attr,
+    void* buf,
+    size_t sizeInBytes,
+    size_t* sizeWritten);
+
+/** Results structure used by cublasLtMatmulAlgoGetHeuristic() and cublasLtMatmulAlgoCheck().
+ *
+ * Holds returned configured algo descriptor and its runtime properties.
+ */
+typedef struct {
+  /** Matmul algorithm descriptor.
+   *
+   * Must be initialized with cublasLtMatmulAlgoInit() if preferences' CUBLASLT_MATMUL_PREF_SEARCH_MODE is set to
+   * CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID
+   */
+  cublasLtMatmulAlgo_t algo;
+
+  /** Actual size of workspace memory required.
+   */
+  size_t workspaceSize;
+
+  /** Result status, other fields are only valid if after call to cublasLtMatmulAlgoGetHeuristic() this member is set to
+   * CUBLAS_STATUS_SUCCESS.
+   */
+  cublasStatus_t state;
+
+  /** Waves count - a device utilization metric.
+   *
+   * wavesCount value of 1.0f suggests that when kernel is launched it will fully occupy the GPU.
+   */
+  float wavesCount;
+
+  /** Reserved; this header documents no meaning for these values.
+   */
+  int reserved[4];
+} cublasLtMatmulHeuristicResult_t;
+
+/** Query cublasLt heuristic for algorithm appropriate for given use case.
+ *
+ * \param[in]      lightHandle            Pointer to the allocated cuBLASLt handle for the cuBLASLt
+ *                                        context. See cublasLtHandle_t.
+ * \param[in]      operationDesc          Handle to the matrix multiplication descriptor.
+ * \param[in]      Adesc                  Handle to the layout descriptors for matrix A.
+ * \param[in]      Bdesc                  Handle to the layout descriptors for matrix B.
+ * \param[in]      Cdesc                  Handle to the layout descriptors for matrix C.
+ * \param[in]      Ddesc                  Handle to the layout descriptors for matrix D.
+ * \param[in]      preference             Pointer to the structure holding the heuristic search
+ *                                        preferences descriptor. See cublasLtMatmulPreference_t.
+ * \param[in]      requestedAlgoCount     Size of heuristicResultsArray (in elements) and requested
+ *                                        maximum number of algorithms to return.
+ * \param[in, out] heuristicResultsArray  Output algorithms and associated runtime characteristics,
+ *                                        ordered in increasing estimated compute time.
+ * \param[out]     returnAlgoCount        The number of heuristicResultsArray elements written.
+ *
+ * \retval  CUBLAS_STATUS_INVALID_VALUE   if requestedAlgoCount is less or equal to zero
+ * \retval  CUBLAS_STATUS_NOT_SUPPORTED   if no heuristic function available for current configuration
+ * \retval  CUBLAS_STATUS_SUCCESS         if query was successful, inspect
+ *                                        heuristicResultsArray[0 to (returnAlgoCount - 1)].state
+ *                                        for detail status of results
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoGetHeuristic(cublasLtHandle_t lightHandle,
+                                                           cublasLtMatmulDesc_t operationDesc,
+                                                           cublasLtMatrixLayout_t Adesc,
+                                                           cublasLtMatrixLayout_t Bdesc,
+                                                           cublasLtMatrixLayout_t Cdesc,
+                                                           cublasLtMatrixLayout_t Ddesc,
+                                                           cublasLtMatmulPreference_t preference,
+                                                           int requestedAlgoCount,
+                                                           cublasLtMatmulHeuristicResult_t heuristicResultsArray[],
+                                                           int* returnAlgoCount);
+
+/* ---------------------------------------------------------------------------------------*/
+/* Lower level API to be able to implement own Heuristic and Find routines                */
+/* ---------------------------------------------------------------------------------------*/
+
+/** Routine to get all algo IDs that can potentially run
+ *
+ * \param[in]  requestedAlgoCount  requested number of algos (must be less or equal to size of algoIdsArray
+ *                                 (in elements))
+ * \param[out] algoIdsArray        array to write algoIds to
+ * \param[out] returnAlgoCount     number of algoIds actually written
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if requestedAlgoCount is less or equal to zero
+ * \retval     CUBLAS_STATUS_SUCCESS        if query was successful, inspect returnAlgoCount to get actual number of IDs
+ *                                          available
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoGetIds(cublasLtHandle_t lightHandle,
+                                                     cublasComputeType_t computeType,
+                                                     cudaDataType_t scaleType,
+                                                     cudaDataType_t Atype,
+                                                     cudaDataType_t Btype,
+                                                     cudaDataType_t Ctype,
+                                                     cudaDataType_t Dtype,
+                                                     int requestedAlgoCount,
+                                                     int algoIdsArray[],
+                                                     int* returnAlgoCount);
+
+/** Initialize algo structure
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if algo is NULL or algoId is outside of recognized range
+ * \retval     CUBLAS_STATUS_NOT_SUPPORTED  if algoId is not supported for given combination of data types
+ * \retval     CUBLAS_STATUS_SUCCESS        if the structure was successfully initialized
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoInit(cublasLtHandle_t lightHandle,
+                                                   cublasComputeType_t computeType,
+                                                   cudaDataType_t scaleType,
+                                                   cudaDataType_t Atype,
+                                                   cudaDataType_t Btype,
+                                                   cudaDataType_t Ctype,
+                                                   cudaDataType_t Dtype,
+                                                   int algoId,
+                                                   cublasLtMatmulAlgo_t* algo);
+
+/** Check configured algo descriptor for correctness and support on current device.
+ *
+ * Result includes required workspace size and calculated wave count.
+ *
+ * CUBLAS_STATUS_SUCCESS doesn't fully guarantee algo will run (will fail if e.g. buffers are not correctly aligned);
+ * but if cublasLtMatmulAlgoCheck fails, the algo will not run.
+ *
+ * \param[in]  algo    algo configuration to check
+ * \param[out] result  result structure to report algo runtime characteristics; algo field is never updated
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if matrix layout descriptors or operation descriptor don't match algo
+ *                                          descriptor
+ * \retval     CUBLAS_STATUS_NOT_SUPPORTED  if algo configuration or data type combination is not currently supported on
+ *                                          given device
+ * \retval     CUBLAS_STATUS_ARCH_MISMATCH  if algo configuration cannot be run using the selected device
+ * \retval     CUBLAS_STATUS_SUCCESS        if check was successful
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoCheck(  //
+    cublasLtHandle_t lightHandle,
+    cublasLtMatmulDesc_t operationDesc,
+    cublasLtMatrixLayout_t Adesc,
+    cublasLtMatrixLayout_t Bdesc,
+    cublasLtMatrixLayout_t Cdesc,
+    cublasLtMatrixLayout_t Ddesc,
+    const cublasLtMatmulAlgo_t* algo,  ///< may point to result->algo
+    cublasLtMatmulHeuristicResult_t* result);
+
+/** Capabilities Attributes that can be retrieved from an initialized Algo structure
+ *
+ * Queried with cublasLtMatmulAlgoCapGetAttribute(). Each entry below lists the value type of the attribute.
+ */
+typedef enum {
+  /** support for split K, see CUBLASLT_ALGO_CONFIG_SPLITK_NUM
+   *
+   * int32_t, 0 means no support, supported otherwise
+   */
+  CUBLASLT_ALGO_CAP_SPLITK_SUPPORT = 0,
+  /** reduction scheme mask, see cublasLtReductionScheme_t; shows supported reduction schemes, if reduction scheme is
+   * not masked out it is supported.
+   *
+   * e.g. int isReductionSchemeComputeTypeSupported =
+   *          ((reductionSchemeMask & CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE) ==
+   *           CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE) ? 1 : 0;
+   *
+   * uint32_t
+   */
+  CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK = 1,
+  /** support for cta swizzling, see CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
+   *
+   * uint32_t, 0 means no support, 1 means supported value of 1, other values are reserved
+   */
+  CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT = 2,
+  /** support strided batch
+   *
+   * int32_t, 0 means no support, supported otherwise
+   */
+  CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT = 3,
+  /** support results out of place (D != C in D = alpha.A.B + beta.C)
+   *
+   * int32_t, 0 means no support, supported otherwise
+   */
+  CUBLASLT_ALGO_CAP_OUT_OF_PLACE_RESULT_SUPPORT = 4,
+  /** syrk/herk support (on top of regular gemm)
+   *
+   * int32_t, 0 means no support, supported otherwise
+   */
+  CUBLASLT_ALGO_CAP_UPLO_SUPPORT = 5,
+  /** tile ids possible to use, see cublasLtMatmulTile_t; if no tile ids are supported use
+   * CUBLASLT_MATMUL_TILE_UNDEFINED
+   *
+   * use cublasLtMatmulAlgoCapGetAttribute() with sizeInBytes=0 to query actual count
+   *
+   * array of uint32_t
+   */
+  CUBLASLT_ALGO_CAP_TILE_IDS = 6,
+  /** custom option range is from 0 to CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX (inclusive), see
+   * CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION
+   *
+   * int32_t
+   */
+  CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX = 7,
+  /** whether algorithm is using regular compute or tensor operations
+   *
+   * int32_t 0 means regular compute, 1 means tensor operations;
+   * DEPRECATED
+   */
+  CUBLASLT_ALGO_CAP_MATHMODE_IMPL = 8,
+  /** whether algorithm implements gaussian optimization of complex matrix multiplication, see cublasMath_t
+   *
+   * int32_t 0 means regular compute, 1 means gaussian;
+   * DEPRECATED
+   */
+  CUBLASLT_ALGO_CAP_GAUSSIAN_IMPL = 9,
+  /** whether algorithm supports custom (not COL or ROW) memory order, see cublasLtOrder_t
+   *
+   * int32_t 0 means only COL and ROW memory order is allowed, non-zero means that algo might have different
+   * requirements;
+   */
+  CUBLASLT_ALGO_CAP_CUSTOM_MEMORY_ORDER = 10,
+
+  /** bitmask enumerating pointer modes algorithm supports
+   *
+   * uint32_t, see cublasLtPointerModeMask_t
+   */
+  CUBLASLT_ALGO_CAP_POINTER_MODE_MASK = 11,
+
+  /** bitmask enumerating kinds of postprocessing algorithm supports in the epilogue
+   *
+   * uint32_t, see cublasLtEpilogue_t
+   */
+  CUBLASLT_ALGO_CAP_EPILOGUE_MASK = 12,
+  /** stages ids possible to use, see cublasLtMatmulStages_t; if no stages ids are supported use
+   * CUBLASLT_MATMUL_STAGES_UNDEFINED
+   *
+   * use cublasLtMatmulAlgoCapGetAttribute() with sizeInBytes=0 to query actual count
+   *
+   * array of uint32_t
+   */
+  CUBLASLT_ALGO_CAP_STAGES_IDS = 13,
+  /** support for negative ld for all of the matrices
+   *
+   * int32_t 0 means no support, supported otherwise
+   */
+  CUBLASLT_ALGO_CAP_LD_NEGATIVE = 14,
+  /** details about algorithm's implementation that affect its numerical behavior
+   *
+   * uint64_t, see cublasLtNumericalImplFlags_t
+   */
+  CUBLASLT_ALGO_CAP_NUMERICAL_IMPL_FLAGS = 15,
+  /** minimum alignment required for A matrix in bytes
+   *  (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
+   *
+   * uint32_t
+   */
+  CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_A_BYTES = 16,
+  /** minimum alignment required for B matrix in bytes
+   *  (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
+   *
+   * uint32_t
+   */
+  CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_B_BYTES = 17,
+  /** minimum alignment required for C matrix in bytes
+   *  (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
+   *
+   * uint32_t
+   */
+  CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_C_BYTES = 18,
+  /** minimum alignment required for D matrix in bytes
+   *  (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
+   *
+   * uint32_t
+   */
+  CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_D_BYTES = 19,
+} cublasLtMatmulAlgoCapAttributes_t;
+
+/** Get algo capability attribute.
+ *
+ * E.g. to get list of supported Tile IDs:
+ *      cublasLtMatmulTile_t tiles[CUBLASLT_MATMUL_TILE_END];
+ *      size_t num_tiles, size_written;
+ *      if (cublasLtMatmulAlgoCapGetAttribute(algo, CUBLASLT_ALGO_CAP_TILE_IDS, tiles, sizeof(tiles), &size_written) ==
+ *          CUBLAS_STATUS_SUCCESS) {
+ *        num_tiles = size_written / sizeof(tiles[0]);
+ *      }
+ *
+ * \param[in]  algo         The algo descriptor
+ * \param[in]  attr         The attribute
+ * \param[out] buf          memory address to write the attribute value to
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
+ *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
+ *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoCapGetAttribute(const cublasLtMatmulAlgo_t* algo,
+                                                              cublasLtMatmulAlgoCapAttributes_t attr,
+                                                              void* buf,
+                                                              size_t sizeInBytes,
+                                                              size_t* sizeWritten);
+
+/** Algo Configuration Attributes that can be set according to the Algo capabilities
+ *
+ * Written with cublasLtMatmulAlgoConfigSetAttribute() and read back with cublasLtMatmulAlgoConfigGetAttribute().
+ * Each entry below lists the value type and, where one is documented, the default of the attribute.
+ */
+typedef enum {
+  /** algorithm index, see cublasLtMatmulAlgoGetIds()
+   *
+   * readonly, set by cublasLtMatmulAlgoInit()
+   * int32_t
+   */
+  CUBLASLT_ALGO_CONFIG_ID = 0,
+  /** tile id, see cublasLtMatmulTile_t
+   *
+   * uint32_t, default: CUBLASLT_MATMUL_TILE_UNDEFINED
+   */
+  CUBLASLT_ALGO_CONFIG_TILE_ID = 1,
+  /** Number of K splits. If the number of K splits is greater than one, SPLITK_NUM parts
+   * of matrix multiplication will be computed in parallel. The results will be accumulated
+   * according to CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
+   *
+   * int32_t, default: 1
+   */
+  CUBLASLT_ALGO_CONFIG_SPLITK_NUM = 2,
+  /** reduction scheme, see cublasLtReductionScheme_t
+   *
+   * uint32_t, default: CUBLASLT_REDUCTION_SCHEME_NONE
+   */
+  CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME = 3,
+  /** cta swizzling, change mapping from CUDA grid coordinates to parts of the matrices
+   *
+   * possible values: 0, 1, other values reserved
+   *
+   * uint32_t, default: 0
+   */
+  CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING = 4,
+  /** custom option, each algorithm can support some custom options that don't fit description of the other config
+   * attributes, see CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX to get accepted range for any specific case
+   *
+   * uint32_t, default: 0
+   */
+  CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION = 5,
+  /** stages id, see cublasLtMatmulStages_t
+   *
+   * uint32_t, default: CUBLASLT_MATMUL_STAGES_UNDEFINED
+   */
+  CUBLASLT_ALGO_CONFIG_STAGES_ID = 6,
+  /** inner shape id, see cublasLtMatmulInnerShape_t
+   *
+   * uint16_t, default: 0 (CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED)
+   */
+  CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID = 7,
+  /** Thread Block Cluster shape id, see cublasLtClusterShape_t. Defines cluster size to use.
+   *
+   * uint16_t, default: 0 (CUBLASLT_CLUSTER_SHAPE_AUTO)
+   */
+  CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID = 8,
+} cublasLtMatmulAlgoConfigAttributes_t;
+
+/** Set algo configuration attribute.
+ *
+ * \param[in]  algo         The algo descriptor
+ * \param[in]  attr         The attribute
+ * \param[in]  buf          memory address containing the new value
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoConfigSetAttribute(cublasLtMatmulAlgo_t* algo,
+                                                                 cublasLtMatmulAlgoConfigAttributes_t attr,
+                                                                 const void* buf,
+                                                                 size_t sizeInBytes);
+
+/** Get algo configuration attribute.
+ *
+ * \param[in]  algo         The algo descriptor
+ * \param[in]  attr         The attribute
+ * \param[out] buf          memory address to write the attribute value to
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
+ *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
+ *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoConfigGetAttribute(const cublasLtMatmulAlgo_t* algo,
+                                                                 cublasLtMatmulAlgoConfigAttributes_t attr,
+                                                                 void* buf,
+                                                                 size_t sizeInBytes,
+                                                                 size_t* sizeWritten);
+
+/** Experimental: Logger callback type.
+ *
+ * Pointer to a user-defined function that can be registered with cublasLtLoggerSetCallback(). The function receives
+ * an integer log level and two C strings (a function name and a message) and returns nothing.
+ */
+typedef void (*cublasLtLoggerCallback_t)(int logLevel, const char* functionName, const char* message);
+
+/** Experimental: Logger callback setter.
+ *
+ * \param[in]  callback                     a user defined callback function to be called by the logger
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS        if callback was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtLoggerSetCallback(cublasLtLoggerCallback_t callback);
+
+/** Experimental: Log file setter.
+ *
+ * \param[in]  file                         an open file with write permissions
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS        if log file was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtLoggerSetFile(FILE* file);
+
+/** Experimental: Open log file.
+ *
+ * \param[in]  logFile                      log file path. if the log file does not exist, it will be created
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS        if log file was created successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtLoggerOpenFile(const char* logFile);
+
+/** Experimental: Log level setter.
+ *
+ * \param[in]  level                        log level, should be one of the following:
+ *                                          0. Off
+ *                                          1. Errors
+ *                                          2. Performance Trace
+ *                                          3. Performance Hints
+ *                                          4. Heuristics Trace
+ *                                          5. API Trace
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if log level is not one of the above levels
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS        if log level was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtLoggerSetLevel(int level);
+
+/** Experimental: Log mask setter.
+ *
+ * \param[in]  mask                         log mask, should be a combination of the following masks:
+ *                                          0.  Off
+ *                                          1.  Errors
+ *                                          2.  Performance Trace
+ *                                          4.  Performance Hints
+ *                                          8.  Heuristics Trace
+ *                                          16. API Trace
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS        if log mask was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtLoggerSetMask(int mask);
+
+/** Experimental: Disable logging for the entire session.
+ *
+ * Takes no arguments. Declared with an explicit (void) parameter list so that C translation units see a full
+ * prototype instead of a declaration with unspecified arguments.
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS        if disabled logging
+ */
+cublasStatus_t CUBLASWINAPI cublasLtLoggerForceDisable(void);
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..aec546259cff3ece866faf6dd35f5909a82d4d23
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h
@@ -0,0 +1,3478 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ * This is the public header file for the CUBLAS library, defining the API
+ *
+ * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
+ * on top of the CUDA runtime.
+ */
+
+#if !defined(CUBLAS_API_H_)
+#define CUBLAS_API_H_
+
+#ifndef CUBLASWINAPI
+#ifdef _WIN32
+#define CUBLASWINAPI __stdcall
+#else
+#define CUBLASWINAPI
+#endif
+#endif
+
+#ifndef CUBLASAPI
+#error "This file should not be included without defining CUBLASAPI"
+#endif
+
+#include "driver_types.h"
+#include "cuComplex.h" /* import complex data type */
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+#include <library_types.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+#define CUBLAS_VER_MAJOR 11
+#define CUBLAS_VER_MINOR 11
+#define CUBLAS_VER_PATCH 3
+#define CUBLAS_VER_BUILD 6
+#define CUBLAS_VERSION (CUBLAS_VER_MAJOR * 10000 + CUBLAS_VER_MINOR * 100 + CUBLAS_VER_PATCH)
+
+/* CUBLAS status type returns */
+typedef enum {
+  CUBLAS_STATUS_SUCCESS = 0,
+  CUBLAS_STATUS_NOT_INITIALIZED = 1,
+  CUBLAS_STATUS_ALLOC_FAILED = 3,
+  CUBLAS_STATUS_INVALID_VALUE = 7,
+  CUBLAS_STATUS_ARCH_MISMATCH = 8,
+  CUBLAS_STATUS_MAPPING_ERROR = 11,
+  CUBLAS_STATUS_EXECUTION_FAILED = 13,
+  CUBLAS_STATUS_INTERNAL_ERROR = 14,
+  CUBLAS_STATUS_NOT_SUPPORTED = 15,
+  CUBLAS_STATUS_LICENSE_ERROR = 16
+} cublasStatus_t;
+
+typedef enum { CUBLAS_FILL_MODE_LOWER = 0, CUBLAS_FILL_MODE_UPPER = 1, CUBLAS_FILL_MODE_FULL = 2 } cublasFillMode_t;
+
+typedef enum { CUBLAS_DIAG_NON_UNIT = 0, CUBLAS_DIAG_UNIT = 1 } cublasDiagType_t;
+
+typedef enum { CUBLAS_SIDE_LEFT = 0, CUBLAS_SIDE_RIGHT = 1 } cublasSideMode_t;
+
+typedef enum {
+  CUBLAS_OP_N = 0,
+  CUBLAS_OP_T = 1,
+  CUBLAS_OP_C = 2,
+  CUBLAS_OP_HERMITAN = 2, /* synonym of CUBLAS_OP_C (same enumerator value) */
+  CUBLAS_OP_CONJG = 3     /* conjugate, placeholder - not supported in the current release */
+} cublasOperation_t;
+
+typedef enum { CUBLAS_POINTER_MODE_HOST = 0, CUBLAS_POINTER_MODE_DEVICE = 1 } cublasPointerMode_t;
+
+typedef enum { CUBLAS_ATOMICS_NOT_ALLOWED = 0, CUBLAS_ATOMICS_ALLOWED = 1 } cublasAtomicsMode_t;
+
+/* Selectors for the different GEMM algorithms.
+ * CUBLAS_GEMM_DFALT and CUBLAS_GEMM_DFALT_TENSOR_OP have the same values as
+ * CUBLAS_GEMM_DEFAULT and CUBLAS_GEMM_DEFAULT_TENSOR_OP respectively. */
+typedef enum {
+  CUBLAS_GEMM_DFALT = -1,
+  CUBLAS_GEMM_DEFAULT = -1,
+  CUBLAS_GEMM_ALGO0 = 0,
+  CUBLAS_GEMM_ALGO1 = 1,
+  CUBLAS_GEMM_ALGO2 = 2,
+  CUBLAS_GEMM_ALGO3 = 3,
+  CUBLAS_GEMM_ALGO4 = 4,
+  CUBLAS_GEMM_ALGO5 = 5,
+  CUBLAS_GEMM_ALGO6 = 6,
+  CUBLAS_GEMM_ALGO7 = 7,
+  CUBLAS_GEMM_ALGO8 = 8,
+  CUBLAS_GEMM_ALGO9 = 9,
+  CUBLAS_GEMM_ALGO10 = 10,
+  CUBLAS_GEMM_ALGO11 = 11,
+  CUBLAS_GEMM_ALGO12 = 12,
+  CUBLAS_GEMM_ALGO13 = 13,
+  CUBLAS_GEMM_ALGO14 = 14,
+  CUBLAS_GEMM_ALGO15 = 15,
+  CUBLAS_GEMM_ALGO16 = 16,
+  CUBLAS_GEMM_ALGO17 = 17,
+  CUBLAS_GEMM_ALGO18 = 18,  // sliced 32x32
+  CUBLAS_GEMM_ALGO19 = 19,  // sliced 64x32
+  CUBLAS_GEMM_ALGO20 = 20,  // sliced 128x32
+  CUBLAS_GEMM_ALGO21 = 21,  // sliced 32x32  -splitK
+  CUBLAS_GEMM_ALGO22 = 22,  // sliced 64x32  -splitK
+  CUBLAS_GEMM_ALGO23 = 23,  // sliced 128x32 -splitK
+  CUBLAS_GEMM_DEFAULT_TENSOR_OP = 99,
+  CUBLAS_GEMM_DFALT_TENSOR_OP = 99,
+  CUBLAS_GEMM_ALGO0_TENSOR_OP = 100,
+  CUBLAS_GEMM_ALGO1_TENSOR_OP = 101,
+  CUBLAS_GEMM_ALGO2_TENSOR_OP = 102,
+  CUBLAS_GEMM_ALGO3_TENSOR_OP = 103,
+  CUBLAS_GEMM_ALGO4_TENSOR_OP = 104,
+  CUBLAS_GEMM_ALGO5_TENSOR_OP = 105,
+  CUBLAS_GEMM_ALGO6_TENSOR_OP = 106,
+  CUBLAS_GEMM_ALGO7_TENSOR_OP = 107,
+  CUBLAS_GEMM_ALGO8_TENSOR_OP = 108,
+  CUBLAS_GEMM_ALGO9_TENSOR_OP = 109,
+  CUBLAS_GEMM_ALGO10_TENSOR_OP = 110,
+  CUBLAS_GEMM_ALGO11_TENSOR_OP = 111,
+  CUBLAS_GEMM_ALGO12_TENSOR_OP = 112,
+  CUBLAS_GEMM_ALGO13_TENSOR_OP = 113,
+  CUBLAS_GEMM_ALGO14_TENSOR_OP = 114,
+  CUBLAS_GEMM_ALGO15_TENSOR_OP = 115
+} cublasGemmAlgo_t;
+
+/* Enum for default math mode/tensor operation */
+typedef enum {
+  CUBLAS_DEFAULT_MATH = 0,
+
+  /* deprecated, same effect as using CUBLAS_COMPUTE_32F_FAST_16F, will be removed in a future release */
+  CUBLAS_TENSOR_OP_MATH = 1,
+
+  /* same as using matching _PEDANTIC compute type when using cublas<T>routine calls or cublasEx() calls with
+     cudaDataType as compute type */
+  CUBLAS_PEDANTIC_MATH = 2,
+
+  /* allow accelerating single precision routines using TF32 tensor cores */
+  CUBLAS_TF32_TENSOR_OP_MATH = 3,
+
+  /* flag to force any reductions to use the accumulator type and not the output type in case of mixed precision
+     routines with lower size output type */
+  CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION = 16,
+} cublasMath_t;
+
+/* For backward compatibility purposes */
+typedef cudaDataType cublasDataType_t;
+
+/* Enum for compute type
+ *
+ * - default types provide best available performance using all available hardware features
+ *   and guarantee internal storage precision with at least the same precision and range;
+ * - _PEDANTIC types ensure standard arithmetic and exact specified internal storage format;
+ * - _FAST types allow for some loss of precision to enable higher throughput arithmetic.
+ */
+typedef enum {
+  CUBLAS_COMPUTE_16F = 64,           /* half - default */
+  CUBLAS_COMPUTE_16F_PEDANTIC = 65,  /* half - pedantic */
+  CUBLAS_COMPUTE_32F = 68,           /* float - default */
+  CUBLAS_COMPUTE_32F_PEDANTIC = 69,  /* float - pedantic */
+  CUBLAS_COMPUTE_32F_FAST_16F = 74,  /* float - fast, allows down-converting inputs to half or TF32 */
+  CUBLAS_COMPUTE_32F_FAST_16BF = 75, /* float - fast, allows down-converting inputs to bfloat16 or TF32 */
+  CUBLAS_COMPUTE_32F_FAST_TF32 = 77, /* float - fast, allows down-converting inputs to TF32 */
+  CUBLAS_COMPUTE_64F = 70,           /* double - default */
+  CUBLAS_COMPUTE_64F_PEDANTIC = 71,  /* double - pedantic */
+  CUBLAS_COMPUTE_32I = 72,           /* signed 32-bit int - default */
+  CUBLAS_COMPUTE_32I_PEDANTIC = 73,  /* signed 32-bit int - pedantic */
+} cublasComputeType_t;
+
+/* Opaque structure holding CUBLAS library context */
+struct cublasContext;
+typedef struct cublasContext* cublasHandle_t;
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t* handle);
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, int* version);
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, int* value);
+CUBLASAPI size_t CUBLASWINAPI cublasGetCudartVersion(void);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetWorkspace_v2(cublasHandle_t handle,
+                                                            void* workspace,
+                                                            size_t workspaceSizeInBytes);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId);
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle, cudaStream_t* streamId);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t* mode);
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t mode);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t* mode);
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, cublasMath_t* mode);
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetSmCountTarget(cublasHandle_t handle, int* smCountTarget);
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetSmCountTarget(cublasHandle_t handle, int smCountTarget);
+
+CUBLASAPI const char* CUBLASWINAPI cublasGetStatusName(cublasStatus_t status);
+CUBLASAPI const char* CUBLASWINAPI cublasGetStatusString(cublasStatus_t status);
+
+/* Cublas logging */
+typedef void (*cublasLogCallback)(const char* msg);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasLoggerConfigure(int logIsOn,
+                                                            int logToStdOut,
+                                                            int logToStdErr,
+                                                            const char* logFileName);
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetLoggerCallback(cublasLogCallback userCallback);
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetLoggerCallback(cublasLogCallback* userCallback);
+
+/*
+ * cublasStatus_t
+ * cublasSetVector (int n, int elemSize, const void *x, int incx,
+ *                  void *y, int incy)
+ *
+ * copies n elements from a vector x in CPU memory space to a vector y
+ * in GPU memory space. Elements in both vectors are assumed to have a
+ * size of elemSize bytes. Storage spacing between consecutive elements
+ * is incx for the source vector x and incy for the destination vector
+ * y. In general, y points to an object, or part of an object, allocated
+ * via cublasAlloc(). Column major format for two-dimensional matrices
+ * is assumed throughout CUBLAS. Therefore, if the increment for a vector
+ * is equal to 1, this accesses a column vector, while using an increment
+ * equal to the leading dimension of the respective matrix accesses a
+ * row vector.
+ *
+ * Note: the destination vector y is named devicePtr in the prototype below.
+ *
+ * Return Values
+ * -------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_INVALID_VALUE    if incx, incy, or elemSize <= 0
+ * CUBLAS_STATUS_MAPPING_ERROR    if an error occurred accessing GPU memory
+ * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void* x, int incx, void* devicePtr, int incy);
+
+/*
+ * cublasStatus_t
+ * cublasGetVector (int n, int elemSize, const void *x, int incx,
+ *                  void *y, int incy)
+ *
+ * copies n elements from a vector x in GPU memory space to a vector y
+ * in CPU memory space. Elements in both vectors are assumed to have a
+ * size of elemSize bytes. Storage spacing between consecutive elements
+ * is incx for the source vector x and incy for the destination vector
+ * y. In general, x points to an object, or part of an object, allocated
+ * via cublasAlloc(). Column major format for two-dimensional matrices
+ * is assumed throughout CUBLAS. Therefore, if the increment for a vector
+ * is equal to 1, this accesses a column vector, while using an increment
+ * equal to the leading dimension of the respective matrix accesses a
+ * row vector.
+ *
+ * Return Values
+ * -------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_INVALID_VALUE    if incx, incy, or elemSize <= 0
+ * CUBLAS_STATUS_MAPPING_ERROR    if an error occurred accessing GPU memory
+ * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void* x, int incx, void* y, int incy);
+
+/*
+ * cublasStatus_t
+ * cublasSetMatrix (int rows, int cols, int elemSize, const void *A,
+ *                  int lda, void *B, int ldb)
+ *
+ * copies a tile of rows x cols elements from a matrix A in CPU memory
+ * space to a matrix B in GPU memory space. Each element requires storage
+ * of elemSize bytes. Both matrices are assumed to be stored in column
+ * major format, with the leading dimension (i.e. number of rows) of
+ * source matrix A provided in lda, and the leading dimension of matrix B
+ * provided in ldb. In general, B points to an object, or part of an
+ * object, that was allocated via cublasAlloc().
+ *
+ * Return Values
+ * -------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_INVALID_VALUE    if rows or cols < 0, or elemSize, lda, or
+ *                                ldb <= 0
+ * CUBLAS_STATUS_MAPPING_ERROR    if error occurred accessing GPU memory
+ * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb);
+
+/*
+ * cublasStatus_t
+ * cublasGetMatrix (int rows, int cols, int elemSize, const void *A,
+ *                  int lda, void *B, int ldb)
+ *
+ * copies a tile of rows x cols elements from a matrix A in GPU memory
+ * space to a matrix B in CPU memory space. Each element requires storage
+ * of elemSize bytes. Both matrices are assumed to be stored in column
+ * major format, with the leading dimension (i.e. number of rows) of
+ * source matrix A provided in lda, and the leading dimension of matrix B
+ * provided in ldb. In general, A points to an object, or part of an
+ * object, that was allocated via cublasAlloc().
+ *
+ * Return Values
+ * -------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_INVALID_VALUE    if rows, cols, elemSize, lda, or ldb <= 0
+ * CUBLAS_STATUS_MAPPING_ERROR    if error occurred accessing GPU memory
+ * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb);
+
+/*
+ * cublasStatus_t
+ * cublasSetVectorAsync ( int n, int elemSize, const void *x, int incx,
+ *                       void *y, int incy, cudaStream_t stream );
+ *
+ * cublasSetVectorAsync has the same functionality as cublasSetVector,
+ * but the transfer is done asynchronously within the CUDA stream passed
+ * as a parameter.
+ *
+ * Note: x and y are named hostPtr and devicePtr in the prototype below.
+ *
+ * Return Values
+ * -------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_INVALID_VALUE    if incx, incy, or elemSize <= 0
+ * CUBLAS_STATUS_MAPPING_ERROR    if an error occurred accessing GPU memory
+ * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasSetVectorAsync(
+    int n, int elemSize, const void* hostPtr, int incx, void* devicePtr, int incy, cudaStream_t stream);
+/*
+ * cublasStatus_t
+ * cublasGetVectorAsync( int n, int elemSize, const void *x, int incx,
+ *                       void *y, int incy, cudaStream_t stream)
+ *
+ * cublasGetVectorAsync has the same functionality as cublasGetVector,
+ * but the transfer is done asynchronously within the CUDA stream passed
+ * as a parameter.
+ *
+ * Note: x and y are named devicePtr and hostPtr in the prototype below.
+ *
+ * Return Values
+ * -------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_INVALID_VALUE    if incx, incy, or elemSize <= 0
+ * CUBLAS_STATUS_MAPPING_ERROR    if an error occurred accessing GPU memory
+ * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasGetVectorAsync(
+    int n, int elemSize, const void* devicePtr, int incx, void* hostPtr, int incy, cudaStream_t stream);
+
+/*
+ * cublasStatus_t
+ * cublasSetMatrixAsync (int rows, int cols, int elemSize, const void *A,
+ *                       int lda, void *B, int ldb, cudaStream_t stream)
+ *
+ * cublasSetMatrixAsync has the same functionality as cublasSetMatrix,
+ * but the transfer is done asynchronously within the CUDA stream passed
+ * as a parameter.
+ *
+ * Return Values
+ * -------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_INVALID_VALUE    if rows or cols < 0, or elemSize, lda, or
+ *                                ldb <= 0
+ * CUBLAS_STATUS_MAPPING_ERROR    if error occurred accessing GPU memory
+ * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
+ */
+cublasStatus_t CUBLASWINAPI
+cublasSetMatrixAsync(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb, cudaStream_t stream);
+
+/*
+ * cublasStatus_t
+ * cublasGetMatrixAsync (int rows, int cols, int elemSize, const void *A,
+ *                       int lda, void *B, int ldb, cudaStream_t stream)
+ *
+ * cublasGetMatrixAsync has the same functionality as cublasGetMatrix,
+ * but the transfer is done asynchronously within the CUDA stream passed
+ * as a parameter.
+ *
+ * Return Values
+ * -------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_INVALID_VALUE    if rows, cols, elemSize, lda, or ldb <= 0
+ * CUBLAS_STATUS_MAPPING_ERROR    if error occurred accessing GPU memory
+ * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
+ */
+cublasStatus_t CUBLASWINAPI
+cublasGetMatrixAsync(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb, cudaStream_t stream);
+
+CUBLASAPI void CUBLASWINAPI cublasXerbla(const char* srName, int info);
+/* ---------------- CUBLAS BLAS1 functions ---------------- */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle,
+                                                   int n,
+                                                   const void* x,
+                                                   cudaDataType xType,
+                                                   int incx,
+                                                   void* result,
+                                                   cudaDataType resultType,
+                                                   cudaDataType executionType); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSnrm2_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDnrm2_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasScnrm2_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(
+    cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotEx(cublasHandle_t handle,
+                                                  int n,
+                                                  const void* x,
+                                                  cudaDataType xType,
+                                                  int incx,
+                                                  const void* y,
+                                                  cudaDataType yType,
+                                                  int incy,
+                                                  void* result,
+                                                  cudaDataType resultType,
+                                                  cudaDataType executionType);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotcEx(cublasHandle_t handle,
+                                                   int n,
+                                                   const void* x,
+                                                   cudaDataType xType,
+                                                   int incx,
+                                                   const void* y,
+                                                   cudaDataType yType,
+                                                   int incy,
+                                                   void* result,
+                                                   cudaDataType resultType,
+                                                   cudaDataType executionType);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdot_v2(cublasHandle_t handle,
+                                                    int n,
+                                                    const float* x,
+                                                    int incx,
+                                                    const float* y,
+                                                    int incy,
+                                                    float* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdot_v2(cublasHandle_t handle,
+                                                    int n,
+                                                    const double* x,
+                                                    int incx,
+                                                    const double* y,
+                                                    int incy,
+                                                    double* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotu_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     const cuComplex* y,
+                                                     int incy,
+                                                     cuComplex* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotc_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     const cuComplex* y,
+                                                     int incy,
+                                                     cuComplex* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* y,
+                                                     int incy,
+                                                     cuDoubleComplex* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* y,
+                                                     int incy,
+                                                     cuDoubleComplex* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScalEx(cublasHandle_t handle,
+                                                   int n,
+                                                   const void* alpha, /* host or device pointer */
+                                                   cudaDataType alphaType,
+                                                   void* x,
+                                                   cudaDataType xType,
+                                                   int incx,
+                                                   cudaDataType executionType);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSscal_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const float* alpha, /* host or device pointer */
+                                                     float* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDscal_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const double* alpha, /* host or device pointer */
+                                                     double* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCscal_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     cuComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsscal_v2(cublasHandle_t handle,
+                                                      int n,
+                                                      const float* alpha, /* host or device pointer */
+                                                      cuComplex* x,
+                                                      int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZscal_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     cuDoubleComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdscal_v2(cublasHandle_t handle,
+                                                      int n,
+                                                      const double* alpha, /* host or device pointer */
+                                                      cuDoubleComplex* x,
+                                                      int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAxpyEx(cublasHandle_t handle,
+                                                   int n,
+                                                   const void* alpha, /* host or device pointer */
+                                                   cudaDataType alphaType,
+                                                   const void* x,
+                                                   cudaDataType xType,
+                                                   int incx,
+                                                   void* y,
+                                                   cudaDataType yType,
+                                                   int incy,
+                                                   cudaDataType executiontype);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSaxpy_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* x,
+                                                     int incx,
+                                                     float* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDaxpy_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* x,
+                                                     int incx,
+                                                     double* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCaxpy_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     cuComplex* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZaxpy_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     cuDoubleComplex* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCopyEx(
+    cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, void* y, cudaDataType yType, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasScopy_v2(cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDcopy_v2(cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCcopy_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, cuComplex* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasZcopy_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
+/* SWAP family: both x and y are non-const (presumably exchanged); cublasSwapEx takes cudaDataType-tagged void*. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSswap_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDswap_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCswap_v2(cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasZswap_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSwapEx(
+    cublasHandle_t handle, int n, void* x, cudaDataType xType, int incx, void* y, cudaDataType yType, int incy);
+/* I?AMAX family: x is const; an int is written through result (presumably the index of the largest-magnitude entry). */
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIsamax_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIdamax_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIcamax_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIzamax_v2(
+    cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIamaxEx(
+    cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result /* host or device pointer */
+);
+/* I?AMIN family: x is const; an int is written through result (presumably the index of the smallest-magnitude entry). */
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIsamin_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIdamin_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIcamin_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIzamin_v2(
+    cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIaminEx(
+    cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result /* host or device pointer */
+);
+/* ASUM family: x is const, the scalar goes out through result; Scasum/Dzasum take complex x but a real result. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAsumEx(cublasHandle_t handle,
+                                                   int n,
+                                                   const void* x,
+                                                   cudaDataType xType,
+                                                   int incx,
+                                                   void* result, /* host or device pointer */
+                                                   cudaDataType resultType,
+                                                   cudaDataType executiontype);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSasum_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDasum_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasScasum_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDzasum_v2(
+    cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result); /* host or device pointer */
+/* ROT family: x, y are non-const; c, s are const scalars. Csrot/Zdrot pair complex x, y with real c and s. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrot_v2(cublasHandle_t handle,
+                                                    int n,
+                                                    float* x,
+                                                    int incx,
+                                                    float* y,
+                                                    int incy,
+                                                    const float* c,  /* host or device pointer */
+                                                    const float* s); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrot_v2(cublasHandle_t handle,
+                                                    int n,
+                                                    double* x,
+                                                    int incx,
+                                                    double* y,
+                                                    int incy,
+                                                    const double* c,  /* host or device pointer */
+                                                    const double* s); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrot_v2(cublasHandle_t handle,
+                                                    int n,
+                                                    cuComplex* x,
+                                                    int incx,
+                                                    cuComplex* y,
+                                                    int incy,
+                                                    const float* c,      /* host or device pointer */
+                                                    const cuComplex* s); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsrot_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     cuComplex* x,
+                                                     int incx,
+                                                     cuComplex* y,
+                                                     int incy,
+                                                     const float* c,  /* host or device pointer */
+                                                     const float* s); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrot_v2(cublasHandle_t handle,
+                                                    int n,
+                                                    cuDoubleComplex* x,
+                                                    int incx,
+                                                    cuDoubleComplex* y,
+                                                    int incy,
+                                                    const double* c,           /* host or device pointer */
+                                                    const cuDoubleComplex* s); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdrot_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     cuDoubleComplex* x,
+                                                     int incx,
+                                                     cuDoubleComplex* y,
+                                                     int incy,
+                                                     const double* c,  /* host or device pointer */
+                                                     const double* s); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotEx(cublasHandle_t handle,
+                                                  int n,
+                                                  void* x,
+                                                  cudaDataType xType,
+                                                  int incx,
+                                                  void* y,
+                                                  cudaDataType yType,
+                                                  int incy,
+                                                  const void* c, /* host or device pointer */
+                                                  const void* s, /* host or device pointer */
+                                                  cudaDataType csType,
+                                                  cudaDataType executiontype);
+/* ROTG family: no vector arguments; a, b, c and s are all non-const scalars behind host or device pointers. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotg_v2(cublasHandle_t handle,
+                                                     float* a,  /* host or device pointer */
+                                                     float* b,  /* host or device pointer */
+                                                     float* c,  /* host or device pointer */
+                                                     float* s); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotg_v2(cublasHandle_t handle,
+                                                     double* a,  /* host or device pointer */
+                                                     double* b,  /* host or device pointer */
+                                                     double* c,  /* host or device pointer */
+                                                     double* s); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrotg_v2(cublasHandle_t handle,
+                                                     cuComplex* a,  /* host or device pointer */
+                                                     cuComplex* b,  /* host or device pointer */
+                                                     float* c,      /* host or device pointer */
+                                                     cuComplex* s); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrotg_v2(cublasHandle_t handle,
+                                                     cuDoubleComplex* a,  /* host or device pointer */
+                                                     cuDoubleComplex* b,  /* host or device pointer */
+                                                     double* c,           /* host or device pointer */
+                                                     cuDoubleComplex* s); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotgEx(cublasHandle_t handle,
+                                                   void* a, /* host or device pointer */
+                                                   void* b, /* host or device pointer */
+                                                   cudaDataType abType,
+                                                   void* c, /* host or device pointer */
+                                                   void* s, /* host or device pointer */
+                                                   cudaDataType csType,
+                                                   cudaDataType executiontype);
+/* ROTM family: x and y are non-const; param is const and may be a host or device pointer. Real types and Ex only. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     float* x,
+                                                     int incx,
+                                                     float* y,
+                                                     int incy,
+                                                     const float* param); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     double* x,
+                                                     int incx,
+                                                     double* y,
+                                                     int incy,
+                                                     const double* param); /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotmEx(cublasHandle_t handle,
+                                                   int n,
+                                                   void* x,
+                                                   cudaDataType xType,
+                                                   int incx,
+                                                   void* y,
+                                                   cudaDataType yType,
+                                                   int incy,
+                                                   const void* param, /* host or device pointer */
+                                                   cudaDataType paramType,
+                                                   cudaDataType executiontype);
+/* ROTMG family: d1, d2, x1 and param are non-const, y1 is const; every operand is a host or device pointer. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotmg_v2(cublasHandle_t handle,
+                                                      float* d1,       /* host or device pointer */
+                                                      float* d2,       /* host or device pointer */
+                                                      float* x1,       /* host or device pointer */
+                                                      const float* y1, /* host or device pointer */
+                                                      float* param);   /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotmg_v2(cublasHandle_t handle,
+                                                      double* d1,       /* host or device pointer */
+                                                      double* d2,       /* host or device pointer */
+                                                      double* x1,       /* host or device pointer */
+                                                      const double* y1, /* host or device pointer */
+                                                      double* param);   /* host or device pointer */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotmgEx(cublasHandle_t handle,
+                                                    void* d1, /* host or device pointer */
+                                                    cudaDataType d1Type,
+                                                    void* d2, /* host or device pointer */
+                                                    cudaDataType d2Type,
+                                                    void* x1, /* host or device pointer */
+                                                    cudaDataType x1Type,
+                                                    const void* y1, /* host or device pointer */
+                                                    cudaDataType y1Type,
+                                                    void* param, /* host or device pointer */
+                                                    cudaDataType paramType,
+                                                    cudaDataType executiontype);
+/* --------------- CUBLAS BLAS2 functions  ---------------- */
+
+/* GEMV: general matrix-vector multiply. A and x are const, y is not; alpha and beta are host or device pointers. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* A,
+                                                     int lda,
+                                                     const float* x,
+                                                     int incx,
+                                                     const float* beta, /* host or device pointer */
+                                                     float* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* A,
+                                                     int lda,
+                                                     const double* x,
+                                                     int incx,
+                                                     const double* beta, /* host or device pointer */
+                                                     double* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     const cuComplex* beta, /* host or device pointer */
+                                                     cuComplex* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* beta, /* host or device pointer */
+                                                     cuDoubleComplex* y,
+                                                     int incy);
+/* GBMV: general band matrix-vector multiply. Same parameters as GEMV plus kl and ku (presumably the band widths). */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgbmv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     int kl,
+                                                     int ku,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* A,
+                                                     int lda,
+                                                     const float* x,
+                                                     int incx,
+                                                     const float* beta, /* host or device pointer */
+                                                     float* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgbmv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     int kl,
+                                                     int ku,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* A,
+                                                     int lda,
+                                                     const double* x,
+                                                     int incx,
+                                                     const double* beta, /* host or device pointer */
+                                                     double* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgbmv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     int kl,
+                                                     int ku,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     const cuComplex* beta, /* host or device pointer */
+                                                     cuComplex* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgbmv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     int kl,
+                                                     int ku,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* beta, /* host or device pointer */
+                                                     cuDoubleComplex* y,
+                                                     int incy);
+
+/* TRMV: triangular matrix-vector multiply. A is const; x is the only non-const operand, so presumably in/out. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const float* A,
+                                                     int lda,
+                                                     float* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const double* A,
+                                                     int lda,
+                                                     double* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     cuComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     cuDoubleComplex* x,
+                                                     int incx);
+
+/* TBMV: triangular band matrix-vector multiply. Same parameters as TRMV plus int k (presumably the band width). */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k,
+                                                     const float* A,
+                                                     int lda,
+                                                     float* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k,
+                                                     const double* A,
+                                                     int lda,
+                                                     double* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k,
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     cuComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k,
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     cuDoubleComplex* x,
+                                                     int incx);
+
+/* TPMV: triangular packed matrix-vector multiply. Takes const AP in place of A and has no lda argument. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const float* AP,
+                                                     float* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const double* AP,
+                                                     double* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuComplex* AP,
+                                                     cuComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuDoubleComplex* AP,
+                                                     cuDoubleComplex* x,
+                                                     int incx);
+
+/* TRSV: triangular solve. Same parameter list as TRMV: const A with lda, and x as the only non-const operand. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const float* A,
+                                                     int lda,
+                                                     float* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const double* A,
+                                                     int lda,
+                                                     double* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     cuComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     cuDoubleComplex* x,
+                                                     int incx);
+
+/* TPSV: triangular solve, packed matrix storage (AP, no lda) */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpsv_v2(cublasHandle_t handle, /* 'S' variant: AP and x are float */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const float* AP, /* const input; no lda follows it, unlike A in the trsv signatures */
+                                                     float* x, /* sole non-const buffer; presumably in/out (no separate rhs) - confirm */
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpsv_v2(cublasHandle_t handle, /* 'D' variant: AP and x are double */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const double* AP, /* const input; no lda follows it, unlike A in the trsv signatures */
+                                                     double* x, /* sole non-const buffer; presumably in/out (no separate rhs) - confirm */
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpsv_v2(cublasHandle_t handle, /* 'C' variant: AP and x are cuComplex */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuComplex* AP, /* const input; no lda follows it, unlike A in the trsv signatures */
+                                                     cuComplex* x, /* sole non-const buffer; presumably in/out (no separate rhs) - confirm */
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpsv_v2(cublasHandle_t handle, /* 'Z' variant: AP and x are cuDoubleComplex */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuDoubleComplex* AP, /* const input; no lda follows it, unlike A in the trsv signatures */
+                                                     cuDoubleComplex* x, /* sole non-const buffer; presumably in/out (no separate rhs) - confirm */
+                                                     int incx);
+/* TBSV: triangular solve, banded matrix storage (extra k argument) */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbsv_v2(cublasHandle_t handle, /* 'S' variant: A and x are float */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k, /* extra vs. trsv; presumably the band width (BLAS convention) - confirm */
+                                                     const float* A, /* const: read-only input */
+                                                     int lda,
+                                                     float* x, /* sole non-const buffer; presumably in/out (no separate rhs) - confirm */
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbsv_v2(cublasHandle_t handle, /* 'D' variant: A and x are double */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k, /* extra vs. trsv; presumably the band width (BLAS convention) - confirm */
+                                                     const double* A, /* const: read-only input */
+                                                     int lda,
+                                                     double* x, /* sole non-const buffer; presumably in/out (no separate rhs) - confirm */
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbsv_v2(cublasHandle_t handle, /* 'C' variant: A and x are cuComplex */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k, /* extra vs. trsv; presumably the band width (BLAS convention) - confirm */
+                                                     const cuComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     cuComplex* x, /* sole non-const buffer; presumably in/out (no separate rhs) - confirm */
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbsv_v2(cublasHandle_t handle, /* 'Z' variant: A and x are cuDoubleComplex */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k, /* extra vs. trsv; presumably the band width (BLAS convention) - confirm */
+                                                     const cuDoubleComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     cuDoubleComplex* x, /* sole non-const buffer; presumably in/out (no separate rhs) - confirm */
+                                                     int incx);
+
+/* SYMV/HEMV: symmetric / Hermitian matrix-vector multiply */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymv_v2(cublasHandle_t handle, /* 'S' variant: all data operands are float */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* A, /* const: read-only input */
+                                                     int lda,
+                                                     const float* x, /* const: read-only input */
+                                                     int incx,
+                                                     const float* beta, /* host or device pointer */
+                                                     float* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymv_v2(cublasHandle_t handle, /* 'D' variant: all data operands are double */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* A, /* const: read-only input */
+                                                     int lda,
+                                                     const double* x, /* const: read-only input */
+                                                     int incx,
+                                                     const double* beta, /* host or device pointer */
+                                                     double* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymv_v2(cublasHandle_t handle, /* 'C' variant: all data operands are cuComplex */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     const cuComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuComplex* beta, /* host or device pointer */
+                                                     cuComplex* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymv_v2(cublasHandle_t handle, /* 'Z' variant: all data operands are cuDoubleComplex */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     const cuDoubleComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuDoubleComplex* beta, /* host or device pointer */
+                                                     cuDoubleComplex* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemv_v2(cublasHandle_t handle, /* cuComplex operands; parameter list matches cublasCsymv_v2 */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     const cuComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuComplex* beta, /* host or device pointer */
+                                                     cuComplex* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemv_v2(cublasHandle_t handle, /* cuDoubleComplex operands; parameter list matches cublasZsymv_v2 */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     const cuDoubleComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuDoubleComplex* beta, /* host or device pointer */
+                                                     cuDoubleComplex* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+/* SBMV/HBMV: symmetric / Hermitian banded matrix-vector multiply */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsbmv_v2(cublasHandle_t handle, /* 'S' variant: all data operands are float */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     int k, /* extra vs. symv; presumably the band width (BLAS convention) - confirm */
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* A, /* const: read-only input */
+                                                     int lda,
+                                                     const float* x, /* const: read-only input */
+                                                     int incx,
+                                                     const float* beta, /* host or device pointer */
+                                                     float* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsbmv_v2(cublasHandle_t handle, /* 'D' variant: all data operands are double */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     int k, /* extra vs. symv; presumably the band width (BLAS convention) - confirm */
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* A, /* const: read-only input */
+                                                     int lda,
+                                                     const double* x, /* const: read-only input */
+                                                     int incx,
+                                                     const double* beta, /* host or device pointer */
+                                                     double* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChbmv_v2(cublasHandle_t handle, /* 'C' variant: all data operands are cuComplex */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     int k, /* extra vs. hemv; presumably the band width (BLAS convention) - confirm */
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     const cuComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuComplex* beta, /* host or device pointer */
+                                                     cuComplex* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhbmv_v2(cublasHandle_t handle, /* 'Z' variant: all data operands are cuDoubleComplex */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     int k, /* extra vs. hemv; presumably the band width (BLAS convention) - confirm */
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     const cuDoubleComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuDoubleComplex* beta, /* host or device pointer */
+                                                     cuDoubleComplex* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+/* SPMV/HPMV: symmetric / Hermitian packed matrix-vector multiply */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspmv_v2(cublasHandle_t handle, /* 'S' variant: all data operands are float */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* AP, /* const input; no lda follows it, unlike A in the symv signatures */
+                                                     const float* x, /* const: read-only input */
+                                                     int incx,
+                                                     const float* beta, /* host or device pointer */
+                                                     float* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspmv_v2(cublasHandle_t handle, /* 'D' variant: all data operands are double */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* AP, /* const input; no lda follows it, unlike A in the symv signatures */
+                                                     const double* x, /* const: read-only input */
+                                                     int incx,
+                                                     const double* beta, /* host or device pointer */
+                                                     double* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpmv_v2(cublasHandle_t handle, /* 'C' variant: all data operands are cuComplex */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* AP, /* const input; no lda follows it, unlike A in the hemv signatures */
+                                                     const cuComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuComplex* beta, /* host or device pointer */
+                                                     cuComplex* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpmv_v2(cublasHandle_t handle, /* 'Z' variant: all data operands are cuDoubleComplex */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* AP, /* const input; no lda follows it, unlike A in the hemv signatures */
+                                                     const cuDoubleComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuDoubleComplex* beta, /* host or device pointer */
+                                                     cuDoubleComplex* y, /* sole non-const buffer: the only operand that can be written */
+                                                     int incy);
+
+/* GER: general rank-1 update (GERU/GERC for the complex types) */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSger_v2(cublasHandle_t handle, /* 'S' variant: all data operands are float */
+                                                    int m,
+                                                    int n,
+                                                    const float* alpha, /* host or device pointer */
+                                                    const float* x, /* const: read-only input */
+                                                    int incx,
+                                                    const float* y, /* const: read-only input */
+                                                    int incy,
+                                                    float* A, /* sole non-const buffer: the only operand that can be written */
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDger_v2(cublasHandle_t handle, /* 'D' variant: all data operands are double */
+                                                    int m,
+                                                    int n,
+                                                    const double* alpha, /* host or device pointer */
+                                                    const double* x, /* const: read-only input */
+                                                    int incx,
+                                                    const double* y, /* const: read-only input */
+                                                    int incy,
+                                                    double* A, /* sole non-const buffer: the only operand that can be written */
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeru_v2(cublasHandle_t handle, /* cuComplex operands; 'u' presumably = unconjugated update - confirm */
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuComplex* y, /* const: read-only input */
+                                                     int incy,
+                                                     cuComplex* A, /* sole non-const buffer: the only operand that can be written */
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgerc_v2(cublasHandle_t handle, /* same parameter list as cublasCgeru_v2; 'c' presumably = conjugated - confirm */
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuComplex* y, /* const: read-only input */
+                                                     int incy,
+                                                     cuComplex* A, /* sole non-const buffer: the only operand that can be written */
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeru_v2(cublasHandle_t handle, /* cuDoubleComplex operands; 'u' presumably = unconjugated update - confirm */
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuDoubleComplex* y, /* const: read-only input */
+                                                     int incy,
+                                                     cuDoubleComplex* A, /* sole non-const buffer: the only operand that can be written */
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgerc_v2(cublasHandle_t handle, /* same parameter list as cublasZgeru_v2; 'c' presumably = conjugated - confirm */
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuDoubleComplex* y, /* const: read-only input */
+                                                     int incy,
+                                                     cuDoubleComplex* A, /* sole non-const buffer: the only operand that can be written */
+                                                     int lda);
+
+/* SYR/HER: symmetric / Hermitian rank-1 update */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr_v2(cublasHandle_t handle, /* 'S' variant: all data operands are float */
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const float* alpha, /* host or device pointer */
+                                                    const float* x, /* const: read-only input */
+                                                    int incx,
+                                                    float* A, /* sole non-const buffer: the only operand that can be written */
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr_v2(cublasHandle_t handle, /* 'D' variant: all data operands are double */
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const double* alpha, /* host or device pointer */
+                                                    const double* x, /* const: read-only input */
+                                                    int incx,
+                                                    double* A, /* sole non-const buffer: the only operand that can be written */
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr_v2(cublasHandle_t handle, /* 'C' variant: all data operands, including alpha, are cuComplex */
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const cuComplex* alpha, /* host or device pointer */
+                                                    const cuComplex* x, /* const: read-only input */
+                                                    int incx,
+                                                    cuComplex* A, /* sole non-const buffer: the only operand that can be written */
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr_v2(cublasHandle_t handle, /* 'Z' variant: all data operands, including alpha, are cuDoubleComplex */
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const cuDoubleComplex* alpha, /* host or device pointer */
+                                                    const cuDoubleComplex* x, /* const: read-only input */
+                                                    int incx,
+                                                    cuDoubleComplex* A, /* sole non-const buffer: the only operand that can be written */
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher_v2(cublasHandle_t handle, /* x and A are cuComplex, but alpha is a real float (unlike cublasCsyr_v2) */
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const float* alpha, /* host or device pointer */
+                                                    const cuComplex* x, /* const: read-only input */
+                                                    int incx,
+                                                    cuComplex* A, /* sole non-const buffer: the only operand that can be written */
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher_v2(cublasHandle_t handle, /* x and A are cuDoubleComplex, but alpha is a real double (unlike cublasZsyr_v2) */
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const double* alpha, /* host or device pointer */
+                                                    const cuDoubleComplex* x, /* const: read-only input */
+                                                    int incx,
+                                                    cuDoubleComplex* A, /* sole non-const buffer: the only operand that can be written */
+                                                    int lda);
+
+/* SPR/HPR: symmetric / Hermitian packed rank-1 update */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr_v2(cublasHandle_t handle, /* 'S' variant: all data operands are float */
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const float* alpha, /* host or device pointer */
+                                                    const float* x, /* const: read-only input */
+                                                    int incx,
+                                                    float* AP); /* sole non-const buffer; no lda follows it, unlike A in cublasSsyr_v2 */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr_v2(cublasHandle_t handle, /* 'D' variant: all data operands are double */
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const double* alpha, /* host or device pointer */
+                                                    const double* x, /* const: read-only input */
+                                                    int incx,
+                                                    double* AP); /* sole non-const buffer; no lda follows it, unlike A in cublasDsyr_v2 */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr_v2(cublasHandle_t handle, /* x and AP are cuComplex, but alpha is a real float */
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const float* alpha, /* host or device pointer */
+                                                    const cuComplex* x, /* const: read-only input */
+                                                    int incx,
+                                                    cuComplex* AP); /* sole non-const buffer; no lda follows it, unlike A in cublasCher_v2 */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr_v2(cublasHandle_t handle, /* x and AP are cuDoubleComplex, but alpha is a real double */
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const double* alpha, /* host or device pointer */
+                                                    const cuDoubleComplex* x, /* const: read-only input */
+                                                    int incx,
+                                                    cuDoubleComplex* AP); /* sole non-const buffer; no lda follows it, unlike A in cublasZher_v2 */
+
+/* SYR2/HER2: symmetric / Hermitian rank-2 update */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2_v2(cublasHandle_t handle, /* 'S' variant: all data operands are float */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* x, /* const: read-only input */
+                                                     int incx,
+                                                     const float* y, /* const: read-only input */
+                                                     int incy,
+                                                     float* A, /* sole non-const buffer: the only operand that can be written */
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2_v2(cublasHandle_t handle, /* 'D' variant: all data operands are double */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* x, /* const: read-only input */
+                                                     int incx,
+                                                     const double* y, /* const: read-only input */
+                                                     int incy,
+                                                     double* A, /* sole non-const buffer: the only operand that can be written */
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2_v2(cublasHandle_t handle, /* 'C' variant: all data operands are cuComplex */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuComplex* y, /* const: read-only input */
+                                                     int incy,
+                                                     cuComplex* A, /* sole non-const buffer: the only operand that can be written */
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2_v2(cublasHandle_t handle, /* 'Z' variant: all data operands are cuDoubleComplex */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuDoubleComplex* y, /* const: read-only input */
+                                                     int incy,
+                                                     cuDoubleComplex* A, /* sole non-const buffer: the only operand that can be written */
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2_v2(cublasHandle_t handle, /* cuComplex operands; alpha is cuComplex here (cublasCher_v2 takes a float) */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuComplex* y, /* const: read-only input */
+                                                     int incy,
+                                                     cuComplex* A, /* sole non-const buffer: the only operand that can be written */
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2_v2(cublasHandle_t handle, /* cuDoubleComplex operands; alpha is complex here (cublasZher_v2 takes a double) */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuDoubleComplex* y, /* const: read-only input */
+                                                     int incy,
+                                                     cuDoubleComplex* A, /* sole non-const buffer: the only operand that can be written */
+                                                     int lda);
+
+/* SPR2/HPR2: symmetric / Hermitian packed rank-2 update */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr2_v2(cublasHandle_t handle, /* 'S' variant: all data operands are float */
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* x, /* const: read-only input */
+                                                     int incx,
+                                                     const float* y, /* const: read-only input */
+                                                     int incy,
+                                                     float* AP); /* sole non-const buffer; no lda follows it, unlike A in cublasSsyr2_v2 */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* x,
+                                                     int incx,
+                                                     const double* y,
+                                                     int incy,
+                                                     double* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     const cuComplex* y,
+                                                     int incy,
+                                                     cuComplex* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* y,
+                                                     int incy,
+                                                     cuDoubleComplex* AP);
+/* BATCH GEMV: A, x and y are passed as arrays of pointers (Aarray/xarray/yarray); presumably batchCount entries each - confirm against the cuBLAS docs. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         const float* alpha, /* host or device pointer */
+                                                         const float* const Aarray[],
+                                                         int lda,
+                                                         const float* const xarray[],
+                                                         int incx,
+                                                         const float* beta, /* host or device pointer */
+                                                         float* const yarray[],
+                                                         int incy,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         const double* alpha, /* host or device pointer */
+                                                         const double* const Aarray[],
+                                                         int lda,
+                                                         const double* const xarray[],
+                                                         int incx,
+                                                         const double* beta, /* host or device pointer */
+                                                         double* const yarray[],
+                                                         int incy,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         const cuComplex* alpha, /* host or device pointer */
+                                                         const cuComplex* const Aarray[],
+                                                         int lda,
+                                                         const cuComplex* const xarray[],
+                                                         int incx,
+                                                         const cuComplex* beta, /* host or device pointer */
+                                                         cuComplex* const yarray[],
+                                                         int incy,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemvBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         const cuDoubleComplex* alpha, /* host or device pointer */
+                                                         const cuDoubleComplex* const Aarray[],
+                                                         int lda,
+                                                         const cuDoubleComplex* const xarray[],
+                                                         int incx,
+                                                         const cuDoubleComplex* beta, /* host or device pointer */
+                                                         cuDoubleComplex* const yarray[],
+                                                         int incy,
+                                                         int batchCount);
+
+#if defined(__cplusplus) /* C++ only: __half (HSH/HSS) and __nv_bfloat16 (TST/TSS) batched GEMV; alpha/beta are float in all four */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvBatched(cublasHandle_t handle,
+                                                           cublasOperation_t trans,
+                                                           int m,
+                                                           int n,
+                                                           const float* alpha, /* host or device pointer */
+                                                           const __half* const Aarray[],
+                                                           int lda,
+                                                           const __half* const xarray[],
+                                                           int incx,
+                                                           const float* beta, /* host or device pointer */
+                                                           __half* const yarray[],
+                                                           int incy,
+                                                           int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvBatched(cublasHandle_t handle,
+                                                           cublasOperation_t trans,
+                                                           int m,
+                                                           int n,
+                                                           const float* alpha, /* host or device pointer */
+                                                           const __half* const Aarray[],
+                                                           int lda,
+                                                           const __half* const xarray[],
+                                                           int incx,
+                                                           const float* beta, /* host or device pointer */
+                                                           float* const yarray[],
+                                                           int incy,
+                                                           int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvBatched(cublasHandle_t handle,
+                                                           cublasOperation_t trans,
+                                                           int m,
+                                                           int n,
+                                                           const float* alpha, /* host or device pointer */
+                                                           const __nv_bfloat16* const Aarray[],
+                                                           int lda,
+                                                           const __nv_bfloat16* const xarray[],
+                                                           int incx,
+                                                           const float* beta, /* host or device pointer */
+                                                           __nv_bfloat16* const yarray[],
+                                                           int incy,
+                                                           int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvBatched(cublasHandle_t handle,
+                                                           cublasOperation_t trans,
+                                                           int m,
+                                                           int n,
+                                                           const float* alpha, /* host or device pointer */
+                                                           const __nv_bfloat16* const Aarray[],
+                                                           int lda,
+                                                           const __nv_bfloat16* const xarray[],
+                                                           int incx,
+                                                           const float* beta, /* host or device pointer */
+                                                           float* const yarray[],
+                                                           int incy,
+                                                           int batchCount);
+#endif /* defined(__cplusplus) */
+/* Strided-batched GEMV: A, x and y are single base pointers, each paired with its own long long int stride (strideA/stridex/stridey). */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvStridedBatched(cublasHandle_t handle,
+                                                                cublasOperation_t trans,
+                                                                int m,
+                                                                int n,
+                                                                const float* alpha, /* host or device pointer */
+                                                                const float* A,
+                                                                int lda,
+                                                                long long int strideA, /* purposely signed */
+                                                                const float* x,
+                                                                int incx,
+                                                                long long int stridex,
+                                                                const float* beta, /* host or device pointer */
+                                                                float* y,
+                                                                int incy,
+                                                                long long int stridey,
+                                                                int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvStridedBatched(cublasHandle_t handle,
+                                                                cublasOperation_t trans,
+                                                                int m,
+                                                                int n,
+                                                                const double* alpha, /* host or device pointer */
+                                                                const double* A,
+                                                                int lda,
+                                                                long long int strideA, /* purposely signed */
+                                                                const double* x,
+                                                                int incx,
+                                                                long long int stridex,
+                                                                const double* beta, /* host or device pointer */
+                                                                double* y,
+                                                                int incy,
+                                                                long long int stridey,
+                                                                int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvStridedBatched(cublasHandle_t handle,
+                                                                cublasOperation_t trans,
+                                                                int m,
+                                                                int n,
+                                                                const cuComplex* alpha, /* host or device pointer */
+                                                                const cuComplex* A,
+                                                                int lda,
+                                                                long long int strideA, /* purposely signed */
+                                                                const cuComplex* x,
+                                                                int incx,
+                                                                long long int stridex,
+                                                                const cuComplex* beta, /* host or device pointer */
+                                                                cuComplex* y,
+                                                                int incy,
+                                                                long long int stridey,
+                                                                int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasZgemvStridedBatched(cublasHandle_t handle,
+                          cublasOperation_t trans,
+                          int m,
+                          int n,
+                          const cuDoubleComplex* alpha, /* host or device pointer */
+                          const cuDoubleComplex* A,
+                          int lda,
+                          long long int strideA, /* purposely signed */
+                          const cuDoubleComplex* x,
+                          int incx,
+                          long long int stridex,
+                          const cuDoubleComplex* beta, /* host or device pointer */
+                          cuDoubleComplex* y,
+                          int incy,
+                          long long int stridey,
+                          int batchCount);
+
+#if defined(__cplusplus) /* C++ only: __half (HSH/HSS) and __nv_bfloat16 (TST/TSS) strided-batched GEMV; alpha/beta are float in all four */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvStridedBatched(cublasHandle_t handle,
+                                                                  cublasOperation_t trans,
+                                                                  int m,
+                                                                  int n,
+                                                                  const float* alpha, /* host or device pointer */
+                                                                  const __half* A,
+                                                                  int lda,
+                                                                  long long int strideA, /* purposely signed */
+                                                                  const __half* x,
+                                                                  int incx,
+                                                                  long long int stridex,
+                                                                  const float* beta, /* host or device pointer */
+                                                                  __half* y,
+                                                                  int incy,
+                                                                  long long int stridey,
+                                                                  int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvStridedBatched(cublasHandle_t handle,
+                                                                  cublasOperation_t trans,
+                                                                  int m,
+                                                                  int n,
+                                                                  const float* alpha, /* host or device pointer */
+                                                                  const __half* A,
+                                                                  int lda,
+                                                                  long long int strideA, /* purposely signed */
+                                                                  const __half* x,
+                                                                  int incx,
+                                                                  long long int stridex,
+                                                                  const float* beta, /* host or device pointer */
+                                                                  float* y,
+                                                                  int incy,
+                                                                  long long int stridey,
+                                                                  int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvStridedBatched(cublasHandle_t handle,
+                                                                  cublasOperation_t trans,
+                                                                  int m,
+                                                                  int n,
+                                                                  const float* alpha, /* host or device pointer */
+                                                                  const __nv_bfloat16* A,
+                                                                  int lda,
+                                                                  long long int strideA, /* purposely signed */
+                                                                  const __nv_bfloat16* x,
+                                                                  int incx,
+                                                                  long long int stridex,
+                                                                  const float* beta, /* host or device pointer */
+                                                                  __nv_bfloat16* y,
+                                                                  int incy,
+                                                                  long long int stridey,
+                                                                  int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvStridedBatched(cublasHandle_t handle,
+                                                                  cublasOperation_t trans,
+                                                                  int m,
+                                                                  int n,
+                                                                  const float* alpha, /* host or device pointer */
+                                                                  const __nv_bfloat16* A,
+                                                                  int lda,
+                                                                  long long int strideA, /* purposely signed */
+                                                                  const __nv_bfloat16* x,
+                                                                  int incx,
+                                                                  long long int stridex,
+                                                                  const float* beta, /* host or device pointer */
+                                                                  float* y,
+                                                                  int incy,
+                                                                  long long int stridey,
+                                                                  int batchCount);
+#endif /* defined(__cplusplus) */
+/* ---------------- CUBLAS BLAS3 functions ---------------- */
+
+/* GEMM: _v2 entry points for float, double and cuComplex. alpha, A, B and beta are const; C is the only non-const data pointer. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2(cublasHandle_t handle,
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int m,
+                                                     int n,
+                                                     int k,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* A,
+                                                     int lda,
+                                                     const float* B,
+                                                     int ldb,
+                                                     const float* beta, /* host or device pointer */
+                                                     float* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemm_v2(cublasHandle_t handle,
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int m,
+                                                     int n,
+                                                     int k,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* A,
+                                                     int lda,
+                                                     const double* B,
+                                                     int ldb,
+                                                     const double* beta, /* host or device pointer */
+                                                     double* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm_v2(cublasHandle_t handle,
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int m,
+                                                     int n,
+                                                     int k,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const cuComplex* B,
+                                                     int ldb,
+                                                     const cuComplex* beta, /* host or device pointer */
+                                                     cuComplex* C,
+                                                     int ldc);
+/* cuComplex gemm3m: cublasCgemm3m takes the same argument list as cublasCgemm_v2; cublasCgemm3mEx takes void* A/B/C, each with a cudaDataType argument. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3m(cublasHandle_t handle,
+                                                    cublasOperation_t transa,
+                                                    cublasOperation_t transb,
+                                                    int m,
+                                                    int n,
+                                                    int k,
+                                                    const cuComplex* alpha, /* host or device pointer */
+                                                    const cuComplex* A,
+                                                    int lda,
+                                                    const cuComplex* B,
+                                                    int ldb,
+                                                    const cuComplex* beta, /* host or device pointer */
+                                                    cuComplex* C,
+                                                    int ldc);
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mEx(cublasHandle_t handle,
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb,
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex* alpha, /* host or device pointer */
+                                                      const void* A,
+                                                      cudaDataType Atype,
+                                                      int lda,
+                                                      const void* B,
+                                                      cudaDataType Btype,
+                                                      int ldb,
+                                                      const cuComplex* beta, /* host or device pointer */
+                                                      void* C,
+                                                      cudaDataType Ctype,
+                                                      int ldc);
+/* cuDoubleComplex: cublasZgemm_v2 and cublasZgemm3m take the same argument list. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm_v2(cublasHandle_t handle,
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int m,
+                                                     int n,
+                                                     int k,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const cuDoubleComplex* B,
+                                                     int ldb,
+                                                     const cuDoubleComplex* beta, /* host or device pointer */
+                                                     cuDoubleComplex* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm3m(cublasHandle_t handle,
+                                                    cublasOperation_t transa,
+                                                    cublasOperation_t transb,
+                                                    int m,
+                                                    int n,
+                                                    int k,
+                                                    const cuDoubleComplex* alpha, /* host or device pointer */
+                                                    const cuDoubleComplex* A,
+                                                    int lda,
+                                                    const cuDoubleComplex* B,
+                                                    int ldb,
+                                                    const cuDoubleComplex* beta, /* host or device pointer */
+                                                    cuDoubleComplex* C,
+                                                    int ldc);
+
+#if defined(__cplusplus) /* C++ only: GEMM with __half operands and __half alpha/beta */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemm(cublasHandle_t handle,
+                                                  cublasOperation_t transa,
+                                                  cublasOperation_t transb,
+                                                  int m,
+                                                  int n,
+                                                  int k,
+                                                  const __half* alpha, /* host or device pointer */
+                                                  const __half* A,
+                                                  int lda,
+                                                  const __half* B,
+                                                  int ldb,
+                                                  const __half* beta, /* host or device pointer */
+                                                  __half* C,
+                                                  int ldc);
+#endif /* defined(__cplusplus) */
+/* IO in FP16/FP32, computation in float. A, B and C are void*, each accompanied by a cudaDataType argument (Atype/Btype/Ctype). */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmEx(cublasHandle_t handle,
+                                                    cublasOperation_t transa,
+                                                    cublasOperation_t transb,
+                                                    int m,
+                                                    int n,
+                                                    int k,
+                                                    const float* alpha, /* host or device pointer */
+                                                    const void* A,
+                                                    cudaDataType Atype,
+                                                    int lda,
+                                                    const void* B,
+                                                    cudaDataType Btype,
+                                                    int ldb,
+                                                    const float* beta, /* host or device pointer */
+                                                    void* C,
+                                                    cudaDataType Ctype,
+                                                    int ldc);
+/* cublasGemmEx: alpha and beta are void* too, and the compute type and algorithm are explicit arguments (computeType, algo). */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmEx(cublasHandle_t handle,
+                                                   cublasOperation_t transa,
+                                                   cublasOperation_t transb,
+                                                   int m,
+                                                   int n,
+                                                   int k,
+                                                   const void* alpha, /* host or device pointer */
+                                                   const void* A,
+                                                   cudaDataType Atype,
+                                                   int lda,
+                                                   const void* B,
+                                                   cudaDataType Btype,
+                                                   int ldb,
+                                                   const void* beta, /* host or device pointer */
+                                                   void* C,
+                                                   cudaDataType Ctype,
+                                                   int ldc,
+                                                   cublasComputeType_t computeType,
+                                                   cublasGemmAlgo_t algo);
+
+/* IO in Int8 complex/cuComplex, computation in cuComplex */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmEx(cublasHandle_t handle,
+                                                    cublasOperation_t transa,
+                                                    cublasOperation_t transb,
+                                                    int m,
+                                                    int n,
+                                                    int k,
+                                                    const cuComplex* alpha, /* host or device pointer */
+                                                    const void* A,
+                                                    cudaDataType Atype, /* element type of A */
+                                                    int lda,
+                                                    const void* B,
+                                                    cudaDataType Btype, /* element type of B */
+                                                    int ldb,
+                                                    const cuComplex* beta, /* host or device pointer */
+                                                    void* C,
+                                                    cudaDataType Ctype, /* element type of C */
+                                                    int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasUint8gemmBias(cublasHandle_t handle, /* NOTE(review): judging by the parameters, an unsigned 8-bit GEMM with a per-matrix bias and C_mult/C_shift output scaling; exact semantics are not visible here - confirm before use */
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb,
+                                                          cublasOperation_t transc,
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const unsigned char* A,
+                                                          int A_bias,
+                                                          int lda,
+                                                          const unsigned char* B,
+                                                          int B_bias,
+                                                          int ldb,
+                                                          unsigned char* C,
+                                                          int C_bias,
+                                                          int ldc,
+                                                          int C_mult,
+                                                          int C_shift);
+
+/* SYRK: standard BLAS symmetric rank-k update, C = alpha*op(A)*op(A)^T + beta*C */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrk_v2(cublasHandle_t handle, /* SYRK, single-precision real (float) */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     int n,
+                                                     int k,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* A,
+                                                     int lda,
+                                                     const float* beta, /* host or device pointer */
+                                                     float* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrk_v2(cublasHandle_t handle, /* SYRK, double-precision real (double) */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     int n,
+                                                     int k,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* A,
+                                                     int lda,
+                                                     const double* beta, /* host or device pointer */
+                                                     double* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk_v2(cublasHandle_t handle, /* SYRK, single-precision complex (cuComplex) */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     int n,
+                                                     int k,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const cuComplex* beta, /* host or device pointer */
+                                                     cuComplex* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrk_v2(cublasHandle_t handle, /* SYRK, double-precision complex (cuDoubleComplex) */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     int n,
+                                                     int k,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const cuDoubleComplex* beta, /* host or device pointer */
+                                                     cuDoubleComplex* C,
+                                                     int ldc);
+/* SYRK with IO in Int8 complex/cuComplex (element types given by Atype/Ctype), computation in cuComplex */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkEx(cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const cuComplex* alpha, /* host or device pointer */
+                                                    const void* A,
+                                                    cudaDataType Atype, /* element type of A */
+                                                    int lda,
+                                                    const cuComplex* beta, /* host or device pointer */
+                                                    void* C,
+                                                    cudaDataType Ctype, /* element type of C */
+                                                    int ldc);
+
+/* IO in Int8 complex/cuComplex, computation in cuComplex, Gaussian math */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex* alpha, /* host or device pointer */
+                                                      const void* A,
+                                                      cudaDataType Atype, /* element type of A */
+                                                      int lda,
+                                                      const cuComplex* beta, /* host or device pointer */
+                                                      void* C,
+                                                      cudaDataType Ctype, /* element type of C */
+                                                      int ldc);
+
+/* HERK: standard BLAS Hermitian rank-k update, C = alpha*op(A)*op(A)^H + beta*C (alpha and beta are real) */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk_v2(cublasHandle_t handle, /* HERK, single-precision complex matrices with float scalars */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     int n,
+                                                     int k,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const float* beta, /* host or device pointer */
+                                                     cuComplex* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherk_v2(cublasHandle_t handle, /* HERK, double-precision complex matrices with double scalars */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     int n,
+                                                     int k,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const double* beta, /* host or device pointer */
+                                                     cuDoubleComplex* C,
+                                                     int ldc);
+
+/* HERK with IO in Int8 complex/cuComplex (element types given by Atype/Ctype), computation in cuComplex */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkEx(cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const float* alpha, /* host or device pointer */
+                                                    const void* A,
+                                                    cudaDataType Atype, /* element type of A */
+                                                    int lda,
+                                                    const float* beta, /* host or device pointer */
+                                                    void* C,
+                                                    cudaDataType Ctype, /* element type of C */
+                                                    int ldc);
+
+/* IO in Int8 complex/cuComplex, computation in cuComplex, Gaussian math */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk3mEx(cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const float* alpha, /* host or device pointer */
+                                                      const void* A,
+                                                      cudaDataType Atype, /* element type of A */
+                                                      int lda,
+                                                      const float* beta, /* host or device pointer */
+                                                      void* C,
+                                                      cudaDataType Ctype, /* element type of C */
+                                                      int ldc);
+
+/* SYR2K: standard BLAS symmetric rank-2k update, C = alpha*(op(A)*op(B)^T + op(B)*op(A)^T) + beta*C */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2(cublasHandle_t handle, /* SYR2K, single-precision real (float) */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const float* alpha, /* host or device pointer */
+                                                      const float* A,
+                                                      int lda,
+                                                      const float* B,
+                                                      int ldb,
+                                                      const float* beta, /* host or device pointer */
+                                                      float* C,
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2(cublasHandle_t handle, /* SYR2K, double-precision real (double) */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const double* alpha, /* host or device pointer */
+                                                      const double* A,
+                                                      int lda,
+                                                      const double* B,
+                                                      int ldb,
+                                                      const double* beta, /* host or device pointer */
+                                                      double* C,
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2(cublasHandle_t handle, /* SYR2K, single-precision complex (cuComplex) */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex* alpha, /* host or device pointer */
+                                                      const cuComplex* A,
+                                                      int lda,
+                                                      const cuComplex* B,
+                                                      int ldb,
+                                                      const cuComplex* beta, /* host or device pointer */
+                                                      cuComplex* C,
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2(cublasHandle_t handle, /* SYR2K, double-precision complex (cuDoubleComplex) */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const cuDoubleComplex* alpha, /* host or device pointer */
+                                                      const cuDoubleComplex* A,
+                                                      int lda,
+                                                      const cuDoubleComplex* B,
+                                                      int ldb,
+                                                      const cuDoubleComplex* beta, /* host or device pointer */
+                                                      cuDoubleComplex* C,
+                                                      int ldc);
+/* HER2K: standard BLAS Hermitian rank-2k update, C = alpha*op(A)*op(B)^H + conj(alpha)*op(B)*op(A)^H + beta*C (beta is real) */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2k_v2(cublasHandle_t handle, /* HER2K, single-precision complex; alpha is complex, beta is float */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex* alpha, /* host or device pointer */
+                                                      const cuComplex* A,
+                                                      int lda,
+                                                      const cuComplex* B,
+                                                      int ldb,
+                                                      const float* beta, /* host or device pointer */
+                                                      cuComplex* C,
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2k_v2(cublasHandle_t handle, /* HER2K, double-precision complex; alpha is complex, beta is double */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const cuDoubleComplex* alpha, /* host or device pointer */
+                                                      const cuDoubleComplex* A,
+                                                      int lda,
+                                                      const cuDoubleComplex* B,
+                                                      int ldb,
+                                                      const double* beta, /* host or device pointer */
+                                                      cuDoubleComplex* C,
+                                                      int ldc);
+/* SYRKX : eXtended SYRK, C = alpha*op(A)*op(B)^T + beta*C */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrkx(cublasHandle_t handle, /* SYRKX, single-precision real (float) */
+                                                   cublasFillMode_t uplo,
+                                                   cublasOperation_t trans,
+                                                   int n,
+                                                   int k,
+                                                   const float* alpha, /* host or device pointer */
+                                                   const float* A,
+                                                   int lda,
+                                                   const float* B,
+                                                   int ldb,
+                                                   const float* beta, /* host or device pointer */
+                                                   float* C,
+                                                   int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrkx(cublasHandle_t handle, /* SYRKX, double-precision real (double) */
+                                                   cublasFillMode_t uplo,
+                                                   cublasOperation_t trans,
+                                                   int n,
+                                                   int k,
+                                                   const double* alpha, /* host or device pointer */
+                                                   const double* A,
+                                                   int lda,
+                                                   const double* B,
+                                                   int ldb,
+                                                   const double* beta, /* host or device pointer */
+                                                   double* C,
+                                                   int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkx(cublasHandle_t handle, /* SYRKX, single-precision complex (cuComplex) */
+                                                   cublasFillMode_t uplo,
+                                                   cublasOperation_t trans,
+                                                   int n,
+                                                   int k,
+                                                   const cuComplex* alpha, /* host or device pointer */
+                                                   const cuComplex* A,
+                                                   int lda,
+                                                   const cuComplex* B,
+                                                   int ldb,
+                                                   const cuComplex* beta, /* host or device pointer */
+                                                   cuComplex* C,
+                                                   int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrkx(cublasHandle_t handle, /* SYRKX, double-precision complex (cuDoubleComplex) */
+                                                   cublasFillMode_t uplo,
+                                                   cublasOperation_t trans,
+                                                   int n,
+                                                   int k,
+                                                   const cuDoubleComplex* alpha, /* host or device pointer */
+                                                   const cuDoubleComplex* A,
+                                                   int lda,
+                                                   const cuDoubleComplex* B,
+                                                   int ldb,
+                                                   const cuDoubleComplex* beta, /* host or device pointer */
+                                                   cuDoubleComplex* C,
+                                                   int ldc);
+/* HERKX : eXtended HERK, C = alpha*op(A)*op(B)^H + beta*C (beta is real) */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkx(cublasHandle_t handle, /* HERKX, single-precision complex; alpha is complex, beta is float */
+                                                   cublasFillMode_t uplo,
+                                                   cublasOperation_t trans,
+                                                   int n,
+                                                   int k,
+                                                   const cuComplex* alpha, /* host or device pointer */
+                                                   const cuComplex* A,
+                                                   int lda,
+                                                   const cuComplex* B,
+                                                   int ldb,
+                                                   const float* beta, /* host or device pointer */
+                                                   cuComplex* C,
+                                                   int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherkx(cublasHandle_t handle, /* HERKX, double-precision complex; alpha is complex, beta is double */
+                                                   cublasFillMode_t uplo,
+                                                   cublasOperation_t trans,
+                                                   int n,
+                                                   int k,
+                                                   const cuDoubleComplex* alpha, /* host or device pointer */
+                                                   const cuDoubleComplex* A,
+                                                   int lda,
+                                                   const cuDoubleComplex* B,
+                                                   int ldb,
+                                                   const double* beta, /* host or device pointer */
+                                                   cuDoubleComplex* C,
+                                                   int ldc);
+/* SYMM: standard BLAS symmetric matrix-matrix multiply, C = alpha*A*B + beta*C (left side) or C = alpha*B*A + beta*C (right side) */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymm_v2(cublasHandle_t handle, /* SYMM, single-precision real (float) */
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     int m,
+                                                     int n,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* A,
+                                                     int lda,
+                                                     const float* B,
+                                                     int ldb,
+                                                     const float* beta, /* host or device pointer */
+                                                     float* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymm_v2(cublasHandle_t handle, /* SYMM, double-precision real (double) */
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     int m,
+                                                     int n,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* A,
+                                                     int lda,
+                                                     const double* B,
+                                                     int ldb,
+                                                     const double* beta, /* host or device pointer */
+                                                     double* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymm_v2(cublasHandle_t handle, /* SYMM, single-precision complex (cuComplex) */
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const cuComplex* B,
+                                                     int ldb,
+                                                     const cuComplex* beta, /* host or device pointer */
+                                                     cuComplex* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymm_v2(cublasHandle_t handle, /* SYMM, double-precision complex (cuDoubleComplex) */
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const cuDoubleComplex* B,
+                                                     int ldb,
+                                                     const cuDoubleComplex* beta, /* host or device pointer */
+                                                     cuDoubleComplex* C,
+                                                     int ldc);
+
+/* HEMM: standard BLAS Hermitian matrix-matrix multiply, C = alpha*A*B + beta*C (left side) or C = alpha*B*A + beta*C (right side) */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemm_v2(cublasHandle_t handle, /* HEMM, single-precision complex (cuComplex) */
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const cuComplex* B,
+                                                     int ldb,
+                                                     const cuComplex* beta, /* host or device pointer */
+                                                     cuComplex* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemm_v2(cublasHandle_t handle, /* HEMM, double-precision complex (cuDoubleComplex) */
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const cuDoubleComplex* B,
+                                                     int ldb,
+                                                     const cuDoubleComplex* beta, /* host or device pointer */
+                                                     cuDoubleComplex* C,
+                                                     int ldc);
+
+/* TRSM: standard BLAS triangular solve with multiple right-hand sides, op(A)*X = alpha*B (left side) or X*op(A) = alpha*B (right side); X overwrites B */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsm_v2(cublasHandle_t handle, /* TRSM, single-precision real (float) */
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* A,
+                                                     int lda,
+                                                     float* B,
+                                                     int ldb);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsm_v2(cublasHandle_t handle, /* TRSM, double-precision real (double) */
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* A,
+                                                     int lda,
+                                                     double* B,
+                                                     int ldb);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(cublasHandle_t handle, /* TRSM, single-precision complex (cuComplex) */
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     cuComplex* B,
+                                                     int ldb);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(cublasHandle_t handle,
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     cuDoubleComplex* B,
+                                                     int ldb);
+
+/* TRMM: triangular matrix-matrix multiplication */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmm_v2(cublasHandle_t handle,
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const float* alpha, /* host or device pointer */
+                                                     const float* A,
+                                                     int lda,
+                                                     const float* B,
+                                                     int ldb,
+                                                     float* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmm_v2(cublasHandle_t handle,
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const double* alpha, /* host or device pointer */
+                                                     const double* A,
+                                                     int lda,
+                                                     const double* B,
+                                                     int ldb,
+                                                     double* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(cublasHandle_t handle,
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex* alpha, /* host or device pointer */
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const cuComplex* B,
+                                                     int ldb,
+                                                     cuComplex* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(cublasHandle_t handle,
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* host or device pointer */
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const cuDoubleComplex* B,
+                                                     int ldb,
+                                                     cuDoubleComplex* C,
+                                                     int ldc);
+/* BATCH GEMM */
+#if defined(__cplusplus)
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmBatched(cublasHandle_t handle,
+                                                         cublasOperation_t transa,
+                                                         cublasOperation_t transb,
+                                                         int m,
+                                                         int n,
+                                                         int k,
+                                                         const __half* alpha, /* host or device pointer */
+                                                         const __half* const Aarray[],
+                                                         int lda,
+                                                         const __half* const Barray[],
+                                                         int ldb,
+                                                         const __half* beta, /* host or device pointer */
+                                                         __half* const Carray[],
+                                                         int ldc,
+                                                         int batchCount);
+#endif
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmBatched(cublasHandle_t handle,
+                                                         cublasOperation_t transa,
+                                                         cublasOperation_t transb,
+                                                         int m,
+                                                         int n,
+                                                         int k,
+                                                         const float* alpha, /* host or device pointer */
+                                                         const float* const Aarray[],
+                                                         int lda,
+                                                         const float* const Barray[],
+                                                         int ldb,
+                                                         const float* beta, /* host or device pointer */
+                                                         float* const Carray[],
+                                                         int ldc,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmBatched(cublasHandle_t handle,
+                                                         cublasOperation_t transa,
+                                                         cublasOperation_t transb,
+                                                         int m,
+                                                         int n,
+                                                         int k,
+                                                         const double* alpha, /* host or device pointer */
+                                                         const double* const Aarray[],
+                                                         int lda,
+                                                         const double* const Barray[],
+                                                         int ldb,
+                                                         const double* beta, /* host or device pointer */
+                                                         double* const Carray[],
+                                                         int ldc,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmBatched(cublasHandle_t handle,
+                                                         cublasOperation_t transa,
+                                                         cublasOperation_t transb,
+                                                         int m,
+                                                         int n,
+                                                         int k,
+                                                         const cuComplex* alpha, /* host or device pointer */
+                                                         const cuComplex* const Aarray[],
+                                                         int lda,
+                                                         const cuComplex* const Barray[],
+                                                         int ldb,
+                                                         const cuComplex* beta, /* host or device pointer */
+                                                         cuComplex* const Carray[],
+                                                         int ldc,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched(cublasHandle_t handle,
+                                                           cublasOperation_t transa,
+                                                           cublasOperation_t transb,
+                                                           int m,
+                                                           int n,
+                                                           int k,
+                                                           const cuComplex* alpha, /* host or device pointer */
+                                                           const cuComplex* const Aarray[],
+                                                           int lda,
+                                                           const cuComplex* const Barray[],
+                                                           int ldb,
+                                                           const cuComplex* beta, /* host or device pointer */
+                                                           cuComplex* const Carray[],
+                                                           int ldc,
+                                                           int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemmBatched(cublasHandle_t handle,
+                                                         cublasOperation_t transa,
+                                                         cublasOperation_t transb,
+                                                         int m,
+                                                         int n,
+                                                         int k,
+                                                         const cuDoubleComplex* alpha, /* host or device pointer */
+                                                         const cuDoubleComplex* const Aarray[],
+                                                         int lda,
+                                                         const cuDoubleComplex* const Barray[],
+                                                         int ldb,
+                                                         const cuDoubleComplex* beta, /* host or device pointer */
+                                                         cuDoubleComplex* const Carray[],
+                                                         int ldc,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx(cublasHandle_t handle,
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb,
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const void* alpha, /* host or device pointer */
+                                                          const void* const Aarray[],
+                                                          cudaDataType Atype,
+                                                          int lda,
+                                                          const void* const Barray[],
+                                                          cudaDataType Btype,
+                                                          int ldb,
+                                                          const void* beta, /* host or device pointer */
+                                                          void* const Carray[],
+                                                          cudaDataType Ctype,
+                                                          int ldc,
+                                                          int batchCount,
+                                                          cublasComputeType_t computeType,
+                                                          cublasGemmAlgo_t algo);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx(cublasHandle_t handle,
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb,
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const void* alpha, /* host or device pointer */
+                                                                 const void* A,
+                                                                 cudaDataType Atype,
+                                                                 int lda,
+                                                                 long long int strideA, /* purposely signed */
+                                                                 const void* B,
+                                                                 cudaDataType Btype,
+                                                                 int ldb,
+                                                                 long long int strideB,
+                                                                 const void* beta, /* host or device pointer */
+                                                                 void* C,
+                                                                 cudaDataType Ctype,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount,
+                                                                 cublasComputeType_t computeType,
+                                                                 cublasGemmAlgo_t algo);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched(cublasHandle_t handle,
+                                                                cublasOperation_t transa,
+                                                                cublasOperation_t transb,
+                                                                int m,
+                                                                int n,
+                                                                int k,
+                                                                const float* alpha, /* host or device pointer */
+                                                                const float* A,
+                                                                int lda,
+                                                                long long int strideA, /* purposely signed */
+                                                                const float* B,
+                                                                int ldb,
+                                                                long long int strideB,
+                                                                const float* beta, /* host or device pointer */
+                                                                float* C,
+                                                                int ldc,
+                                                                long long int strideC,
+                                                                int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched(cublasHandle_t handle,
+                                                                cublasOperation_t transa,
+                                                                cublasOperation_t transb,
+                                                                int m,
+                                                                int n,
+                                                                int k,
+                                                                const double* alpha, /* host or device pointer */
+                                                                const double* A,
+                                                                int lda,
+                                                                long long int strideA, /* purposely signed */
+                                                                const double* B,
+                                                                int ldb,
+                                                                long long int strideB,
+                                                                const double* beta, /* host or device pointer */
+                                                                double* C,
+                                                                int ldc,
+                                                                long long int strideC,
+                                                                int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched(cublasHandle_t handle,
+                                                                cublasOperation_t transa,
+                                                                cublasOperation_t transb,
+                                                                int m,
+                                                                int n,
+                                                                int k,
+                                                                const cuComplex* alpha, /* host or device pointer */
+                                                                const cuComplex* A,
+                                                                int lda,
+                                                                long long int strideA, /* purposely signed */
+                                                                const cuComplex* B,
+                                                                int ldb,
+                                                                long long int strideB,
+                                                                const cuComplex* beta, /* host or device pointer */
+                                                                cuComplex* C,
+                                                                int ldc,
+                                                                long long int strideC,
+                                                                int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched(cublasHandle_t handle,
+                                                                  cublasOperation_t transa,
+                                                                  cublasOperation_t transb,
+                                                                  int m,
+                                                                  int n,
+                                                                  int k,
+                                                                  const cuComplex* alpha, /* host or device pointer */
+                                                                  const cuComplex* A,
+                                                                  int lda,
+                                                                  long long int strideA, /* purposely signed */
+                                                                  const cuComplex* B,
+                                                                  int ldb,
+                                                                  long long int strideB,
+                                                                  const cuComplex* beta, /* host or device pointer */
+                                                                  cuComplex* C,
+                                                                  int ldc,
+                                                                  long long int strideC,
+                                                                  int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasZgemmStridedBatched(cublasHandle_t handle,
+                          cublasOperation_t transa,
+                          cublasOperation_t transb,
+                          int m,
+                          int n,
+                          int k,
+                          const cuDoubleComplex* alpha, /* host or device pointer */
+                          const cuDoubleComplex* A,
+                          int lda,
+                          long long int strideA, /* purposely signed */
+                          const cuDoubleComplex* B,
+                          int ldb,
+                          long long int strideB,
+                          const cuDoubleComplex* beta, /* host or device pointer */
+                          cuDoubleComplex* C,
+                          int ldc,
+                          long long int strideC,
+                          int batchCount);
+
+#if defined(__cplusplus)
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmStridedBatched(cublasHandle_t handle,
+                                                                cublasOperation_t transa,
+                                                                cublasOperation_t transb,
+                                                                int m,
+                                                                int n,
+                                                                int k,
+                                                                const __half* alpha, /* host or device pointer */
+                                                                const __half* A,
+                                                                int lda,
+                                                                long long int strideA, /* purposely signed */
+                                                                const __half* B,
+                                                                int ldb,
+                                                                long long int strideB,
+                                                                const __half* beta, /* host or device pointer */
+                                                                __half* C,
+                                                                int ldc,
+                                                                long long int strideC,
+                                                                int batchCount);
+#endif
+/* ---------------- CUBLAS BLAS-like extension ---------------- */
+/* GEAM: general matrix-matrix addition/transposition */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeam(cublasHandle_t handle,
+                                                  cublasOperation_t transa,
+                                                  cublasOperation_t transb,
+                                                  int m,
+                                                  int n,
+                                                  const float* alpha, /* host or device pointer */
+                                                  const float* A,
+                                                  int lda,
+                                                  const float* beta, /* host or device pointer */
+                                                  const float* B,
+                                                  int ldb,
+                                                  float* C,
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeam(cublasHandle_t handle,
+                                                  cublasOperation_t transa,
+                                                  cublasOperation_t transb,
+                                                  int m,
+                                                  int n,
+                                                  const double* alpha, /* host or device pointer */
+                                                  const double* A,
+                                                  int lda,
+                                                  const double* beta, /* host or device pointer */
+                                                  const double* B,
+                                                  int ldb,
+                                                  double* C,
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeam(cublasHandle_t handle,
+                                                  cublasOperation_t transa,
+                                                  cublasOperation_t transb,
+                                                  int m,
+                                                  int n,
+                                                  const cuComplex* alpha, /* host or device pointer */
+                                                  const cuComplex* A,
+                                                  int lda,
+                                                  const cuComplex* beta, /* host or device pointer */
+                                                  const cuComplex* B,
+                                                  int ldb,
+                                                  cuComplex* C,
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeam(cublasHandle_t handle,
+                                                  cublasOperation_t transa,
+                                                  cublasOperation_t transb,
+                                                  int m,
+                                                  int n,
+                                                  const cuDoubleComplex* alpha, /* host or device pointer */
+                                                  const cuDoubleComplex* A,
+                                                  int lda,
+                                                  const cuDoubleComplex* beta, /* host or device pointer */
+                                                  const cuDoubleComplex* B,
+                                                  int ldb,
+                                                  cuDoubleComplex* C,
+                                                  int ldc);
+
+/* Batched LU factorization - GETRF */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(cublasHandle_t handle,
+                                                          int n,
+                                                          float* const A[], /*Device pointer*/
+                                                          int lda,
+                                                          int* P,    /*Device Pointer*/
+                                                          int* info, /*Device Pointer*/
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(cublasHandle_t handle,
+                                                          int n,
+                                                          double* const A[], /*Device pointer*/
+                                                          int lda,
+                                                          int* P,    /*Device Pointer*/
+                                                          int* info, /*Device Pointer*/
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(cublasHandle_t handle,
+                                                          int n,
+                                                          cuComplex* const A[], /*Device pointer*/
+                                                          int lda,
+                                                          int* P,    /*Device Pointer*/
+                                                          int* info, /*Device Pointer*/
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(cublasHandle_t handle,
+                                                          int n,
+                                                          cuDoubleComplex* const A[], /*Device pointer*/
+                                                          int lda,
+                                                          int* P,    /*Device Pointer*/
+                                                          int* info, /*Device Pointer*/
+                                                          int batchSize);
+
+/* Batched inversion based on LU factorization from getrf */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetriBatched(cublasHandle_t handle,
+                                                          int n,
+                                                          const float* const A[], /*Device pointer*/
+                                                          int lda,
+                                                          const int* P,     /*Device pointer*/
+                                                          float* const C[], /*Device pointer*/
+                                                          int ldc,
+                                                          int* info, /* NOTE(review): presumably a device pointer like P - confirm */
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetriBatched(cublasHandle_t handle,
+                                                          int n,
+                                                          const double* const A[], /*Device pointer*/
+                                                          int lda,
+                                                          const int* P,      /*Device pointer*/
+                                                          double* const C[], /*Device pointer*/
+                                                          int ldc,
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetriBatched(cublasHandle_t handle,
+                                                          int n,
+                                                          const cuComplex* const A[], /*Device pointer*/
+                                                          int lda,
+                                                          const int* P,         /*Device pointer*/
+                                                          cuComplex* const C[], /*Device pointer*/
+                                                          int ldc,
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetriBatched(cublasHandle_t handle,
+                                                          int n,
+                                                          const cuDoubleComplex* const A[], /*Device pointer*/
+                                                          int lda,
+                                                          const int* P,               /*Device pointer*/
+                                                          cuDoubleComplex* const C[], /*Device pointer*/
+                                                          int ldc,
+                                                          int* info,
+                                                          int batchSize);
+
+/* Batched solver based on LU factorization from getrf (S/D/C/Z = float/double/cuComplex/cuDoubleComplex); Aarray[] and devIpiv are const inputs, Barray[] is non-const */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrsBatched(cublasHandle_t handle,
+                                                          cublasOperation_t trans,
+                                                          int n,
+                                                          int nrhs,
+                                                          const float* const Aarray[],
+                                                          int lda,
+                                                          const int* devIpiv,
+                                                          float* const Barray[],
+                                                          int ldb,
+                                                          int* info, /* NOTE(review): host/device residency is not annotated in this header -- check the cuBLAS docs */
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrsBatched(cublasHandle_t handle,
+                                                          cublasOperation_t trans,
+                                                          int n,
+                                                          int nrhs,
+                                                          const double* const Aarray[],
+                                                          int lda,
+                                                          const int* devIpiv,
+                                                          double* const Barray[],
+                                                          int ldb,
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrsBatched(cublasHandle_t handle,
+                                                          cublasOperation_t trans,
+                                                          int n,
+                                                          int nrhs,
+                                                          const cuComplex* const Aarray[],
+                                                          int lda,
+                                                          const int* devIpiv,
+                                                          cuComplex* const Barray[],
+                                                          int ldb,
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrsBatched(cublasHandle_t handle,
+                                                          cublasOperation_t trans,
+                                                          int n,
+                                                          int nrhs,
+                                                          const cuDoubleComplex* const Aarray[],
+                                                          int lda,
+                                                          const int* devIpiv,
+                                                          cuDoubleComplex* const Barray[],
+                                                          int ldb,
+                                                          int* info,
+                                                          int batchSize);
+
+/* TRSM - Batched Triangular Solver (S/D/C/Z = float/double/cuComplex/cuDoubleComplex); A[] is a const input, B[] is non-const, alpha may be a host or device pointer */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsmBatched(cublasHandle_t handle,
+                                                         cublasSideMode_t side,
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         cublasDiagType_t diag,
+                                                         int m,
+                                                         int n,
+                                                         const float* alpha, /*Host or Device Pointer*/
+                                                         const float* const A[],
+                                                         int lda,
+                                                         float* const B[],
+                                                         int ldb,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsmBatched(cublasHandle_t handle,
+                                                         cublasSideMode_t side,
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         cublasDiagType_t diag,
+                                                         int m,
+                                                         int n,
+                                                         const double* alpha, /*Host or Device Pointer*/
+                                                         const double* const A[],
+                                                         int lda,
+                                                         double* const B[],
+                                                         int ldb,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsmBatched(cublasHandle_t handle,
+                                                         cublasSideMode_t side,
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         cublasDiagType_t diag,
+                                                         int m,
+                                                         int n,
+                                                         const cuComplex* alpha, /*Host or Device Pointer*/
+                                                         const cuComplex* const A[],
+                                                         int lda,
+                                                         cuComplex* const B[],
+                                                         int ldb,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsmBatched(cublasHandle_t handle,
+                                                         cublasSideMode_t side,
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         cublasDiagType_t diag,
+                                                         int m,
+                                                         int n,
+                                                         const cuDoubleComplex* alpha, /*Host or Device Pointer*/
+                                                         const cuDoubleComplex* const A[],
+                                                         int lda,
+                                                         cuDoubleComplex* const B[],
+                                                         int ldb,
+                                                         int batchCount);
+
+/* Batched - MATINV, i.e. matrix inversion (S/D/C/Z = float/double/cuComplex/cuDoubleComplex); A[] is the const input array, Ainv[] (paired with lda_inv) is the non-const array */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(cublasHandle_t handle,
+                                                           int n,
+                                                           const float* const A[], /*Device pointer*/
+                                                           int lda,
+                                                           float* const Ainv[], /*Device pointer*/
+                                                           int lda_inv,
+                                                           int* info, /*Device Pointer*/
+                                                           int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(cublasHandle_t handle,
+                                                           int n,
+                                                           const double* const A[], /*Device pointer*/
+                                                           int lda,
+                                                           double* const Ainv[], /*Device pointer*/
+                                                           int lda_inv,
+                                                           int* info, /*Device Pointer*/
+                                                           int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(cublasHandle_t handle,
+                                                           int n,
+                                                           const cuComplex* const A[], /*Device pointer*/
+                                                           int lda,
+                                                           cuComplex* const Ainv[], /*Device pointer*/
+                                                           int lda_inv,
+                                                           int* info, /*Device Pointer*/
+                                                           int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZmatinvBatched(cublasHandle_t handle,
+                                                           int n,
+                                                           const cuDoubleComplex* const A[], /*Device pointer*/
+                                                           int lda,
+                                                           cuDoubleComplex* const Ainv[], /*Device pointer*/
+                                                           int lda_inv,
+                                                           int* info, /*Device Pointer*/
+                                                           int batchSize);
+
+/* Batch QR Factorization (S/D/C/Z = float/double/cuComplex/cuDoubleComplex); Aarray[] and TauArray[] are device pointer arrays, info carries no host/device annotation in this header */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeqrfBatched(cublasHandle_t handle,
+                                                          int m,
+                                                          int n,
+                                                          float* const Aarray[], /*Device pointer*/
+                                                          int lda,
+                                                          float* const TauArray[], /*Device pointer*/
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeqrfBatched(cublasHandle_t handle,
+                                                          int m,
+                                                          int n,
+                                                          double* const Aarray[], /*Device pointer*/
+                                                          int lda,
+                                                          double* const TauArray[], /*Device pointer*/
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeqrfBatched(cublasHandle_t handle,
+                                                          int m,
+                                                          int n,
+                                                          cuComplex* const Aarray[], /*Device pointer*/
+                                                          int lda,
+                                                          cuComplex* const TauArray[], /*Device pointer*/
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeqrfBatched(cublasHandle_t handle,
+                                                          int m,
+                                                          int n,
+                                                          cuDoubleComplex* const Aarray[], /*Device pointer*/
+                                                          int lda,
+                                                          cuDoubleComplex* const TauArray[], /*Device pointer*/
+                                                          int* info,
+                                                          int batchSize);
+/* Batched least-squares minimization (GELS): only m >= n and the non-transposed case are supported */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgelsBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         int nrhs,
+                                                         float* const Aarray[], /*Device pointer*/
+                                                         int lda,
+                                                         float* const Carray[], /*Device pointer*/
+                                                         int ldc,
+                                                         int* info,
+                                                         int* devInfoArray, /*Device pointer*/
+                                                         int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgelsBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         int nrhs,
+                                                         double* const Aarray[], /*Device pointer*/
+                                                         int lda,
+                                                         double* const Carray[], /*Device pointer*/
+                                                         int ldc,
+                                                         int* info,
+                                                         int* devInfoArray, /*Device pointer*/
+                                                         int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgelsBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         int nrhs,
+                                                         cuComplex* const Aarray[], /*Device pointer*/
+                                                         int lda,
+                                                         cuComplex* const Carray[], /*Device pointer*/
+                                                         int ldc,
+                                                         int* info,
+                                                         int* devInfoArray, /*Device pointer (as annotated on the S/D variants)*/
+                                                         int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgelsBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         int nrhs,
+                                                         cuDoubleComplex* const Aarray[], /*Device pointer*/
+                                                         int lda,
+                                                         cuDoubleComplex* const Carray[], /*Device pointer*/
+                                                         int ldc,
+                                                         int* info,
+                                                         int* devInfoArray, /*Device pointer (as annotated on the S/D variants)*/
+                                                         int batchSize);
+/* DGMM (S/D/C/Z = float/double/cuComplex/cuDoubleComplex); A and x are const inputs, C is the non-const result. NOTE(review): per the cuBLAS docs this is a diagonal-matrix product whose side is chosen by mode -- confirm */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode,
+                                                  int m,
+                                                  int n,
+                                                  const float* A,
+                                                  int lda,
+                                                  const float* x,
+                                                  int incx,
+                                                  float* C,
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode,
+                                                  int m,
+                                                  int n,
+                                                  const double* A,
+                                                  int lda,
+                                                  const double* x,
+                                                  int incx,
+                                                  double* C,
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode,
+                                                  int m,
+                                                  int n,
+                                                  const cuComplex* A,
+                                                  int lda,
+                                                  const cuComplex* x,
+                                                  int incx,
+                                                  cuComplex* C,
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode,
+                                                  int m,
+                                                  int n,
+                                                  const cuDoubleComplex* A,
+                                                  int lda,
+                                                  const cuDoubleComplex* x,
+                                                  int incx,
+                                                  cuDoubleComplex* C,
+                                                  int ldc);
+
+/* TPTTR : Triangular Pack format to Triangular format (AP is the const packed input; A, declared next to lda, is the non-const result) */
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* AP, float* A, int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* AP, double* A, int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* AP, cuComplex* A, int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpttr(
+    cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* AP, cuDoubleComplex* A, int lda);
+/* TRTTP : Triangular format to Triangular Pack format (A, declared next to lda, is the const input; AP is the non-const packed result) */
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* A, int lda, float* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* A, int lda, double* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* A, int lda, cuComplex* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrttp(
+    cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* AP);
+
+#if defined(__cplusplus)
+}
+
+/*
+ * Map a legacy cudaDataType_t compute selector onto the matching cublasComputeType_t.
+ *
+ * The handle's math mode is queried first; if that query fails its status is returned
+ * and *computeType is left untouched. When the low four bits of the math mode equal
+ * CUBLAS_PEDANTIC_MATH the *_PEDANTIC compute type is chosen, otherwise the plain one.
+ *
+ * Supported selectors: CUDA_R_32F / CUDA_C_32F, CUDA_R_64F / CUDA_C_64F, CUDA_R_16F and
+ * CUDA_R_32I. Anything else yields CUBLAS_STATUS_NOT_SUPPORTED without writing *computeType.
+ */
+static inline cublasStatus_t cublasMigrateComputeType(cublasHandle_t handle,
+                                                      cudaDataType_t dataType,
+                                                      cublasComputeType_t* computeType) {
+  cublasMath_t currentMode = CUBLAS_DEFAULT_MATH;
+  const cublasStatus_t queryStatus = cublasGetMathMode(handle, &currentMode);
+  if (queryStatus != CUBLAS_STATUS_SUCCESS) {
+    return queryStatus;
+  }
+
+  /* Only the low four bits of the math mode take part in the pedantic comparison. */
+  const bool pedantic = ((currentMode & 0xf) == CUBLAS_PEDANTIC_MATH);
+
+  if (dataType == CUDA_R_32F || dataType == CUDA_C_32F) {
+    *computeType = pedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
+  } else if (dataType == CUDA_R_64F || dataType == CUDA_C_64F) {
+    *computeType = pedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
+  } else if (dataType == CUDA_R_16F) {
+    *computeType = pedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
+  } else if (dataType == CUDA_R_32I) {
+    *computeType = pedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
+  } else {
+    /* Unknown selector: report it and leave the caller's output as it was. */
+    return CUBLAS_STATUS_NOT_SUPPORTED;
+  }
+
+  return CUBLAS_STATUS_SUCCESS;
+}
+/* Wrappers that let existing code keep passing a cudaDataType as computeType; they are only available when this header is compiled as C++. */
+/*
+ * Legacy-signature overload of cublasGemmEx: computeType is given as a cudaDataType.
+ * It is translated with cublasMigrateComputeType and every other argument is forwarded
+ * unchanged to the overload that takes a cublasComputeType_t (declared elsewhere in this
+ * header). If the translation fails, its status is returned and no GEMM call is made.
+ */
+static inline cublasStatus_t cublasGemmEx(cublasHandle_t handle,
+                                          cublasOperation_t transa,
+                                          cublasOperation_t transb,
+                                          int m,
+                                          int n,
+                                          int k,
+                                          const void* alpha, /* host or device pointer */
+                                          const void* A,
+                                          cudaDataType Atype,
+                                          int lda,
+                                          const void* B,
+                                          cudaDataType Btype,
+                                          int ldb,
+                                          const void* beta, /* host or device pointer */
+                                          void* C,
+                                          cudaDataType Ctype,
+                                          int ldc,
+                                          cudaDataType computeType,
+                                          cublasGemmAlgo_t algo) {
+  cublasComputeType_t newComputeType = CUBLAS_COMPUTE_32F;
+  const cublasStatus_t migrateStatus = cublasMigrateComputeType(handle, computeType, &newComputeType);
+  if (migrateStatus == CUBLAS_STATUS_SUCCESS) {
+    /* The cublasComputeType_t argument selects the non-legacy overload. */
+    return cublasGemmEx(handle, transa, transb, m, n, k,
+                        alpha,
+                        A, Atype, lda,
+                        B, Btype, ldb,
+                        beta,
+                        C, Ctype, ldc,
+                        newComputeType, algo);
+  }
+  return migrateStatus;
+}
+
+/*
+ * Legacy-signature overload of cublasGemmBatchedEx: computeType is given as a cudaDataType.
+ * It is translated with cublasMigrateComputeType and every other argument is forwarded
+ * unchanged to the overload that takes a cublasComputeType_t (declared elsewhere in this
+ * header). If the translation fails, its status is returned and no GEMM call is made.
+ */
+static inline cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle,
+                                                 cublasOperation_t transa,
+                                                 cublasOperation_t transb,
+                                                 int m,
+                                                 int n,
+                                                 int k,
+                                                 const void* alpha, /* host or device pointer */
+                                                 const void* const Aarray[],
+                                                 cudaDataType Atype,
+                                                 int lda,
+                                                 const void* const Barray[],
+                                                 cudaDataType Btype,
+                                                 int ldb,
+                                                 const void* beta, /* host or device pointer */
+                                                 void* const Carray[],
+                                                 cudaDataType Ctype,
+                                                 int ldc,
+                                                 int batchCount,
+                                                 cudaDataType computeType,
+                                                 cublasGemmAlgo_t algo) {
+  /* Initialized exactly like the cublasGemmEx wrapper so no local is ever indeterminate. */
+  cublasComputeType_t migratedComputeType = CUBLAS_COMPUTE_32F;
+  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
+  status = cublasMigrateComputeType(handle, computeType, &migratedComputeType);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    return status;
+  }
+
+  /* The cublasComputeType_t argument selects the non-legacy overload. */
+  return cublasGemmBatchedEx(handle,
+                             transa,
+                             transb,
+                             m,
+                             n,
+                             k,
+                             alpha,
+                             Aarray,
+                             Atype,
+                             lda,
+                             Barray,
+                             Btype,
+                             ldb,
+                             beta,
+                             Carray,
+                             Ctype,
+                             ldc,
+                             batchCount,
+                             migratedComputeType,
+                             algo);
+}
+
+/*
+ * Legacy-signature overload of cublasGemmStridedBatchedEx: computeType is given as a
+ * cudaDataType. It is translated with cublasMigrateComputeType and every other argument is
+ * forwarded unchanged to the overload that takes a cublasComputeType_t (declared elsewhere
+ * in this header). If the translation fails, its status is returned and no GEMM call is made.
+ */
+static inline cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle,
+                                                        cublasOperation_t transa,
+                                                        cublasOperation_t transb,
+                                                        int m,
+                                                        int n,
+                                                        int k,
+                                                        const void* alpha, /* host or device pointer */
+                                                        const void* A,
+                                                        cudaDataType Atype,
+                                                        int lda,
+                                                        long long int strideA, /* purposely signed */
+                                                        const void* B,
+                                                        cudaDataType Btype,
+                                                        int ldb,
+                                                        long long int strideB,
+                                                        const void* beta, /* host or device pointer */
+                                                        void* C,
+                                                        cudaDataType Ctype,
+                                                        int ldc,
+                                                        long long int strideC,
+                                                        int batchCount,
+                                                        cudaDataType computeType,
+                                                        cublasGemmAlgo_t algo) {
+  /* Initialized exactly like the cublasGemmEx wrapper so no local is ever indeterminate. */
+  cublasComputeType_t migratedComputeType = CUBLAS_COMPUTE_32F;
+  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
+  status = cublasMigrateComputeType(handle, computeType, &migratedComputeType);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    return status;
+  }
+
+  /* The cublasComputeType_t argument selects the non-legacy overload. */
+  return cublasGemmStridedBatchedEx(handle,
+                                    transa,
+                                    transb,
+                                    m,
+                                    n,
+                                    k,
+                                    alpha,
+                                    A,
+                                    Atype,
+                                    lda,
+                                    strideA,
+                                    B,
+                                    Btype,
+                                    ldb,
+                                    strideB,
+                                    beta,
+                                    C,
+                                    Ctype,
+                                    ldc,
+                                    strideC,
+                                    batchCount,
+                                    migratedComputeType,
+                                    algo);
+}
+#endif /* __cplusplus */
+
+#endif /* !defined(CUBLAS_API_H_) */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_v2.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_v2.h
new file mode 100644
index 0000000000000000000000000000000000000000..34d6b77ccfb485d39e5bb261a3ebb1ad592ea281
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_v2.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ * This is the public header file for the new CUBLAS library API. It maps the generic
+ * CUBLAS function names to the actual _v2 implementations.
+ */
+
+#if !defined(CUBLAS_V2_H_)
+#define CUBLAS_V2_H_
+
+/*
+ * Redefine CUBLASAPI before cublas_api.h is included below: it expands to
+ * __host__ __device__ when __CUDACC__ is defined and to nothing otherwise.
+ */
+#undef CUBLASAPI
+#ifdef __CUDACC__
+#define CUBLASAPI __host__ __device__
+#else
+#define CUBLASAPI
+#endif
+
+#include "cublas_api.h"
+
+/* Helper Routines: each generic name expands to its _v2 counterpart */
+#define cublasCreate cublasCreate_v2
+#define cublasDestroy cublasDestroy_v2
+#define cublasGetVersion cublasGetVersion_v2
+#define cublasSetWorkspace cublasSetWorkspace_v2
+#define cublasSetStream cublasSetStream_v2
+#define cublasGetStream cublasGetStream_v2
+#define cublasGetPointerMode cublasGetPointerMode_v2
+#define cublasSetPointerMode cublasSetPointerMode_v2
+
+/* Blas1 Routines   */
+
+#define cublasSnrm2 cublasSnrm2_v2
+#define cublasDnrm2 cublasDnrm2_v2
+#define cublasScnrm2 cublasScnrm2_v2
+#define cublasDznrm2 cublasDznrm2_v2
+
+#define cublasSdot cublasSdot_v2
+#define cublasDdot cublasDdot_v2
+#define cublasCdotu cublasCdotu_v2
+#define cublasCdotc cublasCdotc_v2
+#define cublasZdotu cublasZdotu_v2
+#define cublasZdotc cublasZdotc_v2
+
+#define cublasSscal cublasSscal_v2
+#define cublasDscal cublasDscal_v2
+#define cublasCscal cublasCscal_v2
+#define cublasCsscal cublasCsscal_v2
+#define cublasZscal cublasZscal_v2
+#define cublasZdscal cublasZdscal_v2
+
+#define cublasSaxpy cublasSaxpy_v2
+#define cublasDaxpy cublasDaxpy_v2
+#define cublasCaxpy cublasCaxpy_v2
+#define cublasZaxpy cublasZaxpy_v2
+
+#define cublasScopy cublasScopy_v2
+#define cublasDcopy cublasDcopy_v2
+#define cublasCcopy cublasCcopy_v2
+#define cublasZcopy cublasZcopy_v2
+
+#define cublasSswap cublasSswap_v2
+#define cublasDswap cublasDswap_v2
+#define cublasCswap cublasCswap_v2
+#define cublasZswap cublasZswap_v2
+
+#define cublasIsamax cublasIsamax_v2
+#define cublasIdamax cublasIdamax_v2
+#define cublasIcamax cublasIcamax_v2
+#define cublasIzamax cublasIzamax_v2
+
+#define cublasIsamin cublasIsamin_v2
+#define cublasIdamin cublasIdamin_v2
+#define cublasIcamin cublasIcamin_v2
+#define cublasIzamin cublasIzamin_v2
+
+#define cublasSasum cublasSasum_v2
+#define cublasDasum cublasDasum_v2
+#define cublasScasum cublasScasum_v2
+#define cublasDzasum cublasDzasum_v2
+
+#define cublasSrot cublasSrot_v2
+#define cublasDrot cublasDrot_v2
+#define cublasCrot cublasCrot_v2
+#define cublasCsrot cublasCsrot_v2
+#define cublasZrot cublasZrot_v2
+#define cublasZdrot cublasZdrot_v2
+
+#define cublasSrotg cublasSrotg_v2
+#define cublasDrotg cublasDrotg_v2
+#define cublasCrotg cublasCrotg_v2
+#define cublasZrotg cublasZrotg_v2
+
+#define cublasSrotm cublasSrotm_v2
+#define cublasDrotm cublasDrotm_v2
+
+#define cublasSrotmg cublasSrotmg_v2
+#define cublasDrotmg cublasDrotmg_v2
+
+/* Blas2 Routines */
+
+#define cublasSgemv cublasSgemv_v2
+#define cublasDgemv cublasDgemv_v2
+#define cublasCgemv cublasCgemv_v2
+#define cublasZgemv cublasZgemv_v2
+
+#define cublasSgbmv cublasSgbmv_v2
+#define cublasDgbmv cublasDgbmv_v2
+#define cublasCgbmv cublasCgbmv_v2
+#define cublasZgbmv cublasZgbmv_v2
+
+#define cublasStrmv cublasStrmv_v2
+#define cublasDtrmv cublasDtrmv_v2
+#define cublasCtrmv cublasCtrmv_v2
+#define cublasZtrmv cublasZtrmv_v2
+
+#define cublasStbmv cublasStbmv_v2
+#define cublasDtbmv cublasDtbmv_v2
+#define cublasCtbmv cublasCtbmv_v2
+#define cublasZtbmv cublasZtbmv_v2
+
+#define cublasStpmv cublasStpmv_v2
+#define cublasDtpmv cublasDtpmv_v2
+#define cublasCtpmv cublasCtpmv_v2
+#define cublasZtpmv cublasZtpmv_v2
+
+#define cublasStrsv cublasStrsv_v2
+#define cublasDtrsv cublasDtrsv_v2
+#define cublasCtrsv cublasCtrsv_v2
+#define cublasZtrsv cublasZtrsv_v2
+
+#define cublasStpsv cublasStpsv_v2
+#define cublasDtpsv cublasDtpsv_v2
+#define cublasCtpsv cublasCtpsv_v2
+#define cublasZtpsv cublasZtpsv_v2
+
+#define cublasStbsv cublasStbsv_v2
+#define cublasDtbsv cublasDtbsv_v2
+#define cublasCtbsv cublasCtbsv_v2
+#define cublasZtbsv cublasZtbsv_v2
+
+#define cublasSsymv cublasSsymv_v2
+#define cublasDsymv cublasDsymv_v2
+#define cublasCsymv cublasCsymv_v2
+#define cublasZsymv cublasZsymv_v2
+#define cublasChemv cublasChemv_v2
+#define cublasZhemv cublasZhemv_v2
+
+#define cublasSsbmv cublasSsbmv_v2
+#define cublasDsbmv cublasDsbmv_v2
+#define cublasChbmv cublasChbmv_v2
+#define cublasZhbmv cublasZhbmv_v2
+
+#define cublasSspmv cublasSspmv_v2
+#define cublasDspmv cublasDspmv_v2
+#define cublasChpmv cublasChpmv_v2
+#define cublasZhpmv cublasZhpmv_v2
+
+#define cublasSger cublasSger_v2
+#define cublasDger cublasDger_v2
+#define cublasCgeru cublasCgeru_v2
+#define cublasCgerc cublasCgerc_v2
+#define cublasZgeru cublasZgeru_v2
+#define cublasZgerc cublasZgerc_v2
+
+#define cublasSsyr cublasSsyr_v2
+#define cublasDsyr cublasDsyr_v2
+#define cublasCsyr cublasCsyr_v2
+#define cublasZsyr cublasZsyr_v2
+#define cublasCher cublasCher_v2
+#define cublasZher cublasZher_v2
+
+#define cublasSspr cublasSspr_v2
+#define cublasDspr cublasDspr_v2
+#define cublasChpr cublasChpr_v2
+#define cublasZhpr cublasZhpr_v2
+
+#define cublasSsyr2 cublasSsyr2_v2
+#define cublasDsyr2 cublasDsyr2_v2
+#define cublasCsyr2 cublasCsyr2_v2
+#define cublasZsyr2 cublasZsyr2_v2
+#define cublasCher2 cublasCher2_v2
+#define cublasZher2 cublasZher2_v2
+
+#define cublasSspr2 cublasSspr2_v2
+#define cublasDspr2 cublasDspr2_v2
+#define cublasChpr2 cublasChpr2_v2
+#define cublasZhpr2 cublasZhpr2_v2
+
+/* Blas3 Routines   */
+
+#define cublasSgemm cublasSgemm_v2
+#define cublasDgemm cublasDgemm_v2
+#define cublasCgemm cublasCgemm_v2
+#define cublasZgemm cublasZgemm_v2
+
+#define cublasSsyrk cublasSsyrk_v2
+#define cublasDsyrk cublasDsyrk_v2
+#define cublasCsyrk cublasCsyrk_v2
+#define cublasZsyrk cublasZsyrk_v2
+#define cublasCherk cublasCherk_v2
+#define cublasZherk cublasZherk_v2
+
+#define cublasSsyr2k cublasSsyr2k_v2
+#define cublasDsyr2k cublasDsyr2k_v2
+#define cublasCsyr2k cublasCsyr2k_v2
+#define cublasZsyr2k cublasZsyr2k_v2
+#define cublasCher2k cublasCher2k_v2
+#define cublasZher2k cublasZher2k_v2
+
+#define cublasSsymm cublasSsymm_v2
+#define cublasDsymm cublasDsymm_v2
+#define cublasCsymm cublasCsymm_v2
+#define cublasZsymm cublasZsymm_v2
+#define cublasChemm cublasChemm_v2
+#define cublasZhemm cublasZhemm_v2
+
+#define cublasStrsm cublasStrsm_v2
+#define cublasDtrsm cublasDtrsm_v2
+#define cublasCtrsm cublasCtrsm_v2
+#define cublasZtrsm cublasZtrsm_v2
+
+#define cublasStrmm cublasStrmm_v2
+#define cublasDtrmm cublasDtrmm_v2
+#define cublasCtrmm cublasCtrmm_v2
+#define cublasZtrmm cublasZtrmm_v2
+
+#endif /* !defined(CUBLAS_V2_H_) */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed965bb5d663bea7ef20593fb4de5cff86136cee
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling.h
@@ -0,0 +1,923 @@
+/*
+ * Copyright 2020-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_PCSAMPLING_H_)
+#define _CUPTI_PCSAMPLING_H_
+
+#include <cuda.h>
+#include <stdint.h>
+#include <stddef.h>
+#include "cupti_result.h"
+
+/*
+ * CUPTIAPI sits between the return type and the name of the API
+ * declarations below. Unless it is already defined, it expands to
+ * __stdcall when _WIN32 is defined and to nothing otherwise.
+ */
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+/*
+ * Packing/alignment helpers for the record structs declared below.
+ * ACTIVITY_RECORD_ALIGNMENT is the alignment value passed to the
+ * compiler-specific alignment specifier. The *_ALIGNMENT macros expand to:
+ *   - _WIN32:    pack(push,1) / pack(pop) pragmas and __declspec(align(...))
+ *   - __GNUC__:  the __packed__ and aligned(...) attributes
+ *   - otherwise: nothing
+ */
+#define ACTIVITY_RECORD_ALIGNMENT 8
+#if defined(_WIN32) // Windows 32- and 64-bit
+#define START_PACKED_ALIGNMENT __pragma(pack(push,1)) // exact fit - no padding
+#define PACKED_ALIGNMENT __declspec(align(ACTIVITY_RECORD_ALIGNMENT))
+#define END_PACKED_ALIGNMENT __pragma(pack(pop))
+#elif defined(__GNUC__) // GCC
+#define START_PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT __attribute__ ((__packed__)) __attribute__ ((aligned (ACTIVITY_RECORD_ALIGNMENT)))
+#define END_PACKED_ALIGNMENT
+#else // all other compilers
+#define START_PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT
+#define END_PACKED_ALIGNMENT
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_PCSAMPLING_API CUPTI PC Sampling API
+ * Functions, types, and enums that implement the CUPTI PC Sampling API.
+ * @{
+ */
+
+/*
+ * Size in bytes of type_ up to and including its member lastfield_:
+ * the member's offset plus the member's own size. Used below to define
+ * the *ParamsSize constants that clients store in a params struct's
+ * size field.
+ */
+#ifndef CUPTI_PCSAMPLING_STRUCT_SIZE
+#define CUPTI_PCSAMPLING_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+
+/*
+ * Size constant for stall reason strings.
+ * NOTE(review): presumably the buffer length for one stall reason name;
+ * no user is visible in this part of the header - confirm.
+ */
+#ifndef CUPTI_STALL_REASON_STRING_SIZE
+#define CUPTI_STALL_REASON_STRING_SIZE                                            128
+#endif
+
+/**
+ * \brief PC Sampling collection mode
+ */
+typedef enum
+{
+  /**
+   * INVALID Value
+   */
+  CUPTI_PC_SAMPLING_COLLECTION_MODE_INVALID                   = 0,
+  /**
+   * Continuous mode. Kernels are not serialized in this mode.
+   */
+  CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS                = 1,
+  /**
+   * Serialized mode. Kernels are serialized in this mode.
+   */
+  CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED         = 2,
+} CUpti_PCSamplingCollectionMode;
+
+/**
+ * \brief PC Sampling stall reason record
+ *
+ * Pairs a stall reason index with the number of samples recorded for it.
+ */
+typedef struct PACKED_ALIGNMENT
+{
+  /**
+   * [r] Collected stall reason index
+   */
+  uint32_t pcSamplingStallReasonIndex;
+  /**
+   * [r] Number of times the PC was sampled with this stall reason.
+   */
+  uint32_t samples;
+} CUpti_PCSamplingStallReason;
+
+/**
+ * \brief PC Sampling data
+ *
+ * One record per sampled PC: identifies the PC (cubin, offset, function)
+ * and points to its per-stall-reason sample counts.
+ */
+typedef struct PACKED_ALIGNMENT
+{
+  /**
+   * [w] Size of the data structure.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [r] Unique cubin id
+   */
+  uint64_t cubinCrc;
+  /**
+   * [r] PC offset
+   */
+  uint64_t pcOffset;
+  /**
+   * The function's unique symbol index in the module.
+   */
+  uint32_t functionIndex;
+  /**
+   * Padding
+   */
+  uint32_t pad;
+  /**
+   * [r] The function name. This name string might be shared across all the records
+   * including records from activity APIs representing the same function, and so it should not be
+   * modified or freed until post processing of all the records is done. Once done, it is the user's
+   * responsibility to free the memory using the free() function.
+   */
+  char* functionName;
+  /**
+   * [r] Collected stall reason count
+   */
+  size_t stallReasonCount;
+  /**
+   * [r] Stall reason records: each holds a stall reason index and its total samples.
+   * NOTE(review): presumably an array of stallReasonCount entries - confirm.
+   */
+  CUpti_PCSamplingStallReason *stallReason;
+} CUpti_PCSamplingPCData;
+
+/**
+ * \brief PC Sampling output data format
+ */
+typedef enum
+{
+  /**
+   * Invalid value
+   */
+    CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_INVALID          = 0,
+  /**
+   * HW buffer data will be parsed during collection of data
+   */
+    CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED           = 1,
+} CUpti_PCSamplingOutputDataFormat;
+
+/**
+ * \brief Collected PC Sampling data
+ *
+ * Buffer header describing one batch of collected PC records; the records
+ * themselves are reached through pPcData.
+ */
+typedef struct PACKED_ALIGNMENT
+{
+  /**
+   * [w] Size of the data structure.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Number of PCs to be collected
+   */
+  size_t collectNumPcs;
+  /**
+   * [r] Number of samples collected across all PCs.
+   * It includes samples for user modules, samples for non-user kernels and dropped samples.
+   * It includes counts for all non selected stall reasons.
+   * CUPTI does not provide PC records for non-user kernels.
+   * CUPTI does not provide PC records for instructions for which all selected stall reason metrics counts are zero.
+   */
+  uint64_t totalSamples;
+  /**
+   * [r] Number of samples that were dropped by hardware due to backpressure/overflow.
+   */
+  uint64_t droppedSamples;
+  /**
+   * [r] Number of PCs collected
+   */
+  size_t totalNumPcs;
+  /**
+   * [r] Number of PCs available for collection
+   */
+  size_t remainingNumPcs;
+  /**
+   * [r] Unique identifier for each range.
+   * Data collected across multiple ranges in multiple buffers can be identified using range id.
+   */
+  uint64_t rangeId;
+  /**
+   * [r] Profiled PC data
+   * This buffer should have enough memory to hold the number of PC records given in collectNumPcs
+   */
+  CUpti_PCSamplingPCData *pPcData;
+  /**
+   * [r] Number of samples collected across all non user kernels PCs.
+   * It includes samples for non-user kernels.
+   * It includes counts for all non selected stall reasons as well.
+   * CUPTI does not provide PC records for non-user kernels.
+   */
+  uint64_t nonUsrKernelsTotalSamples;
+} CUpti_PCSamplingData;
+
+/**
+ * \brief PC Sampling configuration attributes
+ *
+ * PC Sampling configuration attribute types. These attributes can be read
+ * using \ref cuptiPCSamplingGetConfigurationAttribute and can be written
+ * using \ref cuptiPCSamplingSetConfigurationAttribute. Attributes marked
+ * [r] can only be read using \ref cuptiPCSamplingGetConfigurationAttribute
+ * [w] can only be written using \ref cuptiPCSamplingSetConfigurationAttribute
+ * [rw] can be read using \ref cuptiPCSamplingGetConfigurationAttribute and
+ * written using \ref cuptiPCSamplingSetConfigurationAttribute
+ */
+typedef enum
+{
+  /**
+   * Invalid attribute type
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_INVALID                            = 0,
+  /**
+   * [rw] Sampling period for PC Sampling.
+   * DEFAULT - CUPTI defined value based on number of SMs
+   * Valid values for the sampling period are between 5 and 31, both
+   * inclusive. This will set the sampling period to (2^samplingPeriod) cycles.
+   * For e.g. for sampling period = 5 to 31, cycles = 32, 64, 128,..., 2^31
+   * Value is a uint32_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD                    = 1,
+  /**
+   * This attribute carries two values:
+   * [w] Number of stall reasons to collect.
+   * DEFAULT - All stall reasons will be collected
+   * Value is a size_t
+   * [w] Stall reasons to collect
+   * DEFAULT - All stall reasons will be collected
+   * Input value should be a pointer to an array of stall reason indexes
+   * containing all the stall reason indexes to collect.
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON                       = 2,
+  /**
+   * [rw] Size of SW buffer for raw PC counter data downloaded from HW buffer
+   * DEFAULT - 1 MB, which can accommodate approximately 5500 PCs
+   * with all stall reasons
+   * Approximately it takes 16 Bytes (and some fixed size memory)
+   * to accommodate one PC with one stall reason
+   * For e.g. 1 PC with 1 stall reason = 32 Bytes
+   *          1 PC with 2 stall reason = 48 Bytes
+   *          1 PC with 4 stall reason = 96 Bytes
+   * NOTE(review): the first two examples grow by 16 Bytes per stall reason,
+   * which would give 80 Bytes for 4 stall reasons, not 96 - confirm.
+   * Value is a size_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE                = 3,
+  /**
+   * [rw] Size of HW buffer in bytes
+   * DEFAULT - 512 MB
+   * If the sampling period is too small, the HW buffer can overflow
+   * and drop PC data
+   * Value is a size_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE               = 4,
+  /**
+   * [rw] PC Sampling collection mode
+   * DEFAULT - CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS
+   * Input value should be of type \ref CUpti_PCSamplingCollectionMode.
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE                    = 5,
+  /**
+   * [rw] Control over PC Sampling data collection range
+   * Default - 0
+   * 1 - Allows user to start and stop PC Sampling using APIs -
+   * \ref cuptiPCSamplingStart() - Start PC Sampling
+   * \ref cuptiPCSamplingStop() - Stop PC Sampling
+   * Value is a uint32_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL          = 6,
+  /**
+   * [w] Value for output data format
+   * Default - CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED
+   * Input value should be of type \ref CUpti_PCSamplingOutputDataFormat.
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT                 = 7,
+  /**
+   * [w] Data buffer to hold collected PC Sampling data PARSED_DATA
+   * Default - none.
+   * Buffer type is void * which can point to PARSED_DATA
+   * Refer \ref CUpti_PCSamplingData for buffer format for PARSED_DATA
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER               = 8,
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_FORCE_INT                          = 0x7fffffff,
+} CUpti_PCSamplingConfigurationAttributeType;
+
+/**
+ * \brief PC sampling configuration information structure
+ *
+ * This structure provides \ref CUpti_PCSamplingConfigurationAttributeType which can be configured
+ * or queried for PC sampling configuration
+ */
+typedef struct
+{
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationAttributeType for all supported attribute types
+   */
+  CUpti_PCSamplingConfigurationAttributeType attributeType;
+  /**
+   * Configure or query status for \p attributeType
+   * CUPTI_SUCCESS for valid \p attributeType and \p attributeData
+   * CUPTI_ERROR_INVALID_OPERATION if \p attributeData is not valid
+   * CUPTI_ERROR_INVALID_PARAMETER if \p attributeType is not valid
+   */
+  CUptiResult attributeStatus;
+  union
+  {
+    /**
+     * Invalid Value
+     */
+    struct
+    {
+      uint64_t data[3];
+    } invalidData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD
+     */
+    struct
+    {
+      uint32_t samplingPeriod;
+    } samplingPeriodData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON
+     */
+    struct
+    {
+      size_t stallReasonCount;
+      uint32_t *pStallReasonIndex;
+    } stallReasonData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE
+     */
+    struct
+    {
+      size_t scratchBufferSize;
+    } scratchBufferSizeData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE
+     */
+    struct
+    {
+      size_t hardwareBufferSize;
+    } hardwareBufferSizeData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE
+     */
+    struct
+    {
+      CUpti_PCSamplingCollectionMode collectionMode;
+    } collectionModeData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL
+     */
+    struct
+    {
+      uint32_t enableStartStopControl;
+    } enableStartStopControlData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT
+     */
+    struct
+    {
+      CUpti_PCSamplingOutputDataFormat outputDataFormat;
+    } outputDataFormatData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER
+     */
+    struct
+    {
+      void *samplingDataBuffer;
+    } samplingDataBufferData;
+  } attributeData;
+} CUpti_PCSamplingConfigurationInfo;
+
+/**
+ * \brief PC sampling configuration structure
+ *
+ * This structure configures PC sampling using \ref cuptiPCSamplingSetConfigurationAttribute
+ * and queries PC sampling default configuration using \ref cuptiPCSamplingGetConfigurationAttribute
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingConfigurationInfoParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * [w] Number of attributes to configure using \ref cuptiPCSamplingSetConfigurationAttribute or query
+   * using \ref cuptiPCSamplingGetConfigurationAttribute
+   */
+  size_t numAttributes;
+  /**
+   * Attribute entries to configure or query.
+   * Refer \ref CUpti_PCSamplingConfigurationInfo
+   * NOTE(review): presumably an array of numAttributes entries - confirm.
+   */
+  CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
+} CUpti_PCSamplingConfigurationInfoParams;
+/* Size of CUpti_PCSamplingConfigurationInfoParams through its last member, pPCSamplingConfigurationInfo. */
+#define CUpti_PCSamplingConfigurationInfoParamsSize                 CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingConfigurationInfoParams,pPCSamplingConfigurationInfo)
+
+/**
+ * \brief Write PC Sampling configuration attribute.
+ *
+ * The per-attribute result is reported in the attributeStatus member of each
+ * \ref CUpti_PCSamplingConfigurationInfo entry.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingConfigurationInfoParams
+ * containing PC sampling configuration.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * some invalid attribute.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if an attribute value is not valid
+ * or any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingSetConfigurationAttribute(CUpti_PCSamplingConfigurationInfoParams *pParams);
+
+/**
+ * \brief Read PC Sampling configuration attribute.
+ *
+ * The per-attribute result is reported in the attributeStatus member of each
+ * \ref CUpti_PCSamplingConfigurationInfo entry.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingConfigurationInfoParams
+ * containing PC sampling configuration.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * some invalid attribute.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if an attribute is not valid
+ * or any \p pParams is not valid
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT indicates that
+ * the value buffer is too small to hold the attribute value
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetConfigurationAttribute(CUpti_PCSamplingConfigurationInfoParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingGetData
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingGetDataParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * Data buffer to hold collected PC Sampling data PARSED_DATA
+   * Buffer type is void * which can point to PARSED_DATA
+   * Refer \ref CUpti_PCSamplingData for buffer format for PARSED_DATA
+   */
+  void *pcSamplingData;
+} CUpti_PCSamplingGetDataParams;
+/* Size of CUpti_PCSamplingGetDataParams through its last member, pcSamplingData. */
+#define CUpti_PCSamplingGetDataParamsSize                           CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetDataParams, pcSamplingData)
+/**
+ * \brief Flush GPU PC sampling data periodically.
+ *
+ * Flushing of GPU PC Sampling data is required at the following points to maintain uniqueness of PCs:
+ * For \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS, after every module load-unload-load
+ * For \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED, after every kernel ends
+ * If configuration option \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL
+ * is enabled, then after every range end i.e. \ref cuptiPCSamplingStop()
+ *
+ * If application is profiled in \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS, with disabled
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL, and there is no module unload,
+ * user can collect data in two ways:
+ * Use \ref cuptiPCSamplingGetData() API periodically
+ * Use \ref cuptiPCSamplingDisable() on application exit and read GPU PC sampling data from sampling
+ * data buffer passed during configuration.
+ * Note: In case, \ref cuptiPCSamplingGetData() API is not called periodically, then sampling data buffer
+ * passed during configuration should be large enough to hold all PCs data.
+ *       \ref cuptiPCSamplingGetData() API never does device synchronization.
+ *       It is possible that when the API is called there is some unconsumed data from the HW buffer. In this case
+ * CUPTI provides only the data available with it at that moment.
+ *
+ * \param pParams Refer \ref CUpti_PCSamplingGetDataParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called without
+ * enabling PC sampling.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetData(CUpti_PCSamplingGetDataParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingEnable
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingEnableParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingEnableParams;
+/* Size of CUpti_PCSamplingEnableParams through its last member, ctx. */
+#define CUpti_PCSamplingEnableParamsSize                           CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingEnableParams, ctx)
+
+/**
+ * \brief Enable PC sampling.
+ *
+ * \param pParams Refer \ref CUpti_PCSamplingEnableParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingEnable(CUpti_PCSamplingEnableParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingDisable
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingDisableParams;
+/* Size of CUpti_PCSamplingDisableParams through its last member, ctx. */
+#define CUpti_PCSamplingDisableParamsSize                           CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingDisableParams, ctx)
+
+/**
+ * \brief Disable PC sampling.
+ *
+ * For applications which don't destroy the CUDA context explicitly,
+ * this API does the PC Sampling tear-down, joins threads and copies PC records into the buffer provided
+ * during the PC sampling configuration. PC records which can't be accommodated in the buffer are discarded.
+ *
+ * \param pParams Refer \ref CUpti_PCSamplingDisableParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingDisable(CUpti_PCSamplingDisableParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingStart
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingStartParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingStartParams;
+#define CUpti_PCSamplingStartParamsSize                             CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingStartParams, ctx)
+
+/**
+ * \brief Start PC sampling.
+ *
+ * User can collect PC Sampling data for a user-defined range specified by the Start/Stop APIs.
+ * This API can be used to mark the start of the range. Set configuration option
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL to use this API.
+ *
+ * \param pParams Refer \ref CUpti_PCSamplingStartParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * incorrect PC Sampling configuration.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingStart(CUpti_PCSamplingStartParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingStop
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingStopParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingStopParams;
+#define CUpti_PCSamplingStopParamsSize                              CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingStopParams, ctx)
+
+/**
+ * \brief Stop PC sampling.
+ *
+ * User can collect PC Sampling data for a user-defined range specified by the Start/Stop APIs.
+ * This API can be used to mark the end of the range. Set configuration option
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL to use this API.
+ *
+ * \param pParams Refer \ref CUpti_PCSamplingStopParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * incorrect PC Sampling configuration.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingStop(CUpti_PCSamplingStopParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingGetNumStallReasons
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingGetNumStallReasonsParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * [r] Number of stall reasons
+   */
+  size_t *numStallReasons;
+} CUpti_PCSamplingGetNumStallReasonsParams;
+#define CUpti_PCSamplingGetNumStallReasonsParamsSize                CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetNumStallReasonsParams, numStallReasons)
+
+/**
+ * \brief Get PC sampling stall reason count.
+ *
+ * \param pParams Refer \ref CUpti_PCSamplingGetNumStallReasonsParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetNumStallReasons(CUpti_PCSamplingGetNumStallReasonsParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingGetStallReasons
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingGetStallReasonsParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * [w] Number of stall reasons
+   */
+  size_t numStallReasons;
+  /**
+   * [r] Stall reason index
+   */
+  uint32_t *stallReasonIndex;
+  /**
+   * [r] Stall reason names
+   */
+  char **stallReasons;
+} CUpti_PCSamplingGetStallReasonsParams;
+#define CUpti_PCSamplingGetStallReasonsParamsSize                   CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetStallReasonsParams, stallReasons)
+
+/**
+ * \brief Get PC sampling stall reasons.
+ *
+ * \param pParams Refer \ref CUpti_PCSamplingGetStallReasonsParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetStallReasons(CUpti_PCSamplingGetStallReasonsParams *pParams);
+
+/**
+ * \brief Params for cuptiGetSassToSourceCorrelation
+ */
+typedef struct {
+  /**
+   * [w] Size of the data structure i.e. CUpti_GetSassToSourceCorrelationParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Pointer to cubin binary where function belongs.
+   */
+  const void* cubin;
+  /**
+   * [w] Function name to which PC belongs.
+   */
+  const char *functionName;
+  /**
+   * [w] Size of cubin binary.
+   */
+  size_t cubinSize;
+  /**
+   * [r] Line number in the source code.
+   */
+  uint32_t lineNumber;
+  /**
+   * [w] PC offset
+   */
+  uint64_t pcOffset;
+  /**
+   * [r] Path for the source file.
+   */
+  char *fileName;
+  /**
+   * [r] Path for the directory of source file.
+   */
+  char *dirName;
+} CUpti_GetSassToSourceCorrelationParams;
+#define CUpti_GetSassToSourceCorrelationParamsSize     CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_GetSassToSourceCorrelationParams, dirName)
+
+/**
+ * \brief SASS to Source correlation.
+ *
+ * \param pParams Refer \ref CUpti_GetSassToSourceCorrelationParams
+ *
+ * The user is expected to free the memory allocated for fileName and dirName after use.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if either of the parameters cubin or functionName
+ * is NULL or cubinSize is zero or size field is not set correctly.
+ * \retval CUPTI_ERROR_INVALID_MODULE provided cubin is invalid.
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred.
+ * This error code is also used for cases when the function is not present in the module.
+ * A better error code will be returned in a future release.
+ */
+CUptiResult CUPTIAPI cuptiGetSassToSourceCorrelation(CUpti_GetSassToSourceCorrelationParams *pParams);
+
+/**
+ * \brief Params for cuptiGetCubinCrc
+ */
+typedef struct {
+  /**
+   * [w] Size of configuration structure.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Size of cubin binary.
+   */
+  size_t cubinSize;
+  /**
+   * [w] Pointer to cubin binary
+   */
+  const void* cubin;
+  /**
+   * [r] Computed CRC will be stored in it.
+   */
+  uint64_t cubinCrc;
+} CUpti_GetCubinCrcParams;
+#define CUpti_GetCubinCrcParamsSize     CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_GetCubinCrcParams, cubinCrc)
+
+/**
+ * \brief Get the CRC of a cubin.
+ *
+ * This function returns the CRC of the provided cubin binary.
+ *
+ * \param pParams Refer \ref CUpti_GetCubinCrcParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if parameter cubin is NULL or
+ * provided cubinSize is zero or size field is not set.
+ */
+CUptiResult CUPTIAPI cuptiGetCubinCrc(CUpti_GetCubinCrcParams *pParams);
+
+/**
+ * \brief Function type for callback used by CUPTI to request the CRC of
+ * a loaded module.
+ *
+ * This callback function asks for the CRC of the module provided to it.
+ * The provided CRC will be stored in PC sampling records, i.e. in the field 'cubinCrc' of the PC sampling
+ * struct CUpti_PCSamplingPCData. The CRC is used during the offline source correlation to uniquely identify the module.
+ *
+ * \param cubin The pointer to the cubin binary.
+ * \param cubinSize The size of the cubin binary.
+ * \param cubinCrc Returns the computed CRC of the cubin.
+ */
+typedef void (CUPTIAPI *CUpti_ComputeCrcCallbackFunc)(
+    const void* cubin,
+    size_t cubinSize,
+    uint64_t *cubinCrc);
+
+/**
+ * \brief Register a callback function with CUPTI to use
+ * your own algorithm to compute the cubin CRC.
+ *
+ * This function registers a callback function, which gets called
+ * from CUPTI when a CUDA module is loaded.
+ *
+ * \param funcComputeCubinCrc callback that is invoked when a CUDA module
+ * is loaded.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p funcComputeCubinCrc is NULL.
+ */
+CUptiResult CUPTIAPI cuptiRegisterComputeCrcCallback(CUpti_ComputeCrcCallbackFunc funcComputeCubinCrc);
+
+/** @} */ /* END CUPTI_PCSAMPLING_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_PCSAMPLING_H_*/
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86372ad53bf0e76bca82e189915b4989d29e2180
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_infer_v8.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_infer_v8.h
new file mode 100644
index 0000000000000000000000000000000000000000..e24cfcbba4d93b57f15a4bd60fbe60a99b493f66
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_infer_v8.h
@@ -0,0 +1,571 @@
+/*
+ * Copyright 2017-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ *  cudnn_cnn_infer : cuDNN's basic definitions and inference CNN functions.
+ */
+
+#if !defined(CUDNN_CNN_INFER_H_)
+#define CUDNN_CNN_INFER_H_
+
+#pragma once
+#include <cuda_runtime.h>
+#include <stdint.h>
+
+#include "cudnn_version.h"
+#include "cudnn_ops_infer.h"
+
+/* These version numbers are autogenerated, do not edit manually. */
+#define CUDNN_CNN_INFER_MAJOR 8
+#define CUDNN_CNN_INFER_MINOR 7
+#define CUDNN_CNN_INFER_PATCH 0
+
+#if (CUDNN_CNN_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_INFER_MINOR != CUDNN_MINOR) || \
+    (CUDNN_CNN_INFER_PATCH != CUDNN_PATCHLEVEL)
+#error Version mismatch in cuDNN CNN INFER!!!
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct cudnnConvolutionStruct *cudnnConvolutionDescriptor_t;
+
+/*
+ *  convolution mode
+ */
+typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t;
+
+/*
+ * CUDNN Reorder
+ */
+typedef enum {
+    CUDNN_DEFAULT_REORDER = 0,
+    CUDNN_NO_REORDER      = 1,
+} cudnnReorderType_t;
+
+typedef struct cudnnConvolutionFwdAlgoPerfStruct {
+    cudnnConvolutionFwdAlgo_t algo;
+    cudnnStatus_t status;
+    float time;
+    size_t memory;
+    cudnnDeterminism_t determinism;
+    cudnnMathType_t mathType;
+    int reserved[3];
+} cudnnConvolutionFwdAlgoPerf_t;
+
+/* Create an instance of convolution descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc);
+
+/* Destroy an instance of convolution descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc,
+                                int pad_h,      /* zero-padding height */
+                                int pad_w,      /* zero-padding width */
+                                int u,          /* vertical filter stride */
+                                int v,          /* horizontal filter stride */
+                                int dilation_h, /* filter dilation in the vertical dimension */
+                                int dilation_w, /* filter dilation in the horizontal dimension */
+                                cudnnConvolutionMode_t mode,
+                                cudnnDataType_t computeType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc,
+                                int *pad_h,      /* zero-padding height */
+                                int *pad_w,      /* zero-padding width */
+                                int *u,          /* vertical filter stride */
+                                int *v,          /* horizontal filter stride */
+                                int *dilation_h, /* filter dilation in the vertical dimension */
+                                int *dilation_w, /* filter dilation in the horizontal dimension */
+                                cudnnConvolutionMode_t *mode,
+                                cudnnDataType_t *computeType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,
+                                int arrayLength, /* nbDims-2 size */
+                                const int padA[],
+                                const int filterStrideA[],
+                                const int dilationA[],
+                                cudnnConvolutionMode_t mode,
+                                cudnnDataType_t computeType); /* convolution data type */
+
+/* Query the fields of an N-d convolution descriptor (counterpart of cudnnSetConvolutionNdDescriptor) */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc,
+                                int arrayLengthRequested,
+                                int *arrayLength,
+                                int padA[],
+                                int strideA[],
+                                int dilationA[],
+                                cudnnConvolutionMode_t *mode,
+                                cudnnDataType_t *computeType); /* convolution data type */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
+                                      const cudnnTensorDescriptor_t inputTensorDesc,
+                                      const cudnnFilterDescriptor_t filterDesc,
+                                      int *n,
+                                      int *c,
+                                      int *h,
+                                      int *w);
+
+/* Helper function to return the dimensions of the output tensor given a convolution descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
+                                      const cudnnTensorDescriptor_t inputTensorDesc,
+                                      const cudnnFilterDescriptor_t filterDesc,
+                                      int nbDims,
+                                      int tensorOuputDimA[]);
+
+/* helper function to provide the convolution forward algo that best fits the requirement */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle,
+                                       const cudnnTensorDescriptor_t srcDesc,
+                                       const cudnnFilterDescriptor_t filterDesc,
+                                       const cudnnConvolutionDescriptor_t convDesc,
+                                       const cudnnTensorDescriptor_t destDesc,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnConvolutionFwdAlgoPerf_t *perfResults);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle,
+                                     const cudnnTensorDescriptor_t xDesc,
+                                     const cudnnFilterDescriptor_t wDesc,
+                                     const cudnnConvolutionDescriptor_t convDesc,
+                                     const cudnnTensorDescriptor_t yDesc,
+                                     const int requestedAlgoCount,
+                                     int *returnedAlgoCount,
+                                     cudnnConvolutionFwdAlgoPerf_t *perfResults);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle,
+                                       const cudnnTensorDescriptor_t xDesc,
+                                       const void *x,
+                                       const cudnnFilterDescriptor_t wDesc,
+                                       const void *w,
+                                       const cudnnConvolutionDescriptor_t convDesc,
+                                       const cudnnTensorDescriptor_t yDesc,
+                                       void *y,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnConvolutionFwdAlgoPerf_t *perfResults,
+                                       void *workSpace,
+                                       size_t workSpaceSizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnIm2Col(cudnnHandle_t handle,
+            const cudnnTensorDescriptor_t xDesc,
+            const void *x,
+            const cudnnFilterDescriptor_t wDesc,
+            const cudnnConvolutionDescriptor_t convDesc,
+            void *colBuffer);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnReorderFilterAndBias(cudnnHandle_t handle,
+                          const cudnnFilterDescriptor_t filterDesc,
+                          cudnnReorderType_t reorderType,
+                          const void *filterData,
+                          void *reorderedFilterData,
+                          int reorderBias,
+                          const void *biasData,
+                          void *reorderedBiasData);
+
+/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle,
+                                        const cudnnTensorDescriptor_t xDesc,
+                                        const cudnnFilterDescriptor_t wDesc,
+                                        const cudnnConvolutionDescriptor_t convDesc,
+                                        const cudnnTensorDescriptor_t yDesc,
+                                        cudnnConvolutionFwdAlgo_t algo,
+                                        size_t *sizeInBytes);
+
+/* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */
+
+/* Function to perform the forward pass for batch convolution */
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionForward(cudnnHandle_t handle,
+                        const void *alpha,
+                        const cudnnTensorDescriptor_t xDesc,
+                        const void *x,
+                        const cudnnFilterDescriptor_t wDesc,
+                        const void *w,
+                        const cudnnConvolutionDescriptor_t convDesc,
+                        cudnnConvolutionFwdAlgo_t algo,
+                        void *workSpace,
+                        size_t workSpaceSizeInBytes,
+                        const void *beta,
+                        const cudnnTensorDescriptor_t yDesc,
+                        void *y);
+
+/* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionBiasActivationForward(cudnnHandle_t handle,
+                                      const void *alpha1,
+                                      const cudnnTensorDescriptor_t xDesc,
+                                      const void *x,
+                                      const cudnnFilterDescriptor_t wDesc,
+                                      const void *w,
+                                      const cudnnConvolutionDescriptor_t convDesc,
+                                      cudnnConvolutionFwdAlgo_t algo,
+                                      void *workSpace,
+                                      size_t workSpaceSizeInBytes,
+                                      const void *alpha2,
+                                      const cudnnTensorDescriptor_t zDesc,
+                                      const void *z,
+                                      const cudnnTensorDescriptor_t biasDesc,
+                                      const void *bias,
+                                      const cudnnActivationDescriptor_t activationDesc,
+                                      const cudnnTensorDescriptor_t yDesc,
+                                      void *y);
+
+/* helper function to provide the convolution backward data algo that best fits the requirement */
+
+typedef struct cudnnConvolutionBwdDataAlgoPerfStruct {
+    cudnnConvolutionBwdDataAlgo_t algo;
+    cudnnStatus_t status;
+    float time;
+    size_t memory;
+    cudnnDeterminism_t determinism;
+    cudnnMathType_t mathType;
+    int reserved[3];
+} cudnnConvolutionBwdDataAlgoPerf_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle,
+                                          const cudnnFilterDescriptor_t wDesc,
+                                          const cudnnTensorDescriptor_t dyDesc,
+                                          const cudnnConvolutionDescriptor_t convDesc,
+                                          const cudnnTensorDescriptor_t dxDesc,
+                                          const int requestedAlgoCount,
+                                          int *returnedAlgoCount,
+                                          cudnnConvolutionBwdDataAlgoPerf_t *perfResults);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle,
+                                            const cudnnFilterDescriptor_t wDesc,
+                                            const void *w,
+                                            const cudnnTensorDescriptor_t dyDesc,
+                                            const void *dy,
+                                            const cudnnConvolutionDescriptor_t convDesc,
+                                            const cudnnTensorDescriptor_t dxDesc,
+                                            void *dx,
+                                            const int requestedAlgoCount,
+                                            int *returnedAlgoCount,
+                                            cudnnConvolutionBwdDataAlgoPerf_t *perfResults,
+                                            void *workSpace,
+                                            size_t workSpaceSizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle,
+                                            const cudnnFilterDescriptor_t filterDesc,
+                                            const cudnnTensorDescriptor_t diffDesc,
+                                            const cudnnConvolutionDescriptor_t convDesc,
+                                            const cudnnTensorDescriptor_t gradDesc,
+                                            const int requestedAlgoCount,
+                                            int *returnedAlgoCount,
+                                            cudnnConvolutionBwdDataAlgoPerf_t *perfResults);
+
+/*
+ *  convolution algorithm (which requires potentially some workspace)
+ */
+
+/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle,
+                                             const cudnnFilterDescriptor_t wDesc,
+                                             const cudnnTensorDescriptor_t dyDesc,
+                                             const cudnnConvolutionDescriptor_t convDesc,
+                                             const cudnnTensorDescriptor_t dxDesc,
+                                             cudnnConvolutionBwdDataAlgo_t algo,
+                                             size_t *sizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionBackwardData(cudnnHandle_t handle,
+                             const void *alpha,
+                             const cudnnFilterDescriptor_t wDesc,
+                             const void *w,
+                             const cudnnTensorDescriptor_t dyDesc,
+                             const void *dy,
+                             const cudnnConvolutionDescriptor_t convDesc,
+                             cudnnConvolutionBwdDataAlgo_t algo,
+                             void *workSpace,
+                             size_t workSpaceSizeInBytes,
+                             const void *beta,
+                             const cudnnTensorDescriptor_t dxDesc,
+                             void *dx);
+
+/* Helper function to calculate folding descriptors for dgrad */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle,
+                                          const cudnnFilterDescriptor_t filterDesc,
+                                          const cudnnTensorDescriptor_t diffDesc,
+                                          const cudnnConvolutionDescriptor_t convDesc,
+                                          const cudnnTensorDescriptor_t gradDesc,
+                                          const cudnnTensorFormat_t transformFormat,
+                                          cudnnFilterDescriptor_t foldedFilterDesc,
+                                          cudnnTensorDescriptor_t paddedDiffDesc,
+                                          cudnnConvolutionDescriptor_t foldedConvDesc,
+                                          cudnnTensorDescriptor_t foldedGradDesc,
+                                          cudnnTensorTransformDescriptor_t filterFoldTransDesc,
+                                          cudnnTensorTransformDescriptor_t diffPadTransDesc,
+                                          cudnnTensorTransformDescriptor_t gradFoldTransDesc,
+                                          cudnnTensorTransformDescriptor_t gradUnfoldTransDesc);
+
+/* cudnnFusedOps... */
+struct cudnnFusedOpsConstParamStruct;
+typedef struct cudnnFusedOpsConstParamStruct *cudnnFusedOpsConstParamPack_t;
+
+struct cudnnFusedOpsVariantParamStruct;
+typedef struct cudnnFusedOpsVariantParamStruct *cudnnFusedOpsVariantParamPack_t;
+
+struct cudnnFusedOpsPlanStruct;
+typedef struct cudnnFusedOpsPlanStruct *cudnnFusedOpsPlan_t;
+
+typedef enum { /* each enumerator names one fused sequence of operations, listed in the comment above it */
+    /* each op in [ ] can be disabled by passing NULL ptr */
+    /* [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] */
+    CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0,
+    /* [per channel scale], [per channel bias], [activation], convolutionBackwardWeights */
+    CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1,
+    /* utility for BN training in BN-conv fusion */
+    /* computes the equivalent scale and bias from ySum, ySqSum and the learned scale and bias */
+    /* optionally updates running stats and generates saved stats */
+    CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2,
+    /* utility for BN inference in BN-conv fusion */
+    /* computes the equivalent scale and bias from the learned running stats and the learned scale and bias */
+    CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3,
+    /* reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] */
+    CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4,
+    /* reserved for future use: [per channel scale], [per channel bias], [residual add], activation, bitmask */
+    CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5,
+    /* reserved for future use */
+    CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6,
+} cudnnFusedOps_t;
+
+typedef enum { /* the comment above each label states what to pass when setting / getting that attribute */
+    /* set XDESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get XDESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_XDESC = 0,
+    /* set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_XDATA_PLACEHOLDER = 1,
+    /* set/get BN_MODE: pass cudnnBatchNormMode_t* */
+    CUDNN_PARAM_BN_MODE = 2,
+    /* set BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3,
+    /* set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4,
+    /* set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5,
+    /* set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t */
+    /* get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t */
+    CUDNN_PARAM_ACTIVATION_DESC = 6,
+    /* set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t */
+    /* get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t */
+    CUDNN_PARAM_CONV_DESC = 7,
+    /* set WDESC: pass previously initialized cudnnFilterDescriptor_t */
+    /* get WDESC: pass previously created cudnnFilterDescriptor_t */
+    CUDNN_PARAM_WDESC = 8,
+    /* set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_WDATA_PLACEHOLDER = 9,
+    /* set DWDESC: pass previously initialized cudnnFilterDescriptor_t */
+    /* get DWDESC: pass previously created cudnnFilterDescriptor_t */
+    CUDNN_PARAM_DWDESC = 10,
+    /* set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_DWDATA_PLACEHOLDER = 11,
+    /* set YDESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get YDESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_YDESC = 12,
+    /* set/get YDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_YDATA_PLACEHOLDER = 13,
+    /* set DYDESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get DYDESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_DYDESC = 14,
+    /* set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_DYDATA_PLACEHOLDER = 15,
+    /* set YSTATS_DESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_YSTATS_DESC = 16,
+    /* set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_YSUM_PLACEHOLDER = 17,
+    /* set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18,
+    /* set BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19,
+    /* set/get BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20,
+    /* set/get BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21,
+    /* set/get BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22,
+    /* set/get BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23,
+    /* set/get BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24,
+    /* set/get BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25,
+
+    /* set ZDESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get ZDESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_ZDESC = 26,
+    /* set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_ZDATA_PLACEHOLDER = 27,
+    /* set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28,
+    /* set/get BN_Z_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29,
+    /* set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30,
+
+    /* set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31,
+    /* set/get ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32,
+
+    /* set DXDESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get DXDESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_DXDESC = 33,
+    /* set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_DXDATA_PLACEHOLDER = 34,
+    /* set DZDESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get DZDESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_DZDESC = 35,
+    /* set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_DZDATA_PLACEHOLDER = 36,
+    /* set/get BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37,
+    /* set/get BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38,
+} cudnnFusedOpsConstParamLabel_t;
+
+typedef enum { /* type passed (by pointer) for the *_PLACEHOLDER labels of cudnnFusedOpsConstParamLabel_t */
+    CUDNN_PTR_NULL         = 0,
+    CUDNN_PTR_ELEM_ALIGNED = 1,
+    CUDNN_PTR_16B_ALIGNED  = 2,
+} cudnnFusedOpsPointerPlaceHolder_t; /* NOTE(review): names suggest NULL / element-aligned / 16-byte-aligned pointers - confirm in cuDNN docs */
+
+typedef enum { /* CUDNN_PTR_* labels are numbered from 0, CUDNN_SCALAR_* labels from 100 */
+    /* CUDNN_PTR_* labels -- set: pass void* pointing to dev memory */
+    /* CUDNN_PTR_* labels -- get: pass void** pointing to host memory */
+    CUDNN_PTR_XDATA              = 0,
+    CUDNN_PTR_BN_EQSCALE         = 1,
+    CUDNN_PTR_BN_EQBIAS          = 2,
+    CUDNN_PTR_WDATA              = 3,
+    CUDNN_PTR_DWDATA             = 4,
+    CUDNN_PTR_YDATA              = 5,
+    CUDNN_PTR_DYDATA             = 6,
+    CUDNN_PTR_YSUM               = 7,
+    CUDNN_PTR_YSQSUM             = 8,
+    CUDNN_PTR_WORKSPACE          = 9,
+    CUDNN_PTR_BN_SCALE           = 10,
+    CUDNN_PTR_BN_BIAS            = 11,
+    CUDNN_PTR_BN_SAVED_MEAN      = 12,
+    CUDNN_PTR_BN_SAVED_INVSTD    = 13,
+    CUDNN_PTR_BN_RUNNING_MEAN    = 14,
+    CUDNN_PTR_BN_RUNNING_VAR     = 15,
+    CUDNN_PTR_ZDATA              = 16,
+    CUDNN_PTR_BN_Z_EQSCALE       = 17,
+    CUDNN_PTR_BN_Z_EQBIAS        = 18,
+    CUDNN_PTR_ACTIVATION_BITMASK = 19,
+    CUDNN_PTR_DXDATA             = 20,
+    CUDNN_PTR_DZDATA             = 21,
+    CUDNN_PTR_BN_DSCALE          = 22,
+    CUDNN_PTR_BN_DBIAS           = 23,
+
+    /* set/get: pass size_t* pointing to host memory */
+    CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100,
+    /* set/get: pass int64_t* pointing to host memory */
+    CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101,
+    /* set/get: pass double* pointing to host memory */
+    CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102,
+    /* set/get: pass double* pointing to host memory */
+    CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103,
+} cudnnFusedOpsVariantParamLabel_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCnnInferVersionCheck(void); /* takes no arguments; the outcome is reported through the returned cudnnStatus_t */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* CUDNN_CNN_INFER_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version.h
new file mode 100644
index 0000000000000000000000000000000000000000..a6ff223dbf1791512913b378c42f3695cf9bb86a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2017-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/**
+ * \file: The master cuDNN version file.
+ */
+
+#ifndef CUDNN_VERSION_H_
+#define CUDNN_VERSION_H_
+
+#define CUDNN_MAJOR 8
+#define CUDNN_MINOR 7
+#define CUDNN_PATCHLEVEL 0
+
+#define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL) /* 8.7.0 -> 8700 */
+
+/* cannot use constexpr here since this is a C-only file */
+/* Below is the max SM version this cuDNN library is aware of and supports natively */
+
+#define CUDNN_MAX_SM_MAJOR_NUMBER 9
+#define CUDNN_MAX_SM_MINOR_NUMBER 0
+#define CUDNN_MAX_DEVICE_VERSION ((CUDNN_MAX_SM_MAJOR_NUMBER * 100) + (CUDNN_MAX_SM_MINOR_NUMBER * 10)) /* outer parentheses keep the sum intact inside larger expressions; 9.0 -> 900 */
+
+#endif /* CUDNN_VERSION_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..311087ab956a7e5226e441587728ed45a3ee2e03
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cudalibxt.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cudalibxt.h
new file mode 100644
index 0000000000000000000000000000000000000000..94fcf4745fafa04f57678ba5ee64103f8ebd6444
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cudalibxt.h
@@ -0,0 +1,97 @@
+ /* Copyright 2013,2014 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+/*!
+* \file cudalibxt.h  
+* \brief Public header file for the NVIDIA library multi-GPU support structures  
+*/ 
+
+#ifndef _CUDA_LIB_XT_H_
+#define _CUDA_LIB_XT_H_
+#include <cuda_runtime.h>
+
+#define CUDA_XT_DESCRIPTOR_VERSION 0x01000000 // This is added to CUDART_VERSION
+
+enum cudaXtCopyType_t {                 // no explicit values, so the enumerators are 0, 1, 2 in declaration order
+    LIB_XT_COPY_HOST_TO_DEVICE,
+    LIB_XT_COPY_DEVICE_TO_HOST,
+    LIB_XT_COPY_DEVICE_TO_DEVICE
+} ;
+typedef enum cudaXtCopyType_t cudaLibXtCopyType;   // alias usable without the enum keyword
+
+enum libFormat_t {                      // type of the library field of struct cudaLibXtDesc_t
+    LIB_FORMAT_CUFFT        = 0x0,
+    LIB_FORMAT_UNDEFINED    = 0x1
+};
+
+typedef enum libFormat_t libFormat;     // alias usable without the enum keyword
+
+#define MAX_CUDA_DESCRIPTOR_GPUS 64          // capacity of the per-GPU arrays in struct cudaXtDesc_t
+
+struct cudaXtDesc_t{                         // GPUs, data and size are parallel arrays of MAX_CUDA_DESCRIPTOR_GPUS slots
+    int version;                             //descriptor version
+    int nGPUs;                               //number of GPUs
+    int GPUs[MAX_CUDA_DESCRIPTOR_GPUS];      //array of device IDs
+    void *data[MAX_CUDA_DESCRIPTOR_GPUS];    //array of pointers to data, one per GPU
+    size_t size[MAX_CUDA_DESCRIPTOR_GPUS];   //array of data sizes, one per GPU
+    void *cudaXtState;                       //opaque CUDA utility structure
+};
+typedef struct cudaXtDesc_t cudaXtDesc;
+
+struct cudaLibXtDesc_t{         //pairs a cudaXtDesc pointer with library-specific format information
+    int version;                //descriptor version
+    cudaXtDesc *descriptor;     //multi-GPU memory descriptor
+    libFormat library;          //which library recognizes the format
+    int subFormat;              //library specific enumerator of sub formats
+    void *libDescriptor;        //library specific descriptor e.g. FFT transform plan object
+};
+typedef struct cudaLibXtDesc_t cudaLibXtDesc;
+
+
+#endif
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftXt.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftXt.h
new file mode 100644
index 0000000000000000000000000000000000000000..511f5c7445d2f5f4bf9b84ebd766099b41837627
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftXt.h
@@ -0,0 +1,269 @@
+
+ /* Copyright 2005-2021 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+/*!
+* \file cufftXt.h
+* \brief Public header file for the NVIDIA CUDA FFT library (CUFFT)
+*/
+
+#ifndef _CUFFTXT_H_
+#define _CUFFTXT_H_
+#include "cudalibxt.h"
+#include "cufft.h"
+
+
+#ifndef CUFFTAPI                // defined only if not already defined before this point
+#ifdef _WIN32
+#define CUFFTAPI __stdcall      // calling-convention annotation used when _WIN32 is defined
+#else
+#define CUFFTAPI                // expands to nothing when _WIN32 is not defined
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// cufftXtSubFormat identifies the data layout of
+// a memory descriptor owned by cufft.
+// note that multi GPU cufft does not yet support out-of-place transforms
+//
+
+typedef enum cufftXtSubFormat_t {
+    CUFFT_XT_FORMAT_INPUT = 0x00,              //by default input is in linear order across GPUs
+    CUFFT_XT_FORMAT_OUTPUT = 0x01,             //by default output is in scrambled order depending on transform
+    CUFFT_XT_FORMAT_INPLACE = 0x02,            //by default inplace is input order, which is linear across GPUs
+    CUFFT_XT_FORMAT_INPLACE_SHUFFLED = 0x03,   //shuffled output order after execution of the transform
+    CUFFT_XT_FORMAT_1D_INPUT_SHUFFLED = 0x04,  //shuffled input order prior to execution of 1D transforms
+    CUFFT_XT_FORMAT_DISTRIBUTED_INPUT = 0x05,  //NOTE(review): layout not described here; presumably tied to cufftXtSetDistribution - confirm
+    CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT = 0x06, //NOTE(review): same open question as CUFFT_XT_FORMAT_DISTRIBUTED_INPUT
+    CUFFT_FORMAT_UNDEFINED = 0x07              //last value; its name lacks the _XT_ infix the other enumerators carry
+} cufftXtSubFormat;
+
+//
+// cufftXtCopyType specifies the type of copy for cufftXtMemcpy
+// (it is the type of that function's last parameter, named type)
+typedef enum cufftXtCopyType_t {
+    CUFFT_COPY_HOST_TO_DEVICE = 0x00,
+    CUFFT_COPY_DEVICE_TO_HOST = 0x01,
+    CUFFT_COPY_DEVICE_TO_DEVICE = 0x02,
+    CUFFT_COPY_UNDEFINED = 0x03
+} cufftXtCopyType;
+
+//
+// cufftXtQueryType specifies the type of query for cufftXtQueryPlan
+// (it is the type of that function's queryType parameter)
+typedef enum cufftXtQueryType_t {
+    CUFFT_QUERY_1D_FACTORS = 0x00,
+    CUFFT_QUERY_UNDEFINED = 0x01
+} cufftXtQueryType;
+
+typedef struct cufftXt1dFactors_t {     // NOTE(review): presumably the struct filled by cufftXtQueryPlan for CUFFT_QUERY_1D_FACTORS - confirm
+    long long int size;
+    long long int stringCount;
+    long long int stringLength;
+    long long int substringLength;
+    long long int factor1;
+    long long int factor2;
+    long long int stringMask;           // each *Mask field is long long int ...
+    long long int substringMask;
+    long long int factor1Mask;
+    long long int factor2Mask;
+    int stringShift;                    // ... while each *Shift field is a plain int
+    int substringShift;
+    int factor1Shift;
+    int factor2Shift;
+} cufftXt1dFactors;
+
+//
+// cufftXtWorkAreaPolicy specifies policy for cufftXtSetWorkAreaPolicy
+// (it is the type of that function's policy parameter)
+typedef enum cufftXtWorkAreaPolicy_t {
+    CUFFT_WORKAREA_MINIMAL = 0, /* maximum reduction */
+    CUFFT_WORKAREA_USER = 1, /* use workSize parameter as limit */
+    CUFFT_WORKAREA_PERFORMANCE = 2, /* default - 1x overhead or more, maximum performance */
+} cufftXtWorkAreaPolicy;
+
+// multi-GPU routines (every function below returns a cufftResult)
+cufftResult CUFFTAPI cufftXtSetGPUs(cufftHandle handle, int nGPUs, int *whichGPUs); // NOTE(review): whichGPUs presumably holds nGPUs device IDs - confirm
+
+cufftResult CUFFTAPI cufftXtMalloc(cufftHandle plan,
+                                   cudaLibXtDesc ** descriptor,    // pointer to pointer; NOTE(review): presumably receives the allocated descriptor - confirm
+                                   cufftXtSubFormat format);
+
+cufftResult CUFFTAPI cufftXtMemcpy(cufftHandle plan,
+                                   void *dstPointer,
+                                   void *srcPointer,
+                                   cufftXtCopyType type);          // one of the CUFFT_COPY_* enumerators
+
+cufftResult CUFFTAPI cufftXtFree(cudaLibXtDesc *descriptor);
+
+cufftResult CUFFTAPI cufftXtSetWorkArea(cufftHandle plan, void **workArea);
+// descriptor-based execution entry points; only the C2C and Z2Z variants take a direction argument
+cufftResult CUFFTAPI cufftXtExecDescriptorC2C(cufftHandle plan,
+                                              cudaLibXtDesc *input,
+                                              cudaLibXtDesc *output,
+                                              int direction);
+
+cufftResult CUFFTAPI cufftXtExecDescriptorR2C(cufftHandle plan,
+                                              cudaLibXtDesc *input,
+                                              cudaLibXtDesc *output);
+
+cufftResult CUFFTAPI cufftXtExecDescriptorC2R(cufftHandle plan,
+                                              cudaLibXtDesc *input,
+                                              cudaLibXtDesc *output);
+
+cufftResult CUFFTAPI cufftXtExecDescriptorZ2Z(cufftHandle plan,
+                                              cudaLibXtDesc *input,
+                                              cudaLibXtDesc *output,
+                                              int direction);
+
+cufftResult CUFFTAPI cufftXtExecDescriptorD2Z(cufftHandle plan,
+                                              cudaLibXtDesc *input,
+                                              cudaLibXtDesc *output);
+
+cufftResult CUFFTAPI cufftXtExecDescriptorZ2D(cufftHandle plan,
+                                              cudaLibXtDesc *input,
+                                              cudaLibXtDesc *output);
+
+// Utility functions
+// queryStruct is an untyped pointer; NOTE(review): its expected struct presumably depends on queryType - confirm
+cufftResult CUFFTAPI cufftXtQueryPlan(cufftHandle plan, void *queryStruct, cufftXtQueryType queryType);
+
+
+// callbacks
+
+
+typedef enum cufftXtCallbackType_t {    // NOTE(review): LD / ST presumably mean load / store, matching the cufftCallbackLoad* / cufftCallbackStore* typedefs - confirm
+    CUFFT_CB_LD_COMPLEX = 0x0,
+    CUFFT_CB_LD_COMPLEX_DOUBLE = 0x1,
+    CUFFT_CB_LD_REAL = 0x2,
+    CUFFT_CB_LD_REAL_DOUBLE = 0x3,
+    CUFFT_CB_ST_COMPLEX = 0x4,
+    CUFFT_CB_ST_COMPLEX_DOUBLE = 0x5,
+    CUFFT_CB_ST_REAL = 0x6,
+    CUFFT_CB_ST_REAL_DOUBLE = 0x7,
+    CUFFT_CB_UNDEFINED = 0x8
+                                        // values run contiguously from 0x0 to 0x8
+} cufftXtCallbackType;
+
+typedef cufftComplex (*cufftCallbackLoadC)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
+typedef cufftDoubleComplex (*cufftCallbackLoadZ)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
+typedef cufftReal (*cufftCallbackLoadR)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
+typedef cufftDoubleReal(*cufftCallbackLoadD)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
+// above: load callback pointer types, returning one element; below: store callback pointer types, taking the element and returning void
+typedef void (*cufftCallbackStoreC)(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer);
+typedef void (*cufftCallbackStoreZ)(void *dataOut, size_t offset, cufftDoubleComplex element, void *callerInfo, void *sharedPointer);
+typedef void (*cufftCallbackStoreR)(void *dataOut, size_t offset, cufftReal element, void *callerInfo, void *sharedPointer);
+typedef void (*cufftCallbackStoreD)(void *dataOut, size_t offset, cufftDoubleReal element, void *callerInfo, void *sharedPointer);
+
+
+cufftResult CUFFTAPI cufftXtSetCallback(cufftHandle plan, void **callback_routine, cufftXtCallbackType cbType, void **caller_info);
+cufftResult CUFFTAPI cufftXtClearCallback(cufftHandle plan, cufftXtCallbackType cbType); // all three functions take the plan handle and a cufftXtCallbackType
+cufftResult CUFFTAPI cufftXtSetCallbackSharedSize(cufftHandle plan, cufftXtCallbackType cbType, size_t sharedSize);
+
+cufftResult CUFFTAPI cufftXtMakePlanMany(cufftHandle plan,
+                                         int rank,
+                                         long long int *n,               // n, inembed and onembed are pointers; strides, dists and batch are by value
+                                         long long int *inembed,
+                                         long long int istride,
+                                         long long int idist,
+                                         cudaDataType inputtype,
+                                         long long int *onembed,
+                                         long long int ostride,
+                                         long long int odist,
+                                         cudaDataType outputtype,
+                                         long long int batch,
+                                         size_t *workSize,
+                                       	 cudaDataType executiontype);    // parameter list is identical to cufftXtGetSizeMany
+
+cufftResult CUFFTAPI cufftXtGetSizeMany(cufftHandle plan,
+                                        int rank,
+                                        long long int *n,                // n, inembed and onembed are pointers; strides, dists and batch are by value
+                                        long long int *inembed,
+                                        long long int istride,
+                                        long long int idist,
+                                        cudaDataType inputtype,
+                                        long long int *onembed,
+                                        long long int ostride,
+                                        long long int odist,
+                                        cudaDataType outputtype,
+                                        long long int batch,
+                                        size_t *workSize,
+                                        cudaDataType executiontype);     // parameter list is identical to cufftXtMakePlanMany
+
+
+cufftResult CUFFTAPI cufftXtExec(cufftHandle plan,
+                                 void *input,                      // untyped pointers here ...
+                                 void *output,
+                                 int direction);
+
+cufftResult CUFFTAPI cufftXtExecDescriptor(cufftHandle plan,
+                                           cudaLibXtDesc *input,   // ... cudaLibXtDesc descriptors here
+                                           cudaLibXtDesc *output,
+                                           int direction);
+
+cufftResult CUFFTAPI cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolicy policy, size_t *workSize); // CUFFT_WORKAREA_USER is documented as using workSize as the limit
+
+typedef struct cufftBox3d_t {       // three size_t triples; NOTE(review): presumably one entry per dimension of a 3D box - confirm
+    size_t lower[3];
+    size_t upper[3];
+    size_t strides[3];
+} cufftBox3d;
+
+cufftResult CUFFTAPI cufftXtSetDistribution(cufftHandle plan,
+                                            const cufftBox3d *box_in,    // both boxes are passed as pointers to const
+                                            const cufftBox3d *box_out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_nvtx_cu11-11.8.86.dist-info/METADATA b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_nvtx_cu11-11.8.86.dist-info/METADATA
new file mode 100644
index 0000000000000000000000000000000000000000..b33560afbd4186ef1f1cab5be9f46b014e6bba0a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_nvtx_cu11-11.8.86.dist-info/METADATA
@@ -0,0 +1,35 @@
+Metadata-Version: 2.1
+Name: nvidia-nvtx-cu11
+Version: 11.8.86
+Summary: NVIDIA Tools Extension
+Home-page: https://developer.nvidia.com/cuda-zone
+Author: Nvidia CUDA Installer Team
+Author-email: cuda_installer@nvidia.com
+License: NVIDIA Proprietary Software
+Keywords: cuda,nvidia,runtime,machine learning,deep learning
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: Other/Proprietary License
+Classifier: Natural Language :: English
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.5
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Topic :: Scientific/Engineering
+Classifier: Topic :: Scientific/Engineering :: Mathematics
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Operating System :: Microsoft :: Windows
+Classifier: Operating System :: POSIX :: Linux
+Requires-Python: >=3
+License-File: License.txt
+
+A C-based API for annotating events, code ranges, and resources in your applications. Applications which integrate NVTX can use the Visual Profiler to capture and visualize these events and ranges.
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_nvtx_cu11-11.8.86.dist-info/WHEEL b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_nvtx_cu11-11.8.86.dist-info/WHEEL
new file mode 100644
index 0000000000000000000000000000000000000000..06e355fe0e3ed7077903f119ae6928a17da8eb6f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_nvtx_cu11-11.8.86.dist-info/WHEEL
@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: bdist_wheel (0.37.1)
+Root-Is-Purelib: true
+Tag: py3-none-manylinux1_x86_64
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/__pycache__/_elffile.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/__pycache__/_elffile.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf3ac7b3d5071040de6d0c23ea365a6b5f46d613
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/__pycache__/_elffile.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/_parser.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1238c06eab95f8c90c393383a703aa3b8c366a5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/_parser.py
@@ -0,0 +1,354 @@
+"""Handwritten parser of dependency specifiers.
+
+The docstring for each _parse_* function contains EBNF-inspired grammar representing
+the implementation.
+"""
+
+from __future__ import annotations
+
+import ast
+from typing import NamedTuple, Sequence, Tuple, Union
+
+from ._tokenizer import DEFAULT_RULES, Tokenizer
+
+
+class Node:
+    def __init__(self, value: str) -> None:
+        self.value = value
+
+    def __str__(self) -> str:
+        return self.value
+
+    def __repr__(self) -> str:
+        return f"<{self.__class__.__name__}('{self}')>"
+
+    def serialize(self) -> str:
+        raise NotImplementedError
+
+
+class Variable(Node):
+    def serialize(self) -> str:
+        return str(self)
+
+
+class Value(Node):
+    def serialize(self) -> str:
+        return f'"{self}"'
+
+
+class Op(Node):
+    def serialize(self) -> str:
+        return str(self)
+
+
+MarkerVar = Union[Variable, Value]
+MarkerItem = Tuple[MarkerVar, Op, MarkerVar]
+MarkerAtom = Union[MarkerItem, Sequence["MarkerAtom"]]
+MarkerList = Sequence[Union["MarkerList", MarkerAtom, str]]
+
+
+class ParsedRequirement(NamedTuple):
+    name: str
+    url: str
+    extras: list[str]
+    specifier: str
+    marker: MarkerList | None
+
+
+# --------------------------------------------------------------------------------------
+# Recursive descent parser for dependency specifier
+# --------------------------------------------------------------------------------------
+def parse_requirement(source: str) -> ParsedRequirement:
+    return _parse_requirement(Tokenizer(source, rules=DEFAULT_RULES))
+
+
+def _parse_requirement(tokenizer: Tokenizer) -> ParsedRequirement:
+    """
+    requirement = WS? IDENTIFIER WS? extras WS? requirement_details
+    """
+    tokenizer.consume("WS")
+
+    name_token = tokenizer.expect(
+        "IDENTIFIER", expected="package name at the start of dependency specifier"
+    )
+    name = name_token.text
+    tokenizer.consume("WS")
+
+    extras = _parse_extras(tokenizer)
+    tokenizer.consume("WS")
+
+    url, specifier, marker = _parse_requirement_details(tokenizer)
+    tokenizer.expect("END", expected="end of dependency specifier")
+
+    return ParsedRequirement(name, url, extras, specifier, marker)
+
+
+def _parse_requirement_details(
+    tokenizer: Tokenizer,
+) -> tuple[str, str, MarkerList | None]:
+    """
+    requirement_details = AT URL (WS requirement_marker?)?
+                        | specifier WS? (requirement_marker)?
+    """
+
+    specifier = ""
+    url = ""
+    marker = None
+
+    if tokenizer.check("AT"):
+        tokenizer.read()
+        tokenizer.consume("WS")
+
+        url_start = tokenizer.position
+        url = tokenizer.expect("URL", expected="URL after @").text
+        if tokenizer.check("END", peek=True):
+            return (url, specifier, marker)
+
+        tokenizer.expect("WS", expected="whitespace after URL")
+
+        # The input might end after whitespace.
+        if tokenizer.check("END", peek=True):
+            return (url, specifier, marker)
+
+        marker = _parse_requirement_marker(
+            tokenizer, span_start=url_start, after="URL and whitespace"
+        )
+    else:
+        specifier_start = tokenizer.position
+        specifier = _parse_specifier(tokenizer)
+        tokenizer.consume("WS")
+
+        if tokenizer.check("END", peek=True):
+            return (url, specifier, marker)
+
+        marker = _parse_requirement_marker(
+            tokenizer,
+            span_start=specifier_start,
+            after=(
+                "version specifier"
+                if specifier
+                else "name and no valid version specifier"
+            ),
+        )
+
+    return (url, specifier, marker)
+
+
+def _parse_requirement_marker(
+    tokenizer: Tokenizer, *, span_start: int, after: str
+) -> MarkerList:
+    """
+    requirement_marker = SEMICOLON marker WS?
+    """
+
+    if not tokenizer.check("SEMICOLON"):
+        tokenizer.raise_syntax_error(
+            f"Expected end or semicolon (after {after})",
+            span_start=span_start,
+        )
+    tokenizer.read()
+
+    marker = _parse_marker(tokenizer)
+    tokenizer.consume("WS")
+
+    return marker
+
+
+def _parse_extras(tokenizer: Tokenizer) -> list[str]:
+    """
+    extras = (LEFT_BRACKET wsp* extras_list? wsp* RIGHT_BRACKET)?
+    """
+    if not tokenizer.check("LEFT_BRACKET", peek=True):
+        return []
+
+    with tokenizer.enclosing_tokens(
+        "LEFT_BRACKET",
+        "RIGHT_BRACKET",
+        around="extras",
+    ):
+        tokenizer.consume("WS")
+        extras = _parse_extras_list(tokenizer)
+        tokenizer.consume("WS")
+
+    return extras
+
+
+def _parse_extras_list(tokenizer: Tokenizer) -> list[str]:
+    """
+    extras_list = identifier (wsp* ',' wsp* identifier)*
+    """
+    extras: list[str] = []
+
+    if not tokenizer.check("IDENTIFIER"):
+        return extras
+
+    extras.append(tokenizer.read().text)
+
+    while True:
+        tokenizer.consume("WS")
+        if tokenizer.check("IDENTIFIER", peek=True):
+            tokenizer.raise_syntax_error("Expected comma between extra names")
+        elif not tokenizer.check("COMMA"):
+            break
+
+        tokenizer.read()
+        tokenizer.consume("WS")
+
+        extra_token = tokenizer.expect("IDENTIFIER", expected="extra name after comma")
+        extras.append(extra_token.text)
+
+    return extras
+
+
+def _parse_specifier(tokenizer: Tokenizer) -> str:
+    """
+    specifier = LEFT_PARENTHESIS WS? version_many WS? RIGHT_PARENTHESIS
+              | WS? version_many WS?
+    """
+    with tokenizer.enclosing_tokens(
+        "LEFT_PARENTHESIS",
+        "RIGHT_PARENTHESIS",
+        around="version specifier",
+    ):
+        tokenizer.consume("WS")
+        parsed_specifiers = _parse_version_many(tokenizer)
+        tokenizer.consume("WS")
+
+    return parsed_specifiers
+
+
+def _parse_version_many(tokenizer: Tokenizer) -> str:
+    """
+    version_many = (SPECIFIER (WS? COMMA WS? SPECIFIER)*)?
+    """
+    parsed_specifiers = ""
+    while tokenizer.check("SPECIFIER"):
+        span_start = tokenizer.position
+        parsed_specifiers += tokenizer.read().text
+        if tokenizer.check("VERSION_PREFIX_TRAIL", peek=True):
+            tokenizer.raise_syntax_error(
+                ".* suffix can only be used with `==` or `!=` operators",
+                span_start=span_start,
+                span_end=tokenizer.position + 1,
+            )
+        if tokenizer.check("VERSION_LOCAL_LABEL_TRAIL", peek=True):
+            tokenizer.raise_syntax_error(
+                "Local version label can only be used with `==` or `!=` operators",
+                span_start=span_start,
+                span_end=tokenizer.position,
+            )
+        tokenizer.consume("WS")
+        if not tokenizer.check("COMMA"):
+            break
+        parsed_specifiers += tokenizer.read().text
+        tokenizer.consume("WS")
+
+    return parsed_specifiers
+
+
+# --------------------------------------------------------------------------------------
+# Recursive descent parser for marker expression
+# --------------------------------------------------------------------------------------
+def parse_marker(source: str) -> MarkerList:
+    return _parse_full_marker(Tokenizer(source, rules=DEFAULT_RULES))
+
+
+def _parse_full_marker(tokenizer: Tokenizer) -> MarkerList:
+    retval = _parse_marker(tokenizer)
+    tokenizer.expect("END", expected="end of marker expression")
+    return retval
+
+
+def _parse_marker(tokenizer: Tokenizer) -> MarkerList:
+    """
+    marker = marker_atom (BOOLOP marker_atom)*
+    """
+    expression = [_parse_marker_atom(tokenizer)]
+    while tokenizer.check("BOOLOP"):
+        token = tokenizer.read()
+        expr_right = _parse_marker_atom(tokenizer)
+        expression.extend((token.text, expr_right))
+    return expression
+
+
+def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom:
+    """
+    marker_atom = WS? LEFT_PARENTHESIS WS? marker WS? RIGHT_PARENTHESIS WS?
+                | WS? marker_item WS?
+    """
+
+    tokenizer.consume("WS")
+    if tokenizer.check("LEFT_PARENTHESIS", peek=True):
+        with tokenizer.enclosing_tokens(
+            "LEFT_PARENTHESIS",
+            "RIGHT_PARENTHESIS",
+            around="marker expression",
+        ):
+            tokenizer.consume("WS")
+            marker: MarkerAtom = _parse_marker(tokenizer)
+            tokenizer.consume("WS")
+    else:
+        marker = _parse_marker_item(tokenizer)
+    tokenizer.consume("WS")
+    return marker
+
+
+def _parse_marker_item(tokenizer: Tokenizer) -> MarkerItem:
+    """
+    marker_item = WS? marker_var WS? marker_op WS? marker_var WS?
+    """
+    tokenizer.consume("WS")
+    marker_var_left = _parse_marker_var(tokenizer)
+    tokenizer.consume("WS")
+    marker_op = _parse_marker_op(tokenizer)
+    tokenizer.consume("WS")
+    marker_var_right = _parse_marker_var(tokenizer)
+    tokenizer.consume("WS")
+    return (marker_var_left, marker_op, marker_var_right)
+
+
+def _parse_marker_var(tokenizer: Tokenizer) -> MarkerVar:
+    """
+    marker_var = VARIABLE | QUOTED_STRING
+    """
+    if tokenizer.check("VARIABLE"):
+        return process_env_var(tokenizer.read().text.replace(".", "_"))
+    elif tokenizer.check("QUOTED_STRING"):
+        return process_python_str(tokenizer.read().text)
+    else:
+        tokenizer.raise_syntax_error(
+            message="Expected a marker variable or quoted string"
+        )
+
+
+def process_env_var(env_var: str) -> Variable:
+    if env_var in ("platform_python_implementation", "python_implementation"):
+        return Variable("platform_python_implementation")
+    else:
+        return Variable(env_var)
+
+
+def process_python_str(python_str: str) -> Value:
+    value = ast.literal_eval(python_str)
+    return Value(str(value))
+
+
+def _parse_marker_op(tokenizer: Tokenizer) -> Op:
+    """
+    marker_op = IN | NOT WS IN | OP
+    """
+    if tokenizer.check("IN"):
+        tokenizer.read()
+        return Op("in")
+    elif tokenizer.check("NOT"):
+        tokenizer.read()
+        tokenizer.expect("WS", expected="whitespace after 'not'")
+        tokenizer.expect("IN", expected="'in' after 'not'")
+        return Op("not in")
+    elif tokenizer.check("OP"):
+        return Op(tokenizer.read().text)
+    else:
+        return tokenizer.raise_syntax_error(
+            "Expected marker operator, one of "
+            "<=, <, !=, ==, >=, >, ~=, ===, in, not in"
+        )
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/markers.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/markers.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb7f49cf8cd43ffae71e3e8d15174d7536f9da02
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/markers.py
@@ -0,0 +1,331 @@
+# This file is dual licensed under the terms of the Apache License, Version
+# 2.0, and the BSD License. See the LICENSE file in the root of this repository
+# for complete details.
+
+from __future__ import annotations
+
+import operator
+import os
+import platform
+import sys
+from typing import Any, Callable, TypedDict, cast
+
+from ._parser import MarkerAtom, MarkerList, Op, Value, Variable
+from ._parser import parse_marker as _parse_marker
+from ._tokenizer import ParserSyntaxError
+from .specifiers import InvalidSpecifier, Specifier
+from .utils import canonicalize_name
+
+__all__ = [
+    "InvalidMarker",
+    "Marker",
+    "UndefinedComparison",
+    "UndefinedEnvironmentName",
+    "default_environment",
+]
+
+Operator = Callable[[str, str], bool]
+
+
+class InvalidMarker(ValueError):
+    """
+    An invalid marker was found, users should refer to PEP 508.
+    """
+
+
+class UndefinedComparison(ValueError):
+    """
+    An invalid operation was attempted on a value that doesn't support it.
+    """
+
+
+class UndefinedEnvironmentName(ValueError):
+    """
+    A name was attempted to be used that does not exist inside of the
+    environment.
+    """
+
+
+class Environment(TypedDict):
+    implementation_name: str
+    """The implementation's identifier, e.g. ``'cpython'``."""
+
+    implementation_version: str
+    """
+    The implementation's version, e.g. ``'3.13.0a2'`` for CPython 3.13.0a2, or
+    ``'7.3.13'`` for PyPy3.10 v7.3.13.
+    """
+
+    os_name: str
+    """
+    The value of :py:data:`os.name`. The name of the operating system dependent module
+    imported, e.g. ``'posix'``.
+    """
+
+    platform_machine: str
+    """
+    Returns the machine type, e.g. ``'i386'``.
+
+    An empty string if the value cannot be determined.
+    """
+
+    platform_release: str
+    """
+    The system's release, e.g. ``'2.2.0'`` or ``'NT'``.
+
+    An empty string if the value cannot be determined.
+    """
+
+    platform_system: str
+    """
+    The system/OS name, e.g. ``'Linux'``, ``'Windows'`` or ``'Java'``.
+
+    An empty string if the value cannot be determined.
+    """
+
+    platform_version: str
+    """
+    The system's release version, e.g. ``'#3 on degas'``.
+
+    An empty string if the value cannot be determined.
+    """
+
+    python_full_version: str
+    """
+    The Python version as string ``'major.minor.patchlevel'``.
+
+    Note that unlike the Python :py:data:`sys.version`, this value will always include
+    the patchlevel (it defaults to 0).
+    """
+
+    platform_python_implementation: str
+    """
+    A string identifying the Python implementation, e.g. ``'CPython'``.
+    """
+
+    python_version: str
+    """The Python version as string ``'major.minor'``."""
+
+    sys_platform: str
+    """
+    This string contains a platform identifier that can be used to append
+    platform-specific components to :py:data:`sys.path`, for instance.
+
+    For Unix systems, except on Linux and AIX, this is the lowercased OS name as
+    returned by ``uname -s`` with the first part of the version as returned by
+    ``uname -r`` appended, e.g. ``'sunos5'`` or ``'freebsd8'``, at the time when Python
+    was built.
+    """
+
+
+def _normalize_extra_values(results: Any) -> Any:
+    """
+    Normalize extra values.
+    """
+    if isinstance(results[0], tuple):
+        lhs, op, rhs = results[0]
+        if isinstance(lhs, Variable) and lhs.value == "extra":
+            normalized_extra = canonicalize_name(rhs.value)
+            rhs = Value(normalized_extra)
+        elif isinstance(rhs, Variable) and rhs.value == "extra":
+            normalized_extra = canonicalize_name(lhs.value)
+            lhs = Value(normalized_extra)
+        results[0] = lhs, op, rhs
+    return results
+
+
+def _format_marker(
+    marker: list[str] | MarkerAtom | str, first: bool | None = True
+) -> str:
+    assert isinstance(marker, (list, tuple, str))
+
+    # Sometimes we have a structure like [[...]] which is a single item list
+    # where the single item is itself its own list. In that case we want to skip
+    # the rest of this function so that we don't get extraneous () on the
+    # outside.
+    if (
+        isinstance(marker, list)
+        and len(marker) == 1
+        and isinstance(marker[0], (list, tuple))
+    ):
+        return _format_marker(marker[0])
+
+    if isinstance(marker, list):
+        inner = (_format_marker(m, first=False) for m in marker)
+        if first:
+            return " ".join(inner)
+        else:
+            return "(" + " ".join(inner) + ")"
+    elif isinstance(marker, tuple):
+        return " ".join([m.serialize() for m in marker])
+    else:
+        return marker
+
+
+_operators: dict[str, Operator] = {
+    "in": lambda lhs, rhs: lhs in rhs,
+    "not in": lambda lhs, rhs: lhs not in rhs,
+    "<": operator.lt,
+    "<=": operator.le,
+    "==": operator.eq,
+    "!=": operator.ne,
+    ">=": operator.ge,
+    ">": operator.gt,
+}
+
+
+def _eval_op(lhs: str, op: Op, rhs: str) -> bool:
+    try:
+        spec = Specifier("".join([op.serialize(), rhs]))
+    except InvalidSpecifier:
+        pass
+    else:
+        return spec.contains(lhs, prereleases=True)
+
+    oper: Operator | None = _operators.get(op.serialize())
+    if oper is None:
+        raise UndefinedComparison(f"Undefined {op!r} on {lhs!r} and {rhs!r}.")
+
+    return oper(lhs, rhs)
+
+
+def _normalize(*values: str, key: str) -> tuple[str, ...]:
+    # PEP 685 – Comparison of extra names for optional distribution dependencies
+    # https://peps.python.org/pep-0685/
+    # > When comparing extra names, tools MUST normalize the names being
+    # > compared using the semantics outlined in PEP 503 for names
+    if key == "extra":
+        return tuple(canonicalize_name(v) for v in values)
+
+    # other environment markers don't have such standards
+    return values
+
+
+def _evaluate_markers(markers: MarkerList, environment: dict[str, str]) -> bool:
+    groups: list[list[bool]] = [[]]
+
+    for marker in markers:
+        assert isinstance(marker, (list, tuple, str))
+
+        if isinstance(marker, list):
+            groups[-1].append(_evaluate_markers(marker, environment))
+        elif isinstance(marker, tuple):
+            lhs, op, rhs = marker
+
+            if isinstance(lhs, Variable):
+                environment_key = lhs.value
+                lhs_value = environment[environment_key]
+                rhs_value = rhs.value
+            else:
+                lhs_value = lhs.value
+                environment_key = rhs.value
+                rhs_value = environment[environment_key]
+
+            lhs_value, rhs_value = _normalize(lhs_value, rhs_value, key=environment_key)
+            groups[-1].append(_eval_op(lhs_value, op, rhs_value))
+        else:
+            assert marker in ["and", "or"]
+            if marker == "or":
+                groups.append([])
+
+    return any(all(item) for item in groups)
+
+
+def format_full_version(info: sys._version_info) -> str:
+    version = f"{info.major}.{info.minor}.{info.micro}"
+    kind = info.releaselevel
+    if kind != "final":
+        version += kind[0] + str(info.serial)
+    return version
+
+
+def default_environment() -> Environment:
+    iver = format_full_version(sys.implementation.version)
+    implementation_name = sys.implementation.name
+    return {
+        "implementation_name": implementation_name,
+        "implementation_version": iver,
+        "os_name": os.name,
+        "platform_machine": platform.machine(),
+        "platform_release": platform.release(),
+        "platform_system": platform.system(),
+        "platform_version": platform.version(),
+        "python_full_version": platform.python_version(),
+        "platform_python_implementation": platform.python_implementation(),
+        "python_version": ".".join(platform.python_version_tuple()[:2]),
+        "sys_platform": sys.platform,
+    }
+
+
+class Marker:
+    def __init__(self, marker: str) -> None:
+        # Note: We create a Marker object without calling this constructor in
+        #       packaging.requirements.Requirement. If any additional logic is
+        #       added here, make sure to mirror/adapt Requirement.
+        try:
+            self._markers = _normalize_extra_values(_parse_marker(marker))
+            # The attribute `_markers` can be described in terms of a recursive type:
+            # MarkerList = List[Union[Tuple[Node, ...], str, MarkerList]]
+            #
+            # For example, the following expression:
+            # python_version > "3.6" or (python_version == "3.6" and os_name == "unix")
+            #
+            # is parsed into:
+            # [
+            #     (<Variable('python_version')>, <Op('>')>, <Value('3.6')>),
+            #     'or',
+            #     [
+            #         (<Variable('python_version')>, <Op('==')>, <Value('3.6')>),
+            #         'and',
+            #         (<Variable('os_name')>, <Op('==')>, <Value('unix')>)
+            #     ]
+            # ]
+        except ParserSyntaxError as e:
+            raise InvalidMarker(str(e)) from e
+
+    def __str__(self) -> str:
+        return _format_marker(self._markers)
+
+    def __repr__(self) -> str:
+        return f"<Marker('{self}')>"
+
+    def __hash__(self) -> int:
+        return hash((self.__class__.__name__, str(self)))
+
+    def __eq__(self, other: Any) -> bool:
+        if not isinstance(other, Marker):
+            return NotImplemented
+
+        return str(self) == str(other)
+
+    def evaluate(self, environment: dict[str, str] | None = None) -> bool:
+        """Evaluate a marker.
+
+        Return the boolean from evaluating the given marker against the
+        environment. environment is an optional argument to override all or
+        part of the determined environment.
+
+        The environment is determined from the current Python process.
+        """
+        current_environment = cast("dict[str, str]", default_environment())
+        current_environment["extra"] = ""
+        if environment is not None:
+            current_environment.update(environment)
+            # The API used to allow setting extra to None. We need to handle this
+            # case for backwards compatibility.
+            if current_environment["extra"] is None:
+                current_environment["extra"] = ""
+
+        return _evaluate_markers(
+            self._markers, _repair_python_full_version(current_environment)
+        )
+
+
+def _repair_python_full_version(env: dict[str, str]) -> dict[str, str]:
+    """
+    Work around platform.python_version() returning something that is not PEP 440
+    compliant for non-tagged Python builds.
+    """
+    if env["python_full_version"].endswith("+"):
+        env["python_full_version"] += "local"
+    return env
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/metadata.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..721f411cfc44f6d24c13112e4246b5ad776a5e0b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/metadata.py
@@ -0,0 +1,863 @@
+from __future__ import annotations
+
+import email.feedparser
+import email.header
+import email.message
+import email.parser
+import email.policy
+import pathlib
+import sys
+import typing
+from typing import (
+    Any,
+    Callable,
+    Generic,
+    Literal,
+    TypedDict,
+    cast,
+)
+
+from . import licenses, requirements, specifiers, utils
+from . import version as version_module
+from .licenses import NormalizedLicenseExpression
+
+T = typing.TypeVar("T")
+
+
+if sys.version_info >= (3, 11):  # pragma: no cover
+    ExceptionGroup = ExceptionGroup
+else:  # pragma: no cover
+
+    class ExceptionGroup(Exception):
+        """A minimal implementation of :external:exc:`ExceptionGroup` from Python 3.11.
+
+        If :external:exc:`ExceptionGroup` is already defined by Python itself,
+        that version is used instead.
+        """
+
+        message: str
+        exceptions: list[Exception]
+
+        def __init__(self, message: str, exceptions: list[Exception]) -> None:
+            self.message = message
+            self.exceptions = exceptions
+
+        def __repr__(self) -> str:
+            return f"{self.__class__.__name__}({self.message!r}, {self.exceptions!r})"
+
+
+class InvalidMetadata(ValueError):
+    """A metadata field contains invalid data."""
+
+    field: str
+    """The name of the field that contains invalid data."""
+
+    def __init__(self, field: str, message: str) -> None:
+        self.field = field
+        super().__init__(message)
+
+
+# The RawMetadata class attempts to make as few assumptions about the underlying
+# serialization formats as possible. The idea is that as long as a serialization
+# formats offer some very basic primitives in *some* way then we can support
+# serializing to and from that format.
+class RawMetadata(TypedDict, total=False):
+    """A dictionary of raw core metadata.
+
+    Each field in core metadata maps to a key of this dictionary (when data is
+    provided). The key is lower-case and underscores are used instead of dashes
+    compared to the equivalent core metadata field. Any core metadata field that
+    can be specified multiple times or can hold multiple values in a single
+    field have a key with a plural name. See :class:`Metadata` whose attributes
+    match the keys of this dictionary.
+
+    Core metadata fields that can be specified multiple times are stored as a
+    list or dict depending on which is appropriate for the field. Any fields
+    which hold multiple values in a single field are stored as a list.
+
+    """
+
+    # Metadata 1.0 - PEP 241
+    metadata_version: str
+    name: str
+    version: str
+    platforms: list[str]
+    summary: str
+    description: str
+    keywords: list[str]
+    home_page: str
+    author: str
+    author_email: str
+    license: str
+
+    # Metadata 1.1 - PEP 314
+    supported_platforms: list[str]
+    download_url: str
+    classifiers: list[str]
+    requires: list[str]
+    provides: list[str]
+    obsoletes: list[str]
+
+    # Metadata 1.2 - PEP 345
+    maintainer: str
+    maintainer_email: str
+    requires_dist: list[str]
+    provides_dist: list[str]
+    obsoletes_dist: list[str]
+    requires_python: str
+    requires_external: list[str]
+    project_urls: dict[str, str]
+
+    # Metadata 2.0
+    # PEP 426 attempted to completely revamp the metadata format
+    # but got stuck without ever being able to build consensus on
+    # it and ultimately ended up withdrawn.
+    #
+    # However, a number of tools had started emitting METADATA with
+    # `2.0` Metadata-Version, so for historical reasons, this version
+    # was skipped.
+
+    # Metadata 2.1 - PEP 566
+    description_content_type: str
+    provides_extra: list[str]
+
+    # Metadata 2.2 - PEP 643
+    dynamic: list[str]
+
+    # Metadata 2.3 - PEP 685
+    # No new fields were added in PEP 685, just some edge cases were
+    # tightened up to provide better interoperability.
+
+    # Metadata 2.4 - PEP 639
+    license_expression: str
+    license_files: list[str]
+
+
+_STRING_FIELDS = {
+    "author",
+    "author_email",
+    "description",
+    "description_content_type",
+    "download_url",
+    "home_page",
+    "license",
+    "license_expression",
+    "maintainer",
+    "maintainer_email",
+    "metadata_version",
+    "name",
+    "requires_python",
+    "summary",
+    "version",
+}
+
+_LIST_FIELDS = {
+    "classifiers",
+    "dynamic",
+    "license_files",
+    "obsoletes",
+    "obsoletes_dist",
+    "platforms",
+    "provides",
+    "provides_dist",
+    "provides_extra",
+    "requires",
+    "requires_dist",
+    "requires_external",
+    "supported_platforms",
+}
+
+_DICT_FIELDS = {
+    "project_urls",
+}
+
+
+def _parse_keywords(data: str) -> list[str]:
+    """Split a string of comma-separated keywords into a list of keywords."""
+    return [k.strip() for k in data.split(",")]
+
+
+def _parse_project_urls(data: list[str]) -> dict[str, str]:
+    """Parse a list of label/URL string pairings separated by a comma."""
+    urls = {}
+    for pair in data:
+        # Our logic is slightly tricky here as we want to try and do
+        # *something* reasonable with malformed data.
+        #
+        # The main thing that we have to worry about, is data that does
+        # not have a ',' at all to split the label from the Value. There
+        # isn't a singular right answer here, and we will fail validation
+        # later on (if the caller is validating) so it doesn't *really*
+        # matter, but since the missing value has to be an empty str
+        # and our return value is dict[str, str], if we let the key
+        # be the missing value, then they'd have multiple '' values that
+        # overwrite each other in an accumulating dict.
+        #
+        # The other potential issue is that it's possible to have the
+        # same label multiple times in the metadata, with no solid "right"
+        # answer with what to do in that case. As such, we'll do the only
+        # thing we can, which is treat the field as unparseable and add it
+        # to our list of unparsed fields.
+        parts = [p.strip() for p in pair.split(",", 1)]
+        parts.extend([""] * (max(0, 2 - len(parts))))  # Ensure 2 items
+
+        # TODO: The spec doesn't say anything about if the keys should be
+        #       considered case sensitive or not... logically they should
+        #       be case-preserving and case-insensitive, but doing that
+        #       would open up more cases where we might have duplicate
+        #       entries.
+        label, url = parts
+        if label in urls:
+            # The label already exists in our set of urls, so this field
+            # is unparseable, and we can just add the whole thing to our
+            # unparseable data and stop processing it.
+            raise KeyError("duplicate labels in project urls")
+        urls[label] = url
+
+    return urls
+
+
+def _get_payload(msg: email.message.Message, source: bytes | str) -> str:
+    """Return the body of *msg* as text.
+
+    A ``str`` *source* means the caller has managed encodings for us, so the
+    payload is handed back untouched; a ``bytes`` *source* means we decode the
+    payload as strict UTF-8 and raise ``ValueError`` when that fails.
+    """
+    if not isinstance(source, str):
+        raw_body = msg.get_payload(decode=True)
+        assert isinstance(raw_body, bytes)
+        try:
+            return raw_body.decode("utf8", "strict")
+        except UnicodeDecodeError as exc:
+            raise ValueError("payload in an invalid encoding") from exc
+    text_body = msg.get_payload()
+    assert isinstance(text_body, str)
+    return text_body
+
+
+# The various parse_FORMAT functions here are intended to be as lenient as
+# possible in their parsing, while still returning a correctly typed
+# RawMetadata.
+#
+# To aid in this, we also generally want to do as little touching of the
+# data as possible, except where there are possibly some historic holdovers
+# that make valid data awkward to work with.
+#
+# While this is a lower level, intermediate format than our ``Metadata``
+# class, some light touch ups can make a massive difference in usability.
+
+# Map METADATA fields to RawMetadata.
+_EMAIL_TO_RAW_MAPPING = {
+    "author": "author",
+    "author-email": "author_email",
+    "classifier": "classifiers",
+    "description": "description",
+    "description-content-type": "description_content_type",
+    "download-url": "download_url",
+    "dynamic": "dynamic",
+    "home-page": "home_page",
+    "keywords": "keywords",
+    "license": "license",
+    "license-expression": "license_expression",
+    "license-file": "license_files",
+    "maintainer": "maintainer",
+    "maintainer-email": "maintainer_email",
+    "metadata-version": "metadata_version",
+    "name": "name",
+    "obsoletes": "obsoletes",
+    "obsoletes-dist": "obsoletes_dist",
+    "platform": "platforms",
+    "project-url": "project_urls",
+    "provides": "provides",
+    "provides-dist": "provides_dist",
+    "provides-extra": "provides_extra",
+    "requires": "requires",
+    "requires-dist": "requires_dist",
+    "requires-external": "requires_external",
+    "requires-python": "requires_python",
+    "summary": "summary",
+    "supported-platform": "supported_platforms",
+    "version": "version",
+}
+_RAW_TO_EMAIL_MAPPING = {raw: email for email, raw in _EMAIL_TO_RAW_MAPPING.items()}
+
+
+def parse_email(data: bytes | str) -> tuple[RawMetadata, dict[str, list[str]]]:
+    """Parse a distribution's metadata stored as email headers (e.g. from ``METADATA``).
+
+    This function returns a two-item tuple of dicts. The first dict is of
+    recognized fields from the core metadata specification. Fields that can be
+    parsed and translated into Python's built-in types are converted
+    appropriately. All other fields are left as-is. Fields that are allowed to
+    appear multiple times are stored as lists.
+
+    The second dict contains all other fields from the metadata. This includes
+    any unrecognized fields. It also includes any fields which are expected to
+    be parsed into a built-in type but were not formatted appropriately. Finally,
+    any fields that are expected to appear only once but are repeated are
+    included in this dict.
+
+    """
+    raw: dict[str, str | list[str] | dict[str, str]] = {}
+    unparsed: dict[str, list[str]] = {}
+
+    if isinstance(data, str):
+        parsed = email.parser.Parser(policy=email.policy.compat32).parsestr(data)
+    else:
+        parsed = email.parser.BytesParser(policy=email.policy.compat32).parsebytes(data)
+
+    # We have to wrap parsed.keys() in a set, because in the case of multiple
+    # values for a key (a list), the key will appear multiple times in the
+    # list of keys, but we're avoiding that by using get_all().
+    for name in frozenset(parsed.keys()):
+        # Header names in RFC are case insensitive, so we'll normalize to all
+        # lower case to make comparisons easier.
+        name = name.lower()
+
+        # We use get_all() here, even for fields that aren't multiple use,
+        # because otherwise someone could have e.g. two Name fields, and we
+        # would just silently ignore it rather than doing something about it.
+        headers = parsed.get_all(name) or []
+
+        # The way the email module works when parsing bytes is that it
+        # unconditionally decodes the bytes as ascii using the surrogateescape
+        # handler. When you pull that data back out (such as with get_all() ),
+        # it looks to see if the str has any surrogate escapes, and if it does
+        # it wraps it in a Header object instead of returning the string.
+        #
+        # As such, we'll look for those Header objects, and fix up the encoding.
+        value = []
+        # Flag if we have run into any issues processing the headers, thus
+        # signalling that the data belongs in 'unparsed'.
+        valid_encoding = True
+        for h in headers:
+            # It's unclear if this can return more types than just a Header or
+            # a str, so we'll just assert here to make sure.
+            assert isinstance(h, (email.header.Header, str))
+
+            # If it's a header object, we need to do our little dance to get
+            # the real data out of it. In cases where there is invalid data
+            # we're going to end up with mojibake, but there's no obvious, good
+            # way around that without reimplementing parts of the Header object
+            # ourselves.
+            #
+            # That should be fine since, if mojibake happens, this key is
+            # going into the unparsed dict anyways.
+            if isinstance(h, email.header.Header):
+                # The Header object stores its data as chunks, and each chunk
+                # can be independently encoded, so we'll need to check each
+                # of them.
+                chunks: list[tuple[bytes, str | None]] = []
+                for bin, encoding in email.header.decode_header(h):
+                    try:
+                        bin.decode("utf8", "strict")
+                    except UnicodeDecodeError:
+                        # Enable mojibake.
+                        encoding = "latin1"
+                        valid_encoding = False
+                    else:
+                        encoding = "utf8"
+                    chunks.append((bin, encoding))
+
+                # Turn our chunks back into a Header object, then let that
+                # Header object do the right thing to turn them into a
+                # string for us.
+                value.append(str(email.header.make_header(chunks)))
+            # This is already a string, so just add it.
+            else:
+                value.append(h)
+
+        # We've processed all of our values to get them into a list of str,
+        # but we may have mojibake data, in which case this is an unparsed
+        # field.
+        if not valid_encoding:
+            unparsed[name] = value
+            continue
+
+        raw_name = _EMAIL_TO_RAW_MAPPING.get(name)
+        if raw_name is None:
+            # This is a bit of a weird situation, we've encountered a key that
+            # we don't know what it means, so we don't know whether it's meant
+            # to be a list or not.
+            #
+            # Since we can't really tell one way or another, we'll just leave it
+            # as a list, even though it may be a single item list, because that's
+            # what makes the most sense for email headers.
+            unparsed[name] = value
+            continue
+
+        # If this is one of our string fields, then we'll check to see if our
+        # value is a list of a single item. If it is then we'll assume that
+        # it was emitted as a single string, and unwrap the str from inside
+        # the list.
+        #
+        # If it's any other kind of data, then we haven't the faintest clue
+        # what we should parse it as, and we have to just add it to our list
+        # of unparsed stuff.
+        if raw_name in _STRING_FIELDS and len(value) == 1:
+            raw[raw_name] = value[0]
+        # If this is one of our list of string fields, then we can just assign
+        # the value, since email *only* has strings, and our get_all() call
+        # above ensures that this is a list.
+        elif raw_name in _LIST_FIELDS:
+            raw[raw_name] = value
+        # Special Case: Keywords
+        # The keywords field is implemented in the metadata spec as a str,
+        # but it conceptually is a list of strings, and is serialized using
+        # ", ".join(keywords), so we'll do some light data massaging to turn
+        # this into what it logically is.
+        elif raw_name == "keywords" and len(value) == 1:
+            raw[raw_name] = _parse_keywords(value[0])
+        # Special Case: Project-URL
+        # The project urls is implemented in the metadata spec as a list of
+        # specially-formatted strings that represent a key and a value, which
+        # is fundamentally a mapping, however the email format doesn't support
+        # mappings in a sane way, so it was crammed into a list of strings
+        # instead.
+        #
+        # We will do a little light data massaging to turn this into a map as
+        # it logically should be.
+        elif raw_name == "project_urls":
+            try:
+                raw[raw_name] = _parse_project_urls(value)
+            except KeyError:
+                unparsed[name] = value
+        # Nothing that we've done has managed to parse this, so it'll just
+        # throw it in our unparseable data and move on.
+        else:
+            unparsed[name] = value
+
+    # We need to support getting the Description from the message payload in
+    # addition to getting it from the headers. This does mean, though, there
+    # is the possibility of it being set both ways, in which case we put both
+    # in 'unparsed' since we don't know which is right.
+    try:
+        payload = _get_payload(parsed, data)
+    except ValueError:
+        unparsed.setdefault("description", []).append(
+            parsed.get_payload(decode=isinstance(data, bytes))  # type: ignore[call-overload]
+        )
+    else:
+        if payload:
+            # Check to see if we've already got a description, if so then both
+            # it, and this body move to unparseable.
+            if "description" in raw:
+                description_header = cast(str, raw.pop("description"))
+                unparsed.setdefault("description", []).extend(
+                    [description_header, payload]
+                )
+            elif "description" in unparsed:
+                unparsed["description"].append(payload)
+            else:
+                raw["description"] = payload
+
+    # We need to cast our `raw` to a metadata, because a TypedDict only supports
+    # literal key names, but we're computing our key names on purpose, but the
+    # way this function is implemented, our `TypedDict` can only have valid key
+    # names.
+    return cast(RawMetadata, raw), unparsed
+
+
+_NOT_FOUND = object()
+
+
+# Keep the two values in sync.
+_VALID_METADATA_VERSIONS = ["1.0", "1.1", "1.2", "2.1", "2.2", "2.3", "2.4"]
+_MetadataVersion = Literal["1.0", "1.1", "1.2", "2.1", "2.2", "2.3", "2.4"]
+
+_REQUIRED_ATTRS = frozenset(["metadata_version", "name", "version"])
+
+
+class _Validator(Generic[T]):
+    """Validate a metadata field.
+
+    All _process_*() methods correspond to a core metadata field. The method is
+    called with the field's raw value. If the raw value is valid it is returned
+    in its "enriched" form (e.g. ``version.Version`` for the ``Version`` field).
+    If the raw value is invalid, :exc:`InvalidMetadata` is raised (with a cause
+    as appropriate).
+    """
+
+    name: str
+    raw_name: str
+    added: _MetadataVersion
+
+    def __init__(
+        self,
+        *,
+        added: _MetadataVersion = "1.0",
+    ) -> None:
+        self.added = added
+
+    def __set_name__(self, _owner: Metadata, name: str) -> None:
+        self.name = name
+        self.raw_name = _RAW_TO_EMAIL_MAPPING[name]
+
+    def __get__(self, instance: Metadata, _owner: type[Metadata]) -> T:
+        # With Python 3.8, the caching can be replaced with functools.cached_property().
+        # No need to check the cache as attribute lookup will resolve into the
+        # instance's __dict__ before __get__ is called.
+        cache = instance.__dict__
+        value = instance._raw.get(self.name)
+
+        # To make the _process_* methods easier, we'll check if the value is None
+        # and if this field is NOT a required attribute, and if both of those
+        # things are true, we'll skip the converter. This will mean that the
+        # converters never have to deal with the None union.
+        if self.name in _REQUIRED_ATTRS or value is not None:
+            try:
+                converter: Callable[[Any], T] = getattr(self, f"_process_{self.name}")
+            except AttributeError:
+                pass
+            else:
+                value = converter(value)
+
+        cache[self.name] = value
+        try:
+            del instance._raw[self.name]  # type: ignore[misc]
+        except KeyError:
+            pass
+
+        return cast(T, value)
+
+    def _invalid_metadata(
+        self, msg: str, cause: Exception | None = None
+    ) -> InvalidMetadata:
+        # '{field}' in msg names the field; replace(), as msg may hold user braces.
+        field = repr(self.raw_name)
+        exc = InvalidMetadata(self.raw_name, msg.replace("{field}", field))
+        exc.__cause__ = cause
+        return exc
+
+    def _process_metadata_version(self, value: str) -> _MetadataVersion:
+        # Implicitly makes Metadata-Version required.
+        if value not in _VALID_METADATA_VERSIONS:
+            raise self._invalid_metadata(f"{value!r} is not a valid metadata version")
+        return cast(_MetadataVersion, value)
+
+    def _process_name(self, value: str) -> str:
+        if not value:
+            raise self._invalid_metadata("{field} is a required field")
+        # Validate the name as a side-effect.
+        try:
+            utils.canonicalize_name(value, validate=True)
+        except utils.InvalidName as exc:
+            raise self._invalid_metadata(
+                f"{value!r} is invalid for {{field}}", cause=exc
+            ) from exc
+        else:
+            return value
+
+    def _process_version(self, value: str) -> version_module.Version:
+        if not value:
+            raise self._invalid_metadata("{field} is a required field")
+        try:
+            return version_module.parse(value)
+        except version_module.InvalidVersion as exc:
+            raise self._invalid_metadata(
+                f"{value!r} is invalid for {{field}}", cause=exc
+            ) from exc
+
+    def _process_summary(self, value: str) -> str:
+        """Check the field contains no newlines."""
+        if "\n" in value:
+            raise self._invalid_metadata("{field} must be a single line")
+        return value
+
+    def _process_description_content_type(self, value: str) -> str:
+        """Return *value* if its type, charset and variant are valid, else raise."""
+        content_types = {"text/plain", "text/x-rst", "text/markdown"}
+        message = email.message.EmailMessage()
+        message["content-type"] = value
+        content_type, parameters = (
+            # Defaults to `text/plain` if parsing failed.
+            message.get_content_type().lower(),
+            message["content-type"].params,
+        )
+        # Reject unknown types and values that silently fell back to `text/plain`.
+        if content_type not in content_types or content_type not in value.lower():
+            raise self._invalid_metadata(
+                f"{{field}} must be one of {list(content_types)}, not {value!r}"
+            )
+
+        # charset is a str: format it with !r, as list() would split it into letters.
+        charset = parameters.get("charset", "UTF-8")
+        if charset != "UTF-8":
+            raise self._invalid_metadata(
+                f"{{field}} can only specify the UTF-8 charset, not {charset!r}"
+            )
+
+        markdown_variants = {"GFM", "CommonMark"}
+        variant = parameters.get("variant", "GFM")  # Use an acceptable default.
+        if content_type == "text/markdown" and variant not in markdown_variants:
+            raise self._invalid_metadata(
+                f"valid Markdown variants for {{field}} are {list(markdown_variants)}, "
+                f"not {variant!r}",
+            )
+        return value
+
+    def _process_dynamic(self, value: list[str]) -> list[str]:
+        for dynamic_field in map(str.lower, value):
+            if dynamic_field in {"name", "version", "metadata-version"}:
+                raise self._invalid_metadata(
+                    f"{dynamic_field!r} is not allowed as a dynamic field"
+                )
+            elif dynamic_field not in _EMAIL_TO_RAW_MAPPING:
+                raise self._invalid_metadata(
+                    f"{dynamic_field!r} is not a valid dynamic field"
+                )
+        return list(map(str.lower, value))
+
+    def _process_provides_extra(
+        self,
+        value: list[str],
+    ) -> list[utils.NormalizedName]:
+        normalized_names = []
+        try:
+            for name in value:
+                normalized_names.append(utils.canonicalize_name(name, validate=True))
+        except utils.InvalidName as exc:
+            raise self._invalid_metadata(
+                f"{name!r} is invalid for {{field}}", cause=exc
+            ) from exc
+        else:
+            return normalized_names
+
+    def _process_requires_python(self, value: str) -> specifiers.SpecifierSet:
+        try:
+            return specifiers.SpecifierSet(value)
+        except specifiers.InvalidSpecifier as exc:
+            raise self._invalid_metadata(
+                f"{value!r} is invalid for {{field}}", cause=exc
+            ) from exc
+
+    def _process_requires_dist(
+        self,
+        value: list[str],
+    ) -> list[requirements.Requirement]:
+        reqs = []
+        try:
+            for req in value:
+                reqs.append(requirements.Requirement(req))
+        except requirements.InvalidRequirement as exc:
+            raise self._invalid_metadata(
+                f"{req!r} is invalid for {{field}}", cause=exc
+            ) from exc
+        else:
+            return reqs
+
+    def _process_license_expression(
+        self, value: str
+    ) -> NormalizedLicenseExpression | None:
+        try:
+            return licenses.canonicalize_license_expression(value)
+        except ValueError as exc:
+            raise self._invalid_metadata(
+                f"{value!r} is invalid for {{field}}", cause=exc
+            ) from exc
+
+    def _process_license_files(self, value: list[str]) -> list[str]:
+        paths = []
+        for path in value:
+            if ".." in path:
+                raise self._invalid_metadata(
+                    f"{path!r} is invalid for {{field}}, "
+                    "parent directory indicators are not allowed"
+                )
+            if "*" in path:
+                raise self._invalid_metadata(
+                    f"{path!r} is invalid for {{field}}, paths must be resolved"
+                )
+            if (
+                pathlib.PurePosixPath(path).is_absolute()
+                or pathlib.PureWindowsPath(path).is_absolute()
+            ):
+                raise self._invalid_metadata(
+                    f"{path!r} is invalid for {{field}}, paths must be relative"
+                )
+            if pathlib.PureWindowsPath(path).as_posix() != path:
+                raise self._invalid_metadata(
+                    f"{path!r} is invalid for {{field}}, "
+                    "paths must use '/' delimiter"
+                )
+            paths.append(path)
+        return paths
+
+
+class Metadata:
+    """Representation of distribution metadata.
+
+    Compared to :class:`RawMetadata`, this class provides objects representing
+    metadata fields instead of only using built-in types. Any invalid metadata
+    will cause :exc:`InvalidMetadata` to be raised (with a
+    :py:attr:`~BaseException.__cause__` attribute as appropriate).
+    """
+
+    _raw: RawMetadata
+
+    @classmethod
+    def from_raw(cls, data: RawMetadata, *, validate: bool = True) -> Metadata:
+        """Create an instance from :class:`RawMetadata`.
+
+        If *validate* is true, all metadata will be validated. All exceptions
+        related to validation will be gathered and raised as an :class:`ExceptionGroup`.
+        """
+        ins = cls()
+        ins._raw = data.copy()  # Mutations occur due to caching enriched values.
+
+        if validate:
+            exceptions: list[Exception] = []
+            try:
+                metadata_version = ins.metadata_version
+                metadata_age = _VALID_METADATA_VERSIONS.index(metadata_version)
+            except InvalidMetadata as metadata_version_exc:
+                exceptions.append(metadata_version_exc)
+                metadata_version = None
+
+            # Make sure to check the fields that are present, plus the required
+            # fields (so their absence can be reported).
+            fields_to_check = frozenset(ins._raw) | _REQUIRED_ATTRS
+            # Remove fields that have already been checked.
+            fields_to_check -= {"metadata_version"}
+
+            for key in fields_to_check:
+                try:
+                    if metadata_version:
+                        # Can't use getattr() as that triggers descriptor protocol which
+                        # will fail due to no value for the instance argument.
+                        try:
+                            field_metadata_version = cls.__dict__[key].added
+                        except KeyError:
+                            exc = InvalidMetadata(key, f"unrecognized field: {key!r}")
+                            exceptions.append(exc)
+                            continue
+                        field_age = _VALID_METADATA_VERSIONS.index(
+                            field_metadata_version
+                        )
+                        if field_age > metadata_age:
+                            field = _RAW_TO_EMAIL_MAPPING[key]
+                            exc = InvalidMetadata(
+                                field,
+                                f"{field} introduced in metadata version "
+                                f"{field_metadata_version}, not {metadata_version}",
+                            )
+                            exceptions.append(exc)
+                            continue
+                    getattr(ins, key)
+                except InvalidMetadata as exc:
+                    exceptions.append(exc)
+
+            if exceptions:
+                raise ExceptionGroup("invalid metadata", exceptions)
+
+        return ins
+
+    @classmethod
+    def from_email(cls, data: bytes | str, *, validate: bool = True) -> Metadata:
+        """Parse metadata from email headers.
+
+        If *validate* is true, the metadata will be validated. All exceptions
+        related to validation will be gathered and raised as an :class:`ExceptionGroup`.
+        """
+        raw, unparsed = parse_email(data)
+
+        if validate:
+            exceptions: list[Exception] = []
+            for unparsed_key in unparsed:
+                if unparsed_key in _EMAIL_TO_RAW_MAPPING:
+                    message = f"{unparsed_key!r} has invalid data"
+                else:
+                    message = f"unrecognized field: {unparsed_key!r}"
+                exceptions.append(InvalidMetadata(unparsed_key, message))
+
+            if exceptions:
+                raise ExceptionGroup("unparsed", exceptions)
+
+        try:
+            return cls.from_raw(raw, validate=validate)
+        except ExceptionGroup as exc_group:
+            raise ExceptionGroup(
+                "invalid or unparsed metadata", exc_group.exceptions
+            ) from None
+
+    metadata_version: _Validator[_MetadataVersion] = _Validator()
+    """:external:ref:`core-metadata-metadata-version`
+    (required; validated to be a valid metadata version)"""
+    # `name` is not normalized/typed to NormalizedName so as to provide access to
+    # the original/raw name.
+    name: _Validator[str] = _Validator()
+    """:external:ref:`core-metadata-name`
+    (required; validated using :func:`~packaging.utils.canonicalize_name` and its
+    *validate* parameter)"""
+    version: _Validator[version_module.Version] = _Validator()
+    """:external:ref:`core-metadata-version` (required)"""
+    dynamic: _Validator[list[str] | None] = _Validator(
+        added="2.2",
+    )
+    """:external:ref:`core-metadata-dynamic`
+    (validated against core metadata field names and lowercased)"""
+    platforms: _Validator[list[str] | None] = _Validator()
+    """:external:ref:`core-metadata-platform`"""
+    supported_platforms: _Validator[list[str] | None] = _Validator(added="1.1")
+    """:external:ref:`core-metadata-supported-platform`"""
+    summary: _Validator[str | None] = _Validator()
+    """:external:ref:`core-metadata-summary` (validated to contain no newlines)"""
+    description: _Validator[str | None] = _Validator()  # TODO 2.1: can be in body
+    """:external:ref:`core-metadata-description`"""
+    description_content_type: _Validator[str | None] = _Validator(added="2.1")
+    """:external:ref:`core-metadata-description-content-type` (validated)"""
+    keywords: _Validator[list[str] | None] = _Validator()
+    """:external:ref:`core-metadata-keywords`"""
+    home_page: _Validator[str | None] = _Validator()
+    """:external:ref:`core-metadata-home-page`"""
+    download_url: _Validator[str | None] = _Validator(added="1.1")
+    """:external:ref:`core-metadata-download-url`"""
+    author: _Validator[str | None] = _Validator()
+    """:external:ref:`core-metadata-author`"""
+    author_email: _Validator[str | None] = _Validator()
+    """:external:ref:`core-metadata-author-email`"""
+    maintainer: _Validator[str | None] = _Validator(added="1.2")
+    """:external:ref:`core-metadata-maintainer`"""
+    maintainer_email: _Validator[str | None] = _Validator(added="1.2")
+    """:external:ref:`core-metadata-maintainer-email`"""
+    license: _Validator[str | None] = _Validator()
+    """:external:ref:`core-metadata-license`"""
+    license_expression: _Validator[NormalizedLicenseExpression | None] = _Validator(
+        added="2.4"
+    )
+    """:external:ref:`core-metadata-license-expression`"""
+    license_files: _Validator[list[str] | None] = _Validator(added="2.4")
+    """:external:ref:`core-metadata-license-file`"""
+    classifiers: _Validator[list[str] | None] = _Validator(added="1.1")
+    """:external:ref:`core-metadata-classifier`"""
+    requires_dist: _Validator[list[requirements.Requirement] | None] = _Validator(
+        added="1.2"
+    )
+    """:external:ref:`core-metadata-requires-dist`"""
+    requires_python: _Validator[specifiers.SpecifierSet | None] = _Validator(
+        added="1.2"
+    )
+    """:external:ref:`core-metadata-requires-python`"""
+    # Because `Requires-External` allows for non-PEP 440 version specifiers, we
+    # don't do any processing on the values.
+    requires_external: _Validator[list[str] | None] = _Validator(added="1.2")
+    """:external:ref:`core-metadata-requires-external`"""
+    project_urls: _Validator[dict[str, str] | None] = _Validator(added="1.2")
+    """:external:ref:`core-metadata-project-url`"""
+    # PEP 685 lets us raise an error if an extra doesn't pass `Name` validation
+    # regardless of metadata version.
+    provides_extra: _Validator[list[utils.NormalizedName] | None] = _Validator(
+        added="2.1",
+    )
+    """:external:ref:`core-metadata-provides-extra`"""
+    provides_dist: _Validator[list[str] | None] = _Validator(added="1.2")
+    """:external:ref:`core-metadata-provides-dist`"""
+    obsoletes_dist: _Validator[list[str] | None] = _Validator(added="1.2")
+    """:external:ref:`core-metadata-obsoletes-dist`"""
+    requires: _Validator[list[str] | None] = _Validator(added="1.1")
+    """``Requires`` (deprecated)"""
+    provides: _Validator[list[str] | None] = _Validator(added="1.1")
+    """``Provides`` (deprecated)"""
+    obsoletes: _Validator[list[str] | None] = _Validator(added="1.1")
+    """``Obsoletes`` (deprecated)"""
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/py.typed b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/py.typed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/specifiers.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/specifiers.py
new file mode 100644
index 0000000000000000000000000000000000000000..b30926af8bf4f47efe98eea44d5ded4cb6f7e07d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/specifiers.py
@@ -0,0 +1,1020 @@
+# This file is dual licensed under the terms of the Apache License, Version
+# 2.0, and the BSD License. See the LICENSE file in the root of this repository
+# for complete details.
+"""
+.. testsetup::
+
+    from packaging.specifiers import Specifier, SpecifierSet, InvalidSpecifier
+    from packaging.version import Version
+"""
+
+from __future__ import annotations
+
+import abc
+import itertools
+import re
+from typing import Callable, Iterable, Iterator, TypeVar, Union
+
+from .utils import canonicalize_version
+from .version import Version
+
+UnparsedVersion = Union[Version, str]
+UnparsedVersionVar = TypeVar("UnparsedVersionVar", bound=UnparsedVersion)
+CallableOperator = Callable[[Version, str], bool]
+
+
+def _coerce_version(version: UnparsedVersion) -> Version:
+    if not isinstance(version, Version):
+        version = Version(version)
+    return version
+
+
+class InvalidSpecifier(ValueError):
+    """
+    Raised when attempting to create a :class:`Specifier` with a specifier
+    string that is invalid.
+
+    >>> Specifier("lolwat")
+    Traceback (most recent call last):
+        ...
+    packaging.specifiers.InvalidSpecifier: Invalid specifier: 'lolwat'
+    """
+
+
+class BaseSpecifier(metaclass=abc.ABCMeta):
+    @abc.abstractmethod
+    def __str__(self) -> str:
+        """
+        Returns the str representation of this Specifier-like object. This
+        should be representative of the Specifier itself.
+        """
+
+    @abc.abstractmethod
+    def __hash__(self) -> int:
+        """
+        Returns a hash value for this Specifier-like object.
+        """
+
+    @abc.abstractmethod
+    def __eq__(self, other: object) -> bool:
+        """
+        Returns a boolean representing whether or not the two Specifier-like
+        objects are equal.
+
+        :param other: The other object to check against.
+        """
+
+    @property
+    @abc.abstractmethod
+    def prereleases(self) -> bool | None:
+        """Whether or not pre-releases as a whole are allowed.
+
+        This can be set to either ``True`` or ``False`` to explicitly enable or disable
+        prereleases or it can be set to ``None`` (the default) to use default semantics.
+        """
+
+    @prereleases.setter
+    def prereleases(self, value: bool) -> None:
+        """Setter for :attr:`prereleases`.
+
+        :param value: The value to set.
+        """
+
+    @abc.abstractmethod
+    def contains(self, item: str, prereleases: bool | None = None) -> bool:
+        """
+        Determines if the given item is contained within this specifier.
+        """
+
+    @abc.abstractmethod
+    def filter(
+        self, iterable: Iterable[UnparsedVersionVar], prereleases: bool | None = None
+    ) -> Iterator[UnparsedVersionVar]:
+        """
+        Takes an iterable of items and filters them so that only items which
+        are contained within this specifier are allowed in it.
+        """
+
+
+class Specifier(BaseSpecifier):
+    """This class abstracts handling of version specifiers.
+
+    .. tip::
+
+        It is generally not required to instantiate this manually. You should
+        prefer to work with :class:`SpecifierSet` instead, which can parse
+        comma-separated version specifiers (which is what package metadata contains).
+    """
+
+    _operator_regex_str = r"""
+        (?P<operator>(~=|==|!=|<=|>=|<|>|===))
+        """
+    _version_regex_str = r"""
+        (?P<version>
+            (?:
+                # The identity operators allow for an escape hatch that will
+                # do an exact string match of the version you wish to install.
+                # This will not be parsed by PEP 440 and we cannot determine
+                # any semantic meaning from it. This operator is discouraged
+                # but included entirely as an escape hatch.
+                (?<====)  # Only match for the identity operator
+                \s*
+                [^\s;)]*  # The arbitrary version can be just about anything,
+                          # we match everything except for whitespace, a
+                          # semi-colon for marker support, and a closing paren
+                          # since versions can be enclosed in them.
+            )
+            |
+            (?:
+                # The (non)equality operators allow for wild card and local
+                # versions to be specified so we have to define these two
+                # operators separately to enable that.
+                (?<===|!=)            # Only match for equals and not equals
+
+                \s*
+                v?
+                (?:[0-9]+!)?          # epoch
+                [0-9]+(?:\.[0-9]+)*   # release
+
+                # You cannot use a wild card and a pre-release, post-release, a dev or
+                # local version together so group them with a | and make them optional.
+                (?:
+                    \.\*  # Wild card syntax of .*
+                    |
+                    (?:                                  # pre release
+                        [-_\.]?
+                        (alpha|beta|preview|pre|a|b|c|rc)
+                        [-_\.]?
+                        [0-9]*
+                    )?
+                    (?:                                  # post release
+                        (?:-[0-9]+)|(?:[-_\.]?(post|rev|r)[-_\.]?[0-9]*)
+                    )?
+                    (?:[-_\.]?dev[-_\.]?[0-9]*)?         # dev release
+                    (?:\+[a-z0-9]+(?:[-_\.][a-z0-9]+)*)? # local
+                )?
+            )
+            |
+            (?:
+                # The compatible operator requires at least two digits in the
+                # release segment.
+                (?<=~=)               # Only match for the compatible operator
+
+                \s*
+                v?
+                (?:[0-9]+!)?          # epoch
+                [0-9]+(?:\.[0-9]+)+   # release  (We have a + instead of a *)
+                (?:                   # pre release
+                    [-_\.]?
+                    (alpha|beta|preview|pre|a|b|c|rc)
+                    [-_\.]?
+                    [0-9]*
+                )?
+                (?:                                   # post release
+                    (?:-[0-9]+)|(?:[-_\.]?(post|rev|r)[-_\.]?[0-9]*)
+                )?
+                (?:[-_\.]?dev[-_\.]?[0-9]*)?          # dev release
+            )
+            |
+            (?:
+                # All other operators only allow a sub set of what the
+                # (non)equality operators do. Specifically they do not allow
+                # local versions to be specified nor do they allow the prefix
+                # matching wild cards.
+                (?<!==|!=|~=)         # We have special cases for these
+                                      # operators so we want to make sure they
+                                      # don't match here.
+
+                \s*
+                v?
+                (?:[0-9]+!)?          # epoch
+                [0-9]+(?:\.[0-9]+)*   # release
+                (?:                   # pre release
+                    [-_\.]?
+                    (alpha|beta|preview|pre|a|b|c|rc)
+                    [-_\.]?
+                    [0-9]*
+                )?
+                (?:                                   # post release
+                    (?:-[0-9]+)|(?:[-_\.]?(post|rev|r)[-_\.]?[0-9]*)
+                )?
+                (?:[-_\.]?dev[-_\.]?[0-9]*)?          # dev release
+            )
+        )
+        """
+
+    _regex = re.compile(
+        r"^\s*" + _operator_regex_str + _version_regex_str + r"\s*$",
+        re.VERBOSE | re.IGNORECASE,
+    )
+
+    _operators = {
+        "~=": "compatible",
+        "==": "equal",
+        "!=": "not_equal",
+        "<=": "less_than_equal",
+        ">=": "greater_than_equal",
+        "<": "less_than",
+        ">": "greater_than",
+        "===": "arbitrary",
+    }
+
+    def __init__(self, spec: str = "", prereleases: bool | None = None) -> None:
+        """Initialize a Specifier instance.
+
+        :param spec:
+            The string representation of a specifier which will be parsed and
+            normalized before use.
+        :param prereleases:
+            This tells the specifier if it should accept prerelease versions if
+            applicable or not. The default of ``None`` will autodetect it from the
+            given specifiers.
+        :raises InvalidSpecifier:
+            If the given specifier is invalid (i.e. bad syntax).
+        """
+        match = self._regex.search(spec)
+        if not match:
+            raise InvalidSpecifier(f"Invalid specifier: {spec!r}")
+
+        self._spec: tuple[str, str] = (
+            match.group("operator").strip(),
+            match.group("version").strip(),
+        )
+
+        # Store whether or not this Specifier should accept prereleases
+        self._prereleases = prereleases
+
+    # https://github.com/python/mypy/pull/13475#pullrequestreview-1079784515
+    @property  # type: ignore[override]
+    def prereleases(self) -> bool:
+        # If there is an explicit prereleases set for this, then we'll just
+        # blindly use that.
+        if self._prereleases is not None:
+            return self._prereleases
+
+        # Only the operators listed below can imply pre-release acceptance
+        # ("!=" is not among them): the specifier allows pre-releases when
+        # the version it names is itself a pre-release.
+        operator, version = self._spec
+        if operator in ["==", ">=", "<=", "~=", "===", ">", "<"]:
+            # The == specifier can include a trailing .*, if it does we
+            # want to remove it before parsing.
+            if operator == "==" and version.endswith(".*"):
+                version = version[:-2]
+
+            # A pre-release version means this specifier allows pre-releases.
+            # NOTE(review): "===" versions may not be PEP 440; confirm Version copes.
+            if Version(version).is_prerelease:
+                return True
+
+        return False
+
+    @prereleases.setter
+    def prereleases(self, value: bool) -> None:
+        self._prereleases = value
+
+    @property
+    def operator(self) -> str:
+        """The operator of this specifier.
+
+        >>> Specifier("==1.2.3").operator
+        '=='
+        """
+        return self._spec[0]
+
+    @property
+    def version(self) -> str:
+        """The version of this specifier.
+
+        >>> Specifier("==1.2.3").version
+        '1.2.3'
+        """
+        return self._spec[1]
+
+    def __repr__(self) -> str:
+        """A representation of the Specifier that shows all internal state.
+
+        >>> Specifier('>=1.0.0')
+        <Specifier('>=1.0.0')>
+        >>> Specifier('>=1.0.0', prereleases=False)
+        <Specifier('>=1.0.0', prereleases=False)>
+        >>> Specifier('>=1.0.0', prereleases=True)
+        <Specifier('>=1.0.0', prereleases=True)>
+        """
+        pre = (
+            f", prereleases={self.prereleases!r}"
+            if self._prereleases is not None
+            else ""
+        )
+
+        return f"<{self.__class__.__name__}({str(self)!r}{pre})>"
+
+    def __str__(self) -> str:
+        """A string representation of the Specifier that can be round-tripped.
+
+        >>> str(Specifier('>=1.0.0'))
+        '>=1.0.0'
+        >>> str(Specifier('>=1.0.0', prereleases=False))
+        '>=1.0.0'
+        """
+        return "{}{}".format(*self._spec)
+
+    @property
+    def _canonical_spec(self) -> tuple[str, str]:
+        canonical_version = canonicalize_version(
+            self._spec[1],
+            strip_trailing_zero=(self._spec[0] != "~="),
+        )
+        return self._spec[0], canonical_version
+
+    def __hash__(self) -> int:
+        return hash(self._canonical_spec)
+
+    def __eq__(self, other: object) -> bool:
+        """Whether or not the two Specifier-like objects are equal.
+
+        :param other: The other object to check against.
+
+        The value of :attr:`prereleases` is ignored.
+
+        >>> Specifier("==1.2.3") == Specifier("== 1.2.3.0")
+        True
+        >>> (Specifier("==1.2.3", prereleases=False) ==
+        ...  Specifier("==1.2.3", prereleases=True))
+        True
+        >>> Specifier("==1.2.3") == "==1.2.3"
+        True
+        >>> Specifier("==1.2.3") == Specifier("==1.2.4")
+        False
+        >>> Specifier("==1.2.3") == Specifier("~=1.2.3")
+        False
+        """
+        if isinstance(other, str):
+            try:
+                other = self.__class__(str(other))
+            except InvalidSpecifier:
+                return NotImplemented
+        elif not isinstance(other, self.__class__):
+            return NotImplemented
+
+        return self._canonical_spec == other._canonical_spec
+
+    def _get_operator(self, op: str) -> CallableOperator:
+        operator_callable: CallableOperator = getattr(
+            self, f"_compare_{self._operators[op]}"
+        )
+        return operator_callable
+
+    def _compare_compatible(self, prospective: Version, spec: str) -> bool:
+        # Compatible releases have an equivalent combination of >= and ==. That
+        # is that ~=2.2 is equivalent to >=2.2,==2.*. This allows us to
+        # implement this in terms of the other specifiers instead of
+        # implementing it ourselves. The only thing we need to do is construct
+        # the other specifiers.
+
+        # We want everything but the last item in the version, but we want to
+        # ignore suffix segments.
+        prefix = _version_join(
+            list(itertools.takewhile(_is_not_suffix, _version_split(spec)))[:-1]
+        )
+
+        # Add the prefix notation to the end of our string
+        prefix += ".*"
+
+        return self._get_operator(">=")(prospective, spec) and self._get_operator("==")(
+            prospective, prefix
+        )
+
+    def _compare_equal(self, prospective: Version, spec: str) -> bool:
+        # We need special logic to handle prefix matching
+        if spec.endswith(".*"):
+            # In the case of prefix matching we want to ignore local segment.
+            normalized_prospective = canonicalize_version(
+                prospective.public, strip_trailing_zero=False
+            )
+            # Get the normalized version string ignoring the trailing .*
+            normalized_spec = canonicalize_version(spec[:-2], strip_trailing_zero=False)
+            # Split the spec out by bangs and dots, and pretend that there is
+            # an implicit dot in between a release segment and a pre-release segment.
+            split_spec = _version_split(normalized_spec)
+
+            # Split the prospective version out by bangs and dots, and pretend
+            # that there is an implicit dot in between a release segment and
+            # a pre-release segment.
+            split_prospective = _version_split(normalized_prospective)
+
+            # 0-pad the prospective version before shortening it to get the correct
+            # shortened version.
+            padded_prospective, _ = _pad_version(split_prospective, split_spec)
+
+            # Shorten the prospective version to be the same length as the spec
+            # so that we can determine if the specifier is a prefix of the
+            # prospective version or not.
+            shortened_prospective = padded_prospective[: len(split_spec)]
+
+            return shortened_prospective == split_spec
+        else:
+            # Convert our spec string into a Version
+            spec_version = Version(spec)
+
+            # If the specifier does not have a local segment, then we want to
+            # act as if the prospective version also does not have a local
+            # segment.
+            if not spec_version.local:
+                prospective = Version(prospective.public)
+
+            return prospective == spec_version
+
+    def _compare_not_equal(self, prospective: Version, spec: str) -> bool:
+        return not self._compare_equal(prospective, spec)
+
+    def _compare_less_than_equal(self, prospective: Version, spec: str) -> bool:
+        # NB: Local version identifiers are NOT permitted in the version
+        # specifier, so local version labels can be universally removed from
+        # the prospective version.
+        return Version(prospective.public) <= Version(spec)
+
+    def _compare_greater_than_equal(self, prospective: Version, spec: str) -> bool:
+        # NB: Local version identifiers are NOT permitted in the version
+        # specifier, so local version labels can be universally removed from
+        # the prospective version.
+        return Version(prospective.public) >= Version(spec)
+
+    def _compare_less_than(self, prospective: Version, spec_str: str) -> bool:
+        # Convert our spec to a Version instance, since we'll want to work with
+        # it as a version.
+        spec = Version(spec_str)
+
+        # Check to see if the prospective version is less than the spec
+        # version. If it's not we can short circuit and just return False now
+        # instead of doing extra unneeded work.
+        if not prospective < spec:
+            return False
+
+        # This special case is here so that, unless the specifier itself is
+        # a pre-release version, we do not accept pre-release versions of
+        # the version mentioned in the specifier (e.g. <3.1 should not match
+        # 3.1.dev0, but should match 3.0.dev0).
+        if not spec.is_prerelease and prospective.is_prerelease:
+            if Version(prospective.base_version) == Version(spec.base_version):
+                return False
+
+        # If we've gotten to here, it means that prospective version is both
+        # less than the spec version *and* it's not a pre-release of the same
+        # version in the spec.
+        return True
+
+    def _compare_greater_than(self, prospective: Version, spec_str: str) -> bool:
+        # Convert our spec to a Version instance, since we'll want to work with
+        # it as a version.
+        spec = Version(spec_str)
+
+        # Check to see if the prospective version is greater than the spec
+        # version. If it's not we can short circuit and just return False now
+        # instead of doing extra unneeded work.
+        if not prospective > spec:
+            return False
+
+        # This special case is here so that, unless the specifier itself is
+        # a post-release version, we do not accept post-release versions of
+        # the version mentioned in the specifier
+        # (e.g. >3.1 should not match 3.1.post0, but should match 3.2.post0).
+        if not spec.is_postrelease and prospective.is_postrelease:
+            if Version(prospective.base_version) == Version(spec.base_version):
+                return False
+
+        # Ensure that we do not allow a local version of the version mentioned
+        # in the specifier, which is technically greater than, to match.
+        if prospective.local is not None:
+            if Version(prospective.base_version) == Version(spec.base_version):
+                return False
+
+        # If we've gotten to here, it means that prospective version is both
+        # greater than the spec version *and* it's not a post-release or
+        # local version of the same version in the spec.
+        return True
+
+    def _compare_arbitrary(self, prospective: Version, spec: str) -> bool:
+        return str(prospective).lower() == str(spec).lower()
+
+    def __contains__(self, item: str | Version) -> bool:
+        """Return whether or not the item is contained in this specifier.
+
+        :param item: The item to check for.
+
+        This is used for the ``in`` operator and behaves the same as
+        :meth:`contains` with no ``prereleases`` argument passed.
+
+        >>> "1.2.3" in Specifier(">=1.2.3")
+        True
+        >>> Version("1.2.3") in Specifier(">=1.2.3")
+        True
+        >>> "1.0.0" in Specifier(">=1.2.3")
+        False
+        >>> "1.3.0a1" in Specifier(">=1.2.3")
+        False
+        >>> "1.3.0a1" in Specifier(">=1.2.3", prereleases=True)
+        True
+        """
+        return self.contains(item)
+
+    def contains(self, item: UnparsedVersion, prereleases: bool | None = None) -> bool:
+        """Return whether or not the item is contained in this specifier.
+
+        :param item:
+            The item to check for, which can be a version string or a
+            :class:`Version` instance.
+        :param prereleases:
+            Whether or not to match prereleases with this Specifier. If set to
+            ``None`` (the default), it uses :attr:`prereleases` to determine
+            whether or not prereleases are allowed.
+
+        >>> Specifier(">=1.2.3").contains("1.2.3")
+        True
+        >>> Specifier(">=1.2.3").contains(Version("1.2.3"))
+        True
+        >>> Specifier(">=1.2.3").contains("1.0.0")
+        False
+        >>> Specifier(">=1.2.3").contains("1.3.0a1")
+        False
+        >>> Specifier(">=1.2.3", prereleases=True).contains("1.3.0a1")
+        True
+        >>> Specifier(">=1.2.3").contains("1.3.0a1", prereleases=True)
+        True
+        """
+
+        # Determine if prereleases are to be allowed or not.
+        if prereleases is None:
+            prereleases = self.prereleases
+
+        # Normalize item to a Version, this allows us to have a shortcut for
+        # "2.0" in Specifier(">=2")
+        normalized_item = _coerce_version(item)
+
+        # Determine if we should be supporting prereleases in this specifier
+        # or not; if we do not support prereleases then we can short circuit
+        # the logic when this version is a prerelease.
+        if normalized_item.is_prerelease and not prereleases:
+            return False
+
+        # Actually do the comparison to determine if this item is contained
+        # within this Specifier or not.
+        operator_callable: CallableOperator = self._get_operator(self.operator)
+        return operator_callable(normalized_item, self.version)
+
+    def filter(
+        self, iterable: Iterable[UnparsedVersionVar], prereleases: bool | None = None
+    ) -> Iterator[UnparsedVersionVar]:
+        """Filter items in the given iterable, that match the specifier.
+
+        :param iterable:
+            An iterable that can contain version strings and :class:`Version` instances.
+            The items in the iterable will be filtered according to the specifier.
+        :param prereleases:
+            Whether or not to allow prereleases in the returned iterator. If set to
+            ``None`` (the default), it will intelligently decide whether to allow
+            prereleases or not (based on the :attr:`prereleases` attribute, and
+            whether the only versions matching are prereleases).
+
+        This method is smarter than just ``filter(Specifier().contains, [...])``
+        because it implements the rule from :pep:`440` that a prerelease item
+        SHOULD be accepted if no other versions match the given specifier.
+
+        >>> list(Specifier(">=1.2.3").filter(["1.2", "1.3", "1.5a1"]))
+        ['1.3']
+        >>> list(Specifier(">=1.2.3").filter(["1.2", "1.2.3", "1.3", Version("1.4")]))
+        ['1.2.3', '1.3', <Version('1.4')>]
+        >>> list(Specifier(">=1.2.3").filter(["1.2", "1.5a1"]))
+        ['1.5a1']
+        >>> list(Specifier(">=1.2.3").filter(["1.3", "1.5a1"], prereleases=True))
+        ['1.3', '1.5a1']
+        >>> list(Specifier(">=1.2.3", prereleases=True).filter(["1.3", "1.5a1"]))
+        ['1.3', '1.5a1']
+        """
+
+        yielded = False
+        found_prereleases = []
+
+        kw = {"prereleases": prereleases if prereleases is not None else True}
+
+        # Attempt to iterate over all the values in the iterable and if any of
+        # them match, yield them.
+        for version in iterable:
+            parsed_version = _coerce_version(version)
+
+            if self.contains(parsed_version, **kw):
+                # If our version is a prerelease, and we were not set to allow
+                # prereleases, then we'll store it for later in case nothing
+                # else matches this specifier.
+                if parsed_version.is_prerelease and not (
+                    prereleases or self.prereleases
+                ):
+                    found_prereleases.append(version)
+                # Either this is not a prerelease, or we should have been
+                # accepting prereleases from the beginning.
+                else:
+                    yielded = True
+                    yield version
+
+        # Now that we've iterated over everything, determine if we've yielded
+        # any values, and if we have not and we have any prereleases stored up
+        # then we will go ahead and yield the prereleases.
+        if not yielded and found_prereleases:
+            for version in found_prereleases:
+                yield version
+
+
+_prefix_regex = re.compile(r"^([0-9]+)((?:a|b|c|rc)[0-9]+)$")
+
+
+def _version_split(version: str) -> list[str]:
+    """Split version into components.
+
+    The split components are intended for version comparison. The logic does
+    not attempt to retain the original version string, so joining the
+    components back with :func:`_version_join` may not produce the original
+    version string.
+    """
+    result: list[str] = []
+
+    epoch, _, rest = version.rpartition("!")
+    result.append(epoch or "0")
+
+    for item in rest.split("."):
+        match = _prefix_regex.search(item)
+        if match:
+            result.extend(match.groups())
+        else:
+            result.append(item)
+    return result
+
+
+def _version_join(components: list[str]) -> str:
+    """Join split version components into a version string.
+
+    This function assumes the input came from :func:`_version_split`, where the
+    first component must be the epoch (either empty or numeric), and all other
+    components numeric.
+    """
+    epoch, *rest = components
+    return f"{epoch}!{'.'.join(rest)}"
+
+
+def _is_not_suffix(segment: str) -> bool:
+    return not any(
+        segment.startswith(prefix) for prefix in ("dev", "a", "b", "rc", "post")
+    )
+
+
+def _pad_version(left: list[str], right: list[str]) -> tuple[list[str], list[str]]:
+    left_split, right_split = [], []
+
+    # Get the release segment of our versions
+    left_split.append(list(itertools.takewhile(lambda x: x.isdigit(), left)))
+    right_split.append(list(itertools.takewhile(lambda x: x.isdigit(), right)))
+
+    # Get the rest of our versions
+    left_split.append(left[len(left_split[0]) :])
+    right_split.append(right[len(right_split[0]) :])
+
+    # Insert our padding
+    left_split.insert(1, ["0"] * max(0, len(right_split[0]) - len(left_split[0])))
+    right_split.insert(1, ["0"] * max(0, len(left_split[0]) - len(right_split[0])))
+
+    return (
+        list(itertools.chain.from_iterable(left_split)),
+        list(itertools.chain.from_iterable(right_split)),
+    )
+
+
+class SpecifierSet(BaseSpecifier):
+    """This class abstracts handling of a set of version specifiers.
+
+    It can be passed a single specifier (``>=3.0``), a comma-separated list of
+    specifiers (``>=3.0,!=3.1``), or no specifier at all.
+    """
+
+    def __init__(
+        self,
+        specifiers: str | Iterable[Specifier] = "",
+        prereleases: bool | None = None,
+    ) -> None:
+        """Initialize a SpecifierSet instance.
+
+        :param specifiers:
+            The string representation of a specifier or a comma-separated list of
+            specifiers which will be parsed and normalized before use.
+            May also be an iterable of ``Specifier`` instances, which will be used
+            as is.
+        :param prereleases:
+            This tells the SpecifierSet if it should accept prerelease versions if
+            applicable or not. The default of ``None`` will autodetect it from the
+            given specifiers.
+
+        :raises InvalidSpecifier:
+            If the given ``specifiers`` are not parseable then this exception will be
+            raised.
+        """
+
+        if isinstance(specifiers, str):
+            # Split on `,` to break each individual specifier into its own item, and
+            # strip each item to remove leading/trailing whitespace.
+            split_specifiers = [s.strip() for s in specifiers.split(",") if s.strip()]
+
+            # Make each individual specifier a Specifier and save in a frozen set
+            # for later.
+            self._specs = frozenset(map(Specifier, split_specifiers))
+        else:
+            # Save the supplied specifiers in a frozen set.
+            self._specs = frozenset(specifiers)
+
+        # Store our prereleases value so we can use it later to determine if
+        # we accept prereleases or not.
+        self._prereleases = prereleases
+
+    @property
+    def prereleases(self) -> bool | None:
+        # If we have been given an explicit prerelease modifier, then we'll
+        # pass that through here.
+        if self._prereleases is not None:
+            return self._prereleases
+
+        # If we don't have any specifiers, and we don't have a forced value,
+        # then we'll just return None since we don't know if this should have
+        # pre-releases or not.
+        if not self._specs:
+            return None
+
+        # Otherwise we'll see if any of the given specifiers accept
+        # prereleases, if any of them do we'll return True, otherwise False.
+        return any(s.prereleases for s in self._specs)
+
+    @prereleases.setter
+    def prereleases(self, value: bool) -> None:
+        self._prereleases = value
+
+    def __repr__(self) -> str:
+        """A representation of the specifier set that shows all internal state.
+
+        Note that the ordering of the individual specifiers within the set may not
+        match the input string.
+
+        >>> SpecifierSet('>=1.0.0,!=2.0.0')
+        <SpecifierSet('!=2.0.0,>=1.0.0')>
+        >>> SpecifierSet('>=1.0.0,!=2.0.0', prereleases=False)
+        <SpecifierSet('!=2.0.0,>=1.0.0', prereleases=False)>
+        >>> SpecifierSet('>=1.0.0,!=2.0.0', prereleases=True)
+        <SpecifierSet('!=2.0.0,>=1.0.0', prereleases=True)>
+        """
+        pre = (
+            f", prereleases={self.prereleases!r}"
+            if self._prereleases is not None
+            else ""
+        )
+
+        return f"<SpecifierSet({str(self)!r}{pre})>"
+
+    def __str__(self) -> str:
+        """A string representation of the specifier set that can be round-tripped.
+
+        Note that the ordering of the individual specifiers within the set may not
+        match the input string.
+
+        >>> str(SpecifierSet(">=1.0.0,!=1.0.1"))
+        '!=1.0.1,>=1.0.0'
+        >>> str(SpecifierSet(">=1.0.0,!=1.0.1", prereleases=False))
+        '!=1.0.1,>=1.0.0'
+        """
+        return ",".join(sorted(str(s) for s in self._specs))
+
+    def __hash__(self) -> int:
+        return hash(self._specs)
+
+    def __and__(self, other: SpecifierSet | str) -> SpecifierSet:
+        """Return a SpecifierSet which is a combination of the two sets.
+
+        :param other: The other object to combine with.
+
+        >>> SpecifierSet(">=1.0.0,!=1.0.1") & '<=2.0.0,!=2.0.1'
+        <SpecifierSet('!=1.0.1,!=2.0.1,<=2.0.0,>=1.0.0')>
+        >>> SpecifierSet(">=1.0.0,!=1.0.1") & SpecifierSet('<=2.0.0,!=2.0.1')
+        <SpecifierSet('!=1.0.1,!=2.0.1,<=2.0.0,>=1.0.0')>
+        """
+        if isinstance(other, str):
+            other = SpecifierSet(other)
+        elif not isinstance(other, SpecifierSet):
+            return NotImplemented
+
+        specifier = SpecifierSet()
+        specifier._specs = frozenset(self._specs | other._specs)
+
+        if self._prereleases is None and other._prereleases is not None:
+            specifier._prereleases = other._prereleases
+        elif self._prereleases is not None and other._prereleases is None:
+            specifier._prereleases = self._prereleases
+        elif self._prereleases == other._prereleases:
+            specifier._prereleases = self._prereleases
+        else:
+            raise ValueError(
+                "Cannot combine SpecifierSets with True and False prerelease "
+                "overrides."
+            )
+
+        return specifier
+
+    def __eq__(self, other: object) -> bool:
+        """Whether or not the two SpecifierSet-like objects are equal.
+
+        :param other: The other object to check against.
+
+        The value of :attr:`prereleases` is ignored.
+
+        >>> SpecifierSet(">=1.0.0,!=1.0.1") == SpecifierSet(">=1.0.0,!=1.0.1")
+        True
+        >>> (SpecifierSet(">=1.0.0,!=1.0.1", prereleases=False) ==
+        ...  SpecifierSet(">=1.0.0,!=1.0.1", prereleases=True))
+        True
+        >>> SpecifierSet(">=1.0.0,!=1.0.1") == ">=1.0.0,!=1.0.1"
+        True
+        >>> SpecifierSet(">=1.0.0,!=1.0.1") == SpecifierSet(">=1.0.0")
+        False
+        >>> SpecifierSet(">=1.0.0,!=1.0.1") == SpecifierSet(">=1.0.0,!=1.0.2")
+        False
+        """
+        if isinstance(other, (str, Specifier)):
+            other = SpecifierSet(str(other))
+        elif not isinstance(other, SpecifierSet):
+            return NotImplemented
+
+        return self._specs == other._specs
+
+    def __len__(self) -> int:
+        """Returns the number of specifiers in this specifier set."""
+        return len(self._specs)
+
+    def __iter__(self) -> Iterator[Specifier]:
+        """
+        Returns an iterator over all the underlying :class:`Specifier` instances
+        in this specifier set.
+
+        >>> sorted(SpecifierSet(">=1.0.0,!=1.0.1"), key=str)
+        [<Specifier('!=1.0.1')>, <Specifier('>=1.0.0')>]
+        """
+        return iter(self._specs)
+
+    def __contains__(self, item: UnparsedVersion) -> bool:
+        """Return whether or not the item is contained in this specifier.
+
+        :param item: The item to check for.
+
+        This is used for the ``in`` operator and behaves the same as
+        :meth:`contains` with no ``prereleases`` argument passed.
+
+        >>> "1.2.3" in SpecifierSet(">=1.0.0,!=1.0.1")
+        True
+        >>> Version("1.2.3") in SpecifierSet(">=1.0.0,!=1.0.1")
+        True
+        >>> "1.0.1" in SpecifierSet(">=1.0.0,!=1.0.1")
+        False
+        >>> "1.3.0a1" in SpecifierSet(">=1.0.0,!=1.0.1")
+        False
+        >>> "1.3.0a1" in SpecifierSet(">=1.0.0,!=1.0.1", prereleases=True)
+        True
+        """
+        return self.contains(item)
+
+    def contains(
+        self,
+        item: UnparsedVersion,
+        prereleases: bool | None = None,
+        installed: bool | None = None,
+    ) -> bool:
+        """Return whether or not the item is contained in this SpecifierSet.
+
+        :param item:
+            The item to check for, which can be a version string or a
+            :class:`Version` instance.
+        :param prereleases:
+            Whether or not to match prereleases with this SpecifierSet. If set to
+            ``None`` (the default), it uses :attr:`prereleases` to determine
+            whether or not prereleases are allowed.
+
+        >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.2.3")
+        True
+        >>> SpecifierSet(">=1.0.0,!=1.0.1").contains(Version("1.2.3"))
+        True
+        >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.0.1")
+        False
+        >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.3.0a1")
+        False
+        >>> SpecifierSet(">=1.0.0,!=1.0.1", prereleases=True).contains("1.3.0a1")
+        True
+        >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.3.0a1", prereleases=True)
+        True
+        """
+        # Ensure that our item is a Version instance.
+        if not isinstance(item, Version):
+            item = Version(item)
+
+        # Determine if we're forcing a prerelease or not, if we're not forcing
+        # one for this particular filter call, then we'll use whatever the
+        # SpecifierSet thinks for whether or not we should support prereleases.
+        if prereleases is None:
+            prereleases = self.prereleases
+
+        # We can determine if we're going to allow pre-releases by looking to
+        # see if any of the underlying items supports them. If none of them do
+        # and this item is a pre-release then we do not allow it and we can
+        # short circuit that here.
+        # Note: This means that 1.0.dev1 would not be contained in something
+        #       like >=1.0.devabc however it would be in >=1.0.devabc,>0.0.dev0
+        if not prereleases and item.is_prerelease:
+            return False
+
+        if installed and item.is_prerelease:
+            item = Version(item.base_version)
+
+        # We simply dispatch to the underlying specs here to make sure that the
+        # given version is contained within all of them.
+        # Note: This use of all() here means that an empty set of specifiers
+        #       will always return True, this is an explicit design decision.
+        return all(s.contains(item, prereleases=prereleases) for s in self._specs)
+
+    def filter(
+        self, iterable: Iterable[UnparsedVersionVar], prereleases: bool | None = None
+    ) -> Iterator[UnparsedVersionVar]:
+        """Filter items in the given iterable, that match the specifiers in this set.
+
+        :param iterable:
+            An iterable that can contain version strings and :class:`Version` instances.
+            The items in the iterable will be filtered according to the specifier.
+        :param prereleases:
+            Whether or not to allow prereleases in the returned iterator. If set to
+            ``None`` (the default), it will intelligently decide whether to allow
+            prereleases or not (based on the :attr:`prereleases` attribute, and
+            whether the only versions matching are prereleases).
+
+        This method is smarter than just ``filter(SpecifierSet(...).contains, [...])``
+        because it implements the rule from :pep:`440` that a prerelease item
+        SHOULD be accepted if no other versions match the given specifier.
+
+        >>> list(SpecifierSet(">=1.2.3").filter(["1.2", "1.3", "1.5a1"]))
+        ['1.3']
+        >>> list(SpecifierSet(">=1.2.3").filter(["1.2", "1.3", Version("1.4")]))
+        ['1.3', <Version('1.4')>]
+        >>> list(SpecifierSet(">=1.2.3").filter(["1.2", "1.5a1"]))
+        []
+        >>> list(SpecifierSet(">=1.2.3").filter(["1.3", "1.5a1"], prereleases=True))
+        ['1.3', '1.5a1']
+        >>> list(SpecifierSet(">=1.2.3", prereleases=True).filter(["1.3", "1.5a1"]))
+        ['1.3', '1.5a1']
+
+        An "empty" SpecifierSet will filter items based on the presence of prerelease
+        versions in the set.
+
+        >>> list(SpecifierSet("").filter(["1.3", "1.5a1"]))
+        ['1.3']
+        >>> list(SpecifierSet("").filter(["1.5a1"]))
+        ['1.5a1']
+        >>> list(SpecifierSet("", prereleases=True).filter(["1.3", "1.5a1"]))
+        ['1.3', '1.5a1']
+        >>> list(SpecifierSet("").filter(["1.3", "1.5a1"], prereleases=True))
+        ['1.3', '1.5a1']
+        """
+        # Determine if we're forcing a prerelease or not, if we're not forcing
+        # one for this particular filter call, then we'll use whatever the
+        # SpecifierSet thinks for whether or not we should support prereleases.
+        if prereleases is None:
+            prereleases = self.prereleases
+
+        # If we have any specifiers, then we want to wrap our iterable in the
+        # filter method for each one, this will act as a logical AND amongst
+        # each specifier.
+        if self._specs:
+            for spec in self._specs:
+                iterable = spec.filter(iterable, prereleases=bool(prereleases))
+            return iter(iterable)
+        # If we do not have any specifiers, then we need to have a rough filter
+        # which will filter out any pre-releases, unless there are no final
+        # releases.
+        else:
+            filtered: list[UnparsedVersionVar] = []
+            found_prereleases: list[UnparsedVersionVar] = []
+
+            for item in iterable:
+                parsed_version = _coerce_version(item)
+
+                # Store any item which is a pre-release for later unless we've
+                # already found a final version or we are accepting prereleases
+                if parsed_version.is_prerelease and not prereleases:
+                    if not filtered:
+                        found_prereleases.append(item)
+                else:
+                    filtered.append(item)
+
+            # If we've found no items except for pre-releases, then we'll go
+            # ahead and use the pre-releases
+            if not filtered and found_prereleases and prereleases is None:
+                return iter(found_prereleases)
+
+            return iter(filtered)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/tags.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/tags.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5903402abb5a0aed37bb23914f678ef7e34a554
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/tags.py
@@ -0,0 +1,617 @@
+# This file is dual licensed under the terms of the Apache License, Version
+# 2.0, and the BSD License. See the LICENSE file in the root of this repository
+# for complete details.
+
+from __future__ import annotations
+
+import logging
+import platform
+import re
+import struct
+import subprocess
+import sys
+import sysconfig
+from importlib.machinery import EXTENSION_SUFFIXES
+from typing import (
+    Iterable,
+    Iterator,
+    Sequence,
+    Tuple,
+    cast,
+)
+
+from . import _manylinux, _musllinux
+
+logger = logging.getLogger(__name__)
+
+PythonVersion = Sequence[int]
+AppleVersion = Tuple[int, int]
+
+INTERPRETER_SHORT_NAMES: dict[str, str] = {
+    "python": "py",  # Generic.
+    "cpython": "cp",
+    "pypy": "pp",
+    "ironpython": "ip",
+    "jython": "jy",
+}
+
+
+_32_BIT_INTERPRETER = struct.calcsize("P") == 4
+
+
+class Tag:
+    """
+    A representation of the tag triple for a wheel.
+
+    Instances are considered immutable and thus are hashable. Equality checking
+    is also supported.
+    """
+
+    __slots__ = ["_abi", "_hash", "_interpreter", "_platform"]
+
+    def __init__(self, interpreter: str, abi: str, platform: str) -> None:
+        self._interpreter = interpreter.lower()
+        self._abi = abi.lower()
+        self._platform = platform.lower()
+        # The __hash__ of every single element in a Set[Tag] will be evaluated each time
+        # that a set calls its `.isdisjoint()` method, which may be called hundreds of
+        # times when scanning a page of links for packages with tags matching that
+        # Set[Tag]. Pre-computing the value here produces significant speedups for
+        # downstream consumers.
+        self._hash = hash((self._interpreter, self._abi, self._platform))
+
+    @property
+    def interpreter(self) -> str:
+        return self._interpreter
+
+    @property
+    def abi(self) -> str:
+        return self._abi
+
+    @property
+    def platform(self) -> str:
+        return self._platform
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Tag):
+            return NotImplemented
+
+        return (
+            (self._hash == other._hash)  # Short-circuit ASAP for perf reasons.
+            and (self._platform == other._platform)
+            and (self._abi == other._abi)
+            and (self._interpreter == other._interpreter)
+        )
+
+    def __hash__(self) -> int:
+        return self._hash
+
+    def __str__(self) -> str:
+        return f"{self._interpreter}-{self._abi}-{self._platform}"
+
+    def __repr__(self) -> str:
+        return f"<{self} @ {id(self)}>"
+
+
+def parse_tag(tag: str) -> frozenset[Tag]:
+    """
+    Parses the provided tag (e.g. `py3-none-any`) into a frozenset of Tag instances.
+
+    Returning a set is required due to the possibility that the tag is a
+    compressed tag set.
+    """
+    tags = set()
+    interpreters, abis, platforms = tag.split("-")
+    for interpreter in interpreters.split("."):
+        for abi in abis.split("."):
+            for platform_ in platforms.split("."):
+                tags.add(Tag(interpreter, abi, platform_))
+    return frozenset(tags)
+
+
+def _get_config_var(name: str, warn: bool = False) -> int | str | None:
+    value: int | str | None = sysconfig.get_config_var(name)
+    if value is None and warn:
+        logger.debug(
+            "Config variable '%s' is unset, Python ABI tag may be incorrect", name
+        )
+    return value
+
+
+def _normalize_string(string: str) -> str:
+    return string.replace(".", "_").replace("-", "_").replace(" ", "_")
+
+
+def _is_threaded_cpython(abis: list[str]) -> bool:
+    """
+    Determine if the ABI corresponds to a threaded (`--disable-gil`) build.
+
+    The threaded builds are indicated by a "t" in the abiflags.
+    """
+    if len(abis) == 0:
+        return False
+    # expect e.g., cp313
+    m = re.match(r"cp\d+(.*)", abis[0])
+    if not m:
+        return False
+    abiflags = m.group(1)
+    return "t" in abiflags
+
+
+def _abi3_applies(python_version: PythonVersion, threading: bool) -> bool:
+    """
+    Determine if the Python version supports abi3.
+
+    PEP 384 was first implemented in Python 3.2. The threaded (`--disable-gil`)
+    builds do not support abi3.
+    """
+    return len(python_version) > 1 and tuple(python_version) >= (3, 2) and not threading
+
+
+def _cpython_abis(py_version: PythonVersion, warn: bool = False) -> list[str]:
+    py_version = tuple(py_version)  # To allow for version comparison.
+    abis = []
+    version = _version_nodot(py_version[:2])
+    threading = debug = pymalloc = ucs4 = ""
+    with_debug = _get_config_var("Py_DEBUG", warn)
+    has_refcount = hasattr(sys, "gettotalrefcount")
+    # Windows doesn't set Py_DEBUG, so checking for support of debug-compiled
+    # extension modules is the best option.
+    # https://github.com/pypa/pip/issues/3383#issuecomment-173267692
+    has_ext = "_d.pyd" in EXTENSION_SUFFIXES
+    if with_debug or (with_debug is None and (has_refcount or has_ext)):
+        debug = "d"
+    if py_version >= (3, 13) and _get_config_var("Py_GIL_DISABLED", warn):
+        threading = "t"
+    if py_version < (3, 8):
+        with_pymalloc = _get_config_var("WITH_PYMALLOC", warn)
+        if with_pymalloc or with_pymalloc is None:
+            pymalloc = "m"
+        if py_version < (3, 3):
+            unicode_size = _get_config_var("Py_UNICODE_SIZE", warn)
+            if unicode_size == 4 or (
+                unicode_size is None and sys.maxunicode == 0x10FFFF
+            ):
+                ucs4 = "u"
+    elif debug:
+        # Debug builds can also load "normal" extension modules.
+        # We can also assume no UCS-4 or pymalloc requirement.
+        abis.append(f"cp{version}{threading}")
+    abis.insert(0, f"cp{version}{threading}{debug}{pymalloc}{ucs4}")
+    return abis
+
+
+def cpython_tags(
+    python_version: PythonVersion | None = None,
+    abis: Iterable[str] | None = None,
+    platforms: Iterable[str] | None = None,
+    *,
+    warn: bool = False,
+) -> Iterator[Tag]:
+    """
+    Yields the tags for a CPython interpreter.
+
+    The tags consist of:
+    - cp<python_version>-<abi>-<platform>
+    - cp<python_version>-abi3-<platform>
+    - cp<python_version>-none-<platform>
+    - cp<less than python_version>-abi3-<platform>  # Older Python versions down to 3.2.
+
+    If python_version only specifies a major version then user-provided ABIs and
+    the 'none' ABI tag will be used.
+
+    If 'abi3' or 'none' are specified in 'abis' then they will be yielded at
+    their normal position and not at the beginning.
+    """
+    if not python_version:
+        python_version = sys.version_info[:2]
+
+    interpreter = f"cp{_version_nodot(python_version[:2])}"
+
+    if abis is None:
+        if len(python_version) > 1:
+            abis = _cpython_abis(python_version, warn)
+        else:
+            abis = []
+    abis = list(abis)
+    # 'abi3' and 'none' are explicitly handled later.
+    for explicit_abi in ("abi3", "none"):
+        try:
+            abis.remove(explicit_abi)
+        except ValueError:
+            pass
+
+    platforms = list(platforms or platform_tags())
+    for abi in abis:
+        for platform_ in platforms:
+            yield Tag(interpreter, abi, platform_)
+
+    threading = _is_threaded_cpython(abis)
+    use_abi3 = _abi3_applies(python_version, threading)
+    if use_abi3:
+        yield from (Tag(interpreter, "abi3", platform_) for platform_ in platforms)
+    yield from (Tag(interpreter, "none", platform_) for platform_ in platforms)
+
+    if use_abi3:
+        for minor_version in range(python_version[1] - 1, 1, -1):
+            for platform_ in platforms:
+                version = _version_nodot((python_version[0], minor_version))
+                interpreter = f"cp{version}"
+                yield Tag(interpreter, "abi3", platform_)
+
+
+def _generic_abi() -> list[str]:
+    """
+    Return the ABI tag based on EXT_SUFFIX.
+    """
+    # The following are examples of `EXT_SUFFIX`.
+    # We want to keep the parts which are related to the ABI and remove the
+    # parts which are related to the platform:
+    # - linux:   '.cpython-310-x86_64-linux-gnu.so' => cp310
+    # - mac:     '.cpython-310-darwin.so'           => cp310
+    # - win:     '.cp310-win_amd64.pyd'             => cp310
+    # - win:     '.pyd'                             => cp37 (uses _cpython_abis())
+    # - pypy:    '.pypy38-pp73-x86_64-linux-gnu.so' => pypy38_pp73
+    # - graalpy: '.graalpy-38-native-x86_64-darwin.dylib'
+    #                                               => graalpy_38_native
+
+    ext_suffix = _get_config_var("EXT_SUFFIX", warn=True)
+    if not isinstance(ext_suffix, str) or ext_suffix[0] != ".":
+        raise SystemError("invalid sysconfig.get_config_var('EXT_SUFFIX')")
+    parts = ext_suffix.split(".")
+    if len(parts) < 3:
+        # CPython3.7 and earlier uses ".pyd" on Windows.
+        return _cpython_abis(sys.version_info[:2])
+    soabi = parts[1]
+    if soabi.startswith("cpython"):
+        # non-windows
+        abi = "cp" + soabi.split("-")[1]
+    elif soabi.startswith("cp"):
+        # windows
+        abi = soabi.split("-")[0]
+    elif soabi.startswith("pypy"):
+        abi = "-".join(soabi.split("-")[:2])
+    elif soabi.startswith("graalpy"):
+        abi = "-".join(soabi.split("-")[:3])
+    elif soabi:
+        # pyston, ironpython, others?
+        abi = soabi
+    else:
+        return []
+    return [_normalize_string(abi)]
+
+
+def generic_tags(
+    interpreter: str | None = None,
+    abis: Iterable[str] | None = None,
+    platforms: Iterable[str] | None = None,
+    *,
+    warn: bool = False,
+) -> Iterator[Tag]:
+    """
+    Yields the tags for a generic interpreter.
+
+    The tags consist of:
+    - <interpreter>-<abi>-<platform>
+
+    The "none" ABI will be added if it was not explicitly provided.
+    """
+    if not interpreter:
+        interp_name = interpreter_name()
+        interp_version = interpreter_version(warn=warn)
+        interpreter = "".join([interp_name, interp_version])
+    if abis is None:
+        abis = _generic_abi()
+    else:
+        abis = list(abis)
+    platforms = list(platforms or platform_tags())
+    if "none" not in abis:
+        abis.append("none")
+    for abi in abis:
+        for platform_ in platforms:
+            yield Tag(interpreter, abi, platform_)
+
+
+def _py_interpreter_range(py_version: PythonVersion) -> Iterator[str]:
+    """
+    Yields Python versions in descending order.
+
+    After the latest version, the major-only version will be yielded, and then
+    all previous versions of that major version.
+    """
+    if len(py_version) > 1:
+        yield f"py{_version_nodot(py_version[:2])}"
+    yield f"py{py_version[0]}"
+    if len(py_version) > 1:
+        for minor in range(py_version[1] - 1, -1, -1):
+            yield f"py{_version_nodot((py_version[0], minor))}"
+
+
+def compatible_tags(
+    python_version: PythonVersion | None = None,
+    interpreter: str | None = None,
+    platforms: Iterable[str] | None = None,
+) -> Iterator[Tag]:
+    """
+    Yields the sequence of tags that are compatible with a specific version of Python.
+
+    The tags consist of:
+    - py*-none-<platform>
+    - <interpreter>-none-any  # ... if `interpreter` is provided.
+    - py*-none-any
+    """
+    if not python_version:
+        python_version = sys.version_info[:2]
+    platforms = list(platforms or platform_tags())
+    for version in _py_interpreter_range(python_version):
+        for platform_ in platforms:
+            yield Tag(version, "none", platform_)
+    if interpreter:
+        yield Tag(interpreter, "none", "any")
+    for version in _py_interpreter_range(python_version):
+        yield Tag(version, "none", "any")
+
+
+def _mac_arch(arch: str, is_32bit: bool = _32_BIT_INTERPRETER) -> str:
+    if not is_32bit:
+        return arch
+
+    if arch.startswith("ppc"):
+        return "ppc"
+
+    return "i386"
+
+
+def _mac_binary_formats(version: AppleVersion, cpu_arch: str) -> list[str]:
+    formats = [cpu_arch]
+    if cpu_arch == "x86_64":
+        if version < (10, 4):
+            return []
+        formats.extend(["intel", "fat64", "fat32"])
+
+    elif cpu_arch == "i386":
+        if version < (10, 4):
+            return []
+        formats.extend(["intel", "fat32", "fat"])
+
+    elif cpu_arch == "ppc64":
+        # TODO: Need to care about 32-bit PPC for ppc64 through 10.2?
+        if version > (10, 5) or version < (10, 4):
+            return []
+        formats.append("fat64")
+
+    elif cpu_arch == "ppc":
+        if version > (10, 6):
+            return []
+        formats.extend(["fat32", "fat"])
+
+    if cpu_arch in {"arm64", "x86_64"}:
+        formats.append("universal2")
+
+    if cpu_arch in {"x86_64", "i386", "ppc64", "ppc", "intel"}:
+        formats.append("universal")
+
+    return formats
+
+
+def mac_platforms(
+    version: AppleVersion | None = None, arch: str | None = None
+) -> Iterator[str]:
+    """
+    Yields the platform tags for a macOS system.
+
+    The `version` parameter is a two-item tuple specifying the macOS version to
+    generate platform tags for. The `arch` parameter is the CPU architecture to
+    generate platform tags for. Both parameters default to the appropriate value
+    for the current system.
+    """
+    version_str, _, cpu_arch = platform.mac_ver()
+    if version is None:
+        version = cast("AppleVersion", tuple(map(int, version_str.split(".")[:2])))
+        if version == (10, 16):
+            # When built against an older macOS SDK, Python will report macOS 10.16
+            # instead of the real version.
+            version_str = subprocess.run(
+                [
+                    sys.executable,
+                    "-sS",
+                    "-c",
+                    "import platform; print(platform.mac_ver()[0])",
+                ],
+                check=True,
+                env={"SYSTEM_VERSION_COMPAT": "0"},
+                stdout=subprocess.PIPE,
+                text=True,
+            ).stdout
+            version = cast("AppleVersion", tuple(map(int, version_str.split(".")[:2])))
+    else:
+        version = version
+    if arch is None:
+        arch = _mac_arch(cpu_arch)
+    else:
+        arch = arch
+
+    if (10, 0) <= version and version < (11, 0):
+        # Prior to Mac OS 11, each yearly release of Mac OS bumped the
+        # "minor" version number.  The major version was always 10.
+        major_version = 10
+        for minor_version in range(version[1], -1, -1):
+            compat_version = major_version, minor_version
+            binary_formats = _mac_binary_formats(compat_version, arch)
+            for binary_format in binary_formats:
+                yield f"macosx_{major_version}_{minor_version}_{binary_format}"
+
+    if version >= (11, 0):
+        # Starting with Mac OS 11, each yearly release bumps the major version
+        # number.   The minor versions are now the midyear updates.
+        minor_version = 0
+        for major_version in range(version[0], 10, -1):
+            compat_version = major_version, minor_version
+            binary_formats = _mac_binary_formats(compat_version, arch)
+            for binary_format in binary_formats:
+                yield f"macosx_{major_version}_{minor_version}_{binary_format}"
+
+    if version >= (11, 0):
+        # Mac OS 11 on x86_64 is compatible with binaries from previous releases.
+        # Arm64 support was introduced in 11.0, so no Arm binaries from previous
+        # releases exist.
+        #
+        # However, the "universal2" binary format can have a
+        # macOS version earlier than 11.0 when the x86_64 part of the binary supports
+        # that version of macOS.
+        major_version = 10
+        if arch == "x86_64":
+            for minor_version in range(16, 3, -1):
+                compat_version = major_version, minor_version
+                binary_formats = _mac_binary_formats(compat_version, arch)
+                for binary_format in binary_formats:
+                    yield f"macosx_{major_version}_{minor_version}_{binary_format}"
+        else:
+            for minor_version in range(16, 3, -1):
+                compat_version = major_version, minor_version
+                binary_format = "universal2"
+                yield f"macosx_{major_version}_{minor_version}_{binary_format}"
+
+
+def ios_platforms(
+    version: AppleVersion | None = None, multiarch: str | None = None
+) -> Iterator[str]:
+    """
+    Yields the platform tags for an iOS system.
+
+    :param version: A two-item tuple specifying the iOS version to generate
+        platform tags for. Defaults to the current iOS version.
+    :param multiarch: The CPU architecture+ABI to generate platform tags for -
+        (the value used by `sys.implementation._multiarch` e.g.,
+        `arm64_iphoneos` or `x86_64_iphonesimulator`). Defaults to the current
+        multiarch value.
+    """
+    if version is None:
+        # if iOS is the current platform, ios_ver *must* be defined. However,
+        # it won't exist for CPython versions before 3.13, which causes a mypy
+        # error.
+        _, release, _, _ = platform.ios_ver()  # type: ignore[attr-defined, unused-ignore]
+        version = cast("AppleVersion", tuple(map(int, release.split(".")[:2])))
+
+    if multiarch is None:
+        multiarch = sys.implementation._multiarch
+    multiarch = multiarch.replace("-", "_")
+
+    ios_platform_template = "ios_{major}_{minor}_{multiarch}"
+
+    # Consider any iOS major.minor version from the version requested, down to
+    # 12.0. 12.0 is the first iOS version that is known to have enough features
+    # to support CPython. Consider every possible minor release up to X.9. The
+    # highest the minor has ever gone is 8 (14.8 and 15.8) but having some extra
+    # candidates that won't ever match doesn't really hurt, and it saves us from
+    # having to keep an explicit list of known iOS versions in the code. Return
+    # the results in descending order of version number.
+
+    # If the requested major version is less than 12, there won't be any matches.
+    if version[0] < 12:
+        return
+
+    # Consider the actual X.Y version that was requested.
+    yield ios_platform_template.format(
+        major=version[0], minor=version[1], multiarch=multiarch
+    )
+
+    # Consider every minor version from X.0 to the minor version prior to the
+    # version requested by the platform.
+    for minor in range(version[1] - 1, -1, -1):
+        yield ios_platform_template.format(
+            major=version[0], minor=minor, multiarch=multiarch
+        )
+
+    for major in range(version[0] - 1, 11, -1):
+        for minor in range(9, -1, -1):
+            yield ios_platform_template.format(
+                major=major, minor=minor, multiarch=multiarch
+            )
+
+
+def _linux_platforms(is_32bit: bool = _32_BIT_INTERPRETER) -> Iterator[str]:
+    linux = _normalize_string(sysconfig.get_platform())
+    if not linux.startswith("linux_"):
+        # we should never be here, just yield the sysconfig one and return
+        yield linux
+        return
+    if is_32bit:
+        if linux == "linux_x86_64":
+            linux = "linux_i686"
+        elif linux == "linux_aarch64":
+            linux = "linux_armv8l"
+    _, arch = linux.split("_", 1)
+    archs = {"armv8l": ["armv8l", "armv7l"]}.get(arch, [arch])
+    yield from _manylinux.platform_tags(archs)
+    yield from _musllinux.platform_tags(archs)
+    for arch in archs:
+        yield f"linux_{arch}"
+
+
+def _generic_platforms() -> Iterator[str]:
+    yield _normalize_string(sysconfig.get_platform())
+
+
+def platform_tags() -> Iterator[str]:
+    """
+    Provides the platform tags for this installation.
+    """
+    if platform.system() == "Darwin":
+        return mac_platforms()
+    elif platform.system() == "iOS":
+        return ios_platforms()
+    elif platform.system() == "Linux":
+        return _linux_platforms()
+    else:
+        return _generic_platforms()
+
+
+def interpreter_name() -> str:
+    """
+    Returns the name of the running interpreter.
+
+    Some implementations have a reserved, two-letter abbreviation which will
+    be returned when appropriate.
+    """
+    name = sys.implementation.name
+    return INTERPRETER_SHORT_NAMES.get(name) or name
+
+
+def interpreter_version(*, warn: bool = False) -> str:
+    """
+    Returns the version of the running interpreter.
+    """
+    version = _get_config_var("py_version_nodot", warn=warn)
+    if version:
+        version = str(version)
+    else:
+        version = _version_nodot(sys.version_info[:2])
+    return version
+
+
+def _version_nodot(version: PythonVersion) -> str:
+    return "".join(map(str, version))
+
+
+def sys_tags(*, warn: bool = False) -> Iterator[Tag]:
+    """
+    Returns the sequence of tag triples for the running interpreter.
+
+    The order of the sequence corresponds to priority order for the
+    interpreter, from most to least important.
+    """
+
+    interp_name = interpreter_name()
+    if interp_name == "cp":
+        yield from cpython_tags(warn=warn)
+    else:
+        yield from generic_tags()
+
+    if interp_name == "pp":
+        interp = "pp3"
+    elif interp_name == "cp":
+        interp = "cp" + interpreter_version(warn=warn)
+    else:
+        interp = None
+    yield from compatible_tags(interpreter=interp)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/version.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9bbda20e463b8d9389ecd65f74af33810a02bdd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/version.py
@@ -0,0 +1,582 @@
+# This file is dual licensed under the terms of the Apache License, Version
+# 2.0, and the BSD License. See the LICENSE file in the root of this repository
+# for complete details.
+"""
+.. testsetup::
+
+    from packaging.version import parse, Version
+"""
+
+from __future__ import annotations
+
+import itertools
+import re
+from typing import Any, Callable, NamedTuple, SupportsInt, Tuple, Union
+
+from ._structures import Infinity, InfinityType, NegativeInfinity, NegativeInfinityType
+
+__all__ = ["VERSION_PATTERN", "InvalidVersion", "Version", "parse"]
+
+LocalType = Tuple[Union[int, str], ...]
+
+CmpPrePostDevType = Union[InfinityType, NegativeInfinityType, Tuple[str, int]]
+CmpLocalType = Union[
+    NegativeInfinityType,
+    Tuple[Union[Tuple[int, str], Tuple[NegativeInfinityType, Union[int, str]]], ...],
+]
+CmpKey = Tuple[
+    int,
+    Tuple[int, ...],
+    CmpPrePostDevType,
+    CmpPrePostDevType,
+    CmpPrePostDevType,
+    CmpLocalType,
+]
+VersionComparisonMethod = Callable[[CmpKey, CmpKey], bool]
+
+
+class _Version(NamedTuple):  # parsed pieces of a version string
+    epoch: int  # 0 when the string has no "N!" prefix
+    release: tuple[int, ...]  # dot-separated release numbers
+    dev: tuple[str, int] | None  # ("dev", n), or None when absent
+    pre: tuple[str, int] | None  # ("a" | "b" | "rc", n), or None when absent
+    post: tuple[str, int] | None  # ("post", n), or None when absent
+    local: LocalType | None  # parsed local label, or None when absent
+
+
+def parse(version: str) -> Version:
+    """Parse the given version string into a :class:`Version`.
+
+    >>> parse('1.0.dev1')
+    <Version('1.0.dev1')>
+
+    :param version: The version string to parse.
+    :raises InvalidVersion: When the version string is not a valid version.
+    """
+    return Version(version)
+
+
+class InvalidVersion(ValueError):
+    """Raised by :class:`Version` when a string is not a valid version.
+
+    >>> Version("invalid")
+    Traceback (most recent call last):
+        ...
+    packaging.version.InvalidVersion: Invalid version: 'invalid'
+    """
+
+
+class _BaseVersion:  # ordering and hashing are driven entirely by ``_key``
+    _key: tuple[Any, ...]
+
+    def __hash__(self) -> int:
+        return hash(self._key)
+
+    # Each rich comparison repeats its own `isinstance` test on purpose:
+    # sharing it through a helper would add a function call to every
+    # comparison, so keep the duplication unless that overhead is avoided.
+    def __lt__(self, other: _BaseVersion) -> bool:
+        if isinstance(other, _BaseVersion):
+            return self._key < other._key
+
+        return NotImplemented
+
+    def __le__(self, other: _BaseVersion) -> bool:
+        if isinstance(other, _BaseVersion):
+            return self._key <= other._key
+
+        return NotImplemented
+
+    def __eq__(self, other: object) -> bool:
+        if isinstance(other, _BaseVersion):
+            return self._key == other._key
+
+        return NotImplemented
+
+    def __ge__(self, other: _BaseVersion) -> bool:
+        if isinstance(other, _BaseVersion):
+            return self._key >= other._key
+
+        return NotImplemented
+
+    def __gt__(self, other: _BaseVersion) -> bool:
+        if isinstance(other, _BaseVersion):
+            return self._key > other._key
+
+        return NotImplemented
+
+    def __ne__(self, other: object) -> bool:
+        if isinstance(other, _BaseVersion):
+            return self._key != other._key
+
+        return NotImplemented
+
+
+# Deliberately not anchored to the start and end of the string, to make it
+# easier for 3rd party code to reuse (compile with re.VERBOSE | re.IGNORECASE)
+_VERSION_PATTERN = r"""
+    v?
+    (?:
+        (?:(?P<epoch>[0-9]+)!)?                           # epoch
+        (?P<release>[0-9]+(?:\.[0-9]+)*)                  # release segment
+        (?P<pre>                                          # pre-release
+            [-_\.]?
+            (?P<pre_l>alpha|a|beta|b|preview|pre|c|rc)
+            [-_\.]?
+            (?P<pre_n>[0-9]+)?
+        )?
+        (?P<post>                                         # post release
+            (?:-(?P<post_n1>[0-9]+))
+            |
+            (?:
+                [-_\.]?
+                (?P<post_l>post|rev|r)
+                [-_\.]?
+                (?P<post_n2>[0-9]+)?
+            )
+        )?
+        (?P<dev>                                          # dev release
+            [-_\.]?
+            (?P<dev_l>dev)
+            [-_\.]?
+            (?P<dev_n>[0-9]+)?
+        )?
+    )
+    (?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
+"""
+
+VERSION_PATTERN = _VERSION_PATTERN
+"""
+A string containing the regular expression used to match a valid version.
+
+The pattern is not anchored at either end, and is intended for embedding in larger
+expressions (for example, matching a version number as part of a file name). The
+regular expression should be compiled with the ``re.VERBOSE`` and ``re.IGNORECASE``
+flags set.
+
+:meta hide-value:
+"""
+
+
+class Version(_BaseVersion):
+    """This class abstracts handling of a project's versions.
+
+    A :class:`Version` instance is comparison aware and can be compared and
+    sorted using the standard Python interfaces.
+
+    >>> v1 = Version("1.0a5")
+    >>> v2 = Version("1.0")
+    >>> v1
+    <Version('1.0a5')>
+    >>> v2
+    <Version('1.0')>
+    >>> v1 < v2
+    True
+    >>> v1 == v2
+    False
+    >>> v1 > v2
+    False
+    >>> v1 >= v2
+    False
+    >>> v1 <= v2
+    True
+    """
+
+    _regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
+    _key: CmpKey
+
+    def __init__(self, version: str) -> None:
+        """Initialize a Version object.
+
+        :param version:
+            The string representation of a version which will be parsed and normalized
+            before use.
+        :raises InvalidVersion:
+            If the ``version`` does not conform to PEP 440 in any way then this
+            exception will be raised.
+        """
+
+        # Validate the whole string (the pattern is anchored) and parse it into pieces
+        match = self._regex.search(version)
+        if not match:
+            raise InvalidVersion(f"Invalid version: {version!r}")
+
+        # Store the parsed out pieces of the version
+        self._version = _Version(
+            epoch=int(match.group("epoch")) if match.group("epoch") else 0,
+            release=tuple(int(i) for i in match.group("release").split(".")),
+            pre=_parse_letter_version(match.group("pre_l"), match.group("pre_n")),
+            post=_parse_letter_version(
+                match.group("post_l"), match.group("post_n1") or match.group("post_n2")
+            ),
+            dev=_parse_letter_version(match.group("dev_l"), match.group("dev_n")),
+            local=_parse_local_version(match.group("local")),
+        )
+
+        # Generate a key which will be used for sorting
+        self._key = _cmpkey(
+            self._version.epoch,
+            self._version.release,
+            self._version.pre,
+            self._version.post,
+            self._version.dev,
+            self._version.local,
+        )
+
+    def __repr__(self) -> str:
+        """A representation of the Version that shows all internal state.
+
+        >>> Version('1.0.0')
+        <Version('1.0.0')>
+        """
+        return f"<Version('{self}')>"
+
+    def __str__(self) -> str:
+        """A string representation of the version that can be round-tripped.
+
+        >>> str(Version("1.0a5"))
+        '1.0a5'
+        """
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        # Pre-release
+        if self.pre is not None:
+            parts.append("".join(str(x) for x in self.pre))
+
+        # Post-release
+        if self.post is not None:
+            parts.append(f".post{self.post}")
+
+        # Development release
+        if self.dev is not None:
+            parts.append(f".dev{self.dev}")
+
+        # Local version segment
+        if self.local is not None:
+            parts.append(f"+{self.local}")
+
+        return "".join(parts)
+
+    @property
+    def epoch(self) -> int:
+        """The epoch of the version.
+
+        >>> Version("2.0.0").epoch
+        0
+        >>> Version("1!2.0.0").epoch
+        1
+        """
+        return self._version.epoch
+
+    @property
+    def release(self) -> tuple[int, ...]:
+        """The components of the "release" segment of the version.
+
+        >>> Version("1.2.3").release
+        (1, 2, 3)
+        >>> Version("2.0.0").release
+        (2, 0, 0)
+        >>> Version("1!2.0.0.post0").release
+        (2, 0, 0)
+
+        Includes trailing zeroes but not the epoch or any pre-release / development /
+        post-release suffixes.
+        """
+        return self._version.release
+
+    @property
+    def pre(self) -> tuple[str, int] | None:
+        """The pre-release segment of the version.
+
+        >>> print(Version("1.2.3").pre)
+        None
+        >>> Version("1.2.3a1").pre
+        ('a', 1)
+        >>> Version("1.2.3b1").pre
+        ('b', 1)
+        >>> Version("1.2.3rc1").pre
+        ('rc', 1)
+        """
+        return self._version.pre
+
+    @property
+    def post(self) -> int | None:
+        """The post-release number of the version.
+
+        >>> print(Version("1.2.3").post)
+        None
+        >>> Version("1.2.3.post1").post
+        1
+        """
+        return self._version.post[1] if self._version.post else None
+
+    @property
+    def dev(self) -> int | None:
+        """The development number of the version.
+
+        >>> print(Version("1.2.3").dev)
+        None
+        >>> Version("1.2.3.dev1").dev
+        1
+        """
+        return self._version.dev[1] if self._version.dev else None
+
+    @property
+    def local(self) -> str | None:
+        """The local version segment of the version.
+
+        >>> print(Version("1.2.3").local)
+        None
+        >>> Version("1.2.3+abc").local
+        'abc'
+        """
+        if self._version.local:
+            return ".".join(str(x) for x in self._version.local)
+        else:
+            return None
+
+    @property
+    def public(self) -> str:
+        """The public portion of the version: everything before the local segment.
+
+        >>> Version("1.2.3").public
+        '1.2.3'
+        >>> Version("1.2.3+abc").public
+        '1.2.3'
+        >>> Version("1!1.2.3dev1+abc").public
+        '1!1.2.3.dev1'
+        """
+        return str(self).split("+", 1)[0]
+
+    @property
+    def base_version(self) -> str:
+        """The "base version" of the version.
+
+        >>> Version("1.2.3").base_version
+        '1.2.3'
+        >>> Version("1.2.3+abc").base_version
+        '1.2.3'
+        >>> Version("1!1.2.3dev1+abc").base_version
+        '1!1.2.3'
+
+        The "base version" is only the epoch and release segment, without any pre,
+        post or dev release markers and without the local segment.
+        """
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        return "".join(parts)
+
+    @property
+    def is_prerelease(self) -> bool:
+        """Whether this version is a pre-release, i.e. has a pre or dev segment.
+
+        >>> Version("1.2.3").is_prerelease
+        False
+        >>> Version("1.2.3a1").is_prerelease
+        True
+        >>> Version("1.2.3b1").is_prerelease
+        True
+        >>> Version("1.2.3rc1").is_prerelease
+        True
+        >>> Version("1.2.3dev1").is_prerelease
+        True
+        """
+        return self.dev is not None or self.pre is not None
+
+    @property
+    def is_postrelease(self) -> bool:
+        """Whether this version is a post-release.
+
+        >>> Version("1.2.3").is_postrelease
+        False
+        >>> Version("1.2.3.post1").is_postrelease
+        True
+        """
+        return self.post is not None
+
+    @property
+    def is_devrelease(self) -> bool:
+        """Whether this version is a development release.
+
+        >>> Version("1.2.3").is_devrelease
+        False
+        >>> Version("1.2.3.dev1").is_devrelease
+        True
+        """
+        return self.dev is not None
+
+    @property
+    def major(self) -> int:
+        """The first item of :attr:`release` or ``0`` if unavailable.
+
+        >>> Version("1.2.3").major
+        1
+        """
+        return self.release[0] if len(self.release) >= 1 else 0
+
+    @property
+    def minor(self) -> int:
+        """The second item of :attr:`release` or ``0`` if unavailable.
+
+        >>> Version("1.2.3").minor
+        2
+        >>> Version("1").minor
+        0
+        """
+        return self.release[1] if len(self.release) >= 2 else 0
+
+    @property
+    def micro(self) -> int:
+        """The third item of :attr:`release` or ``0`` if unavailable.
+
+        >>> Version("1.2.3").micro
+        3
+        >>> Version("1").micro
+        0
+        """
+        return self.release[2] if len(self.release) >= 3 else 0
+
+
+class _TrimmedRelease(Version):
+    @property
+    def release(self) -> tuple[int, ...]:
+        """
+        The release segment with its trailing zeros removed (never empty).
+
+        >>> _TrimmedRelease('1.0.0').release
+        (1,)
+        >>> _TrimmedRelease('0.0').release
+        (0,)
+        """
+        full = super().release
+        nonzero_positions = [pos for pos, part in enumerate(full) if part != 0]
+        cutoff = nonzero_positions[-1] + 1 if nonzero_positions else 1
+        return full[:cutoff]
+
+
+def _parse_letter_version(
+    letter: str | None, number: str | bytes | SupportsInt | None
+) -> tuple[str, int] | None:
+    """Normalize the label and numeral of a pre, post or dev segment.
+
+    :param letter: Segment label (e.g. ``"alpha"``, ``"rev"``), or ``None``.
+    :param number: Segment numeral, or ``None`` when it was omitted.
+    :returns: ``(label, number)`` with a lower-cased, canonical label, or
+        ``None`` when there is no label and no truthy number.
+    """
+    if not letter:
+        # A number without a label is the implicit post release syntax
+        # (e.g. 1.0-1); with no number either, the segment is absent.
+        return ("post", int(number)) if number else None
+
+    # Alternate spellings are mapped onto the preferred spelling; labels
+    # missing from this table are kept as they are.
+    preferred = {
+        "alpha": "a",
+        "beta": "b",
+        "c": "rc",
+        "pre": "rc",
+        "preview": "rc",
+        "rev": "post",
+        "r": "post",
+    }
+
+    # Labels are normalized to lower case before the lookup.
+    label = letter.lower()
+    label = preferred.get(label, label)
+
+    if number is None:
+        # A label without a numeral carries an implicit 0.
+        number = 0
+    return label, int(number)
+
+
+_local_version_separators = re.compile(r"[\._-]")
+
+
+def _parse_local_version(local: str | None) -> LocalType | None:
+    """
+    Split a local version such as abc.1.twelve into ("abc", 1, "twelve").
+    """
+    if local is None:
+        return None
+    return tuple(
+        int(piece) if piece.isdigit() else piece.lower()
+        for piece in _local_version_separators.split(local)
+    )
+
+
+def _cmpkey(
+    epoch: int,
+    release: tuple[int, ...],
+    pre: tuple[str, int] | None,
+    post: tuple[str, int] | None,
+    dev: tuple[str, int] | None,
+    local: LocalType | None,
+) -> CmpKey:
+    """Build the tuple used to compare a parsed version.
+
+    Absent segments are replaced by the ``Infinity`` / ``NegativeInfinity``
+    sentinels so that ordinary tuple comparison yields the intended order.
+
+    :param epoch: Version epoch.
+    :param release: Release numbers; trailing zeros are ignored.
+    :param pre: Pre-release ``(label, number)`` pair, or ``None``.
+    :param post: Post-release ``(label, number)`` pair, or ``None``.
+    :param dev: Development-release ``(label, number)`` pair, or ``None``.
+    :param local: Parsed local version label, or ``None``.
+    :returns: ``(epoch, release, pre, post, dev, local)`` in comparable form.
+    """
+    # Trailing zeros in the release segment must not affect comparison,
+    # so find how many leading components remain once they are stripped
+    # and keep only that prefix.
+    significant = len(release)
+    while significant and release[significant - 1] == 0:
+        significant -= 1
+    trimmed_release = tuple(release[:significant])
+
+    # Pre-release slot. A dev release with neither a pre nor a post segment
+    # must sort before 1.0a0, so it gets the lowest possible value; every
+    # other version without a pre-release sorts after those that have one.
+    if pre is not None:
+        pre_key: CmpPrePostDevType = pre
+    elif post is None and dev is not None:
+        pre_key = NegativeInfinity
+    else:
+        pre_key = Infinity
+
+    # Versions without a post segment sort before those with one.
+    post_key: CmpPrePostDevType = NegativeInfinity if post is None else post
+
+    # Versions without a development segment sort after those with one.
+    dev_key: CmpPrePostDevType = Infinity if dev is None else dev
+
+    # Local segment ordering (PEP 440):
+    # - no local segment sorts before any local segment
+    # - alphanumeric parts sort before numeric parts
+    # - alphanumeric parts sort lexicographically
+    # - numeric parts sort numerically
+    # - a shorter label sorts before a longer one with the same prefix
+    if local is None:
+        local_key: CmpLocalType = NegativeInfinity
+    else:
+        local_key = tuple(
+            (NegativeInfinity, part) if not isinstance(part, int) else (part, "")
+            for part in local
+        )
+
+    return epoch, trimmed_release, pre_key, post_key, dev_key, local_key
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b34b0fcbd409601aab6b579189824a230dabf25c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__init__.py
@@ -0,0 +1,28 @@
+# SPDX-FileCopyrightText: 2015 Eric Larson
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""CacheControl import Interface.
+
+Make it easy to import from cachecontrol without long namespaces.
+"""
+__author__ = "Eric Larson"
+__email__ = "eric@ionrock.org"
+__version__ = "0.14.0"
+
+from pip._vendor.cachecontrol.adapter import CacheControlAdapter
+from pip._vendor.cachecontrol.controller import CacheController
+from pip._vendor.cachecontrol.wrapper import CacheControl
+
+__all__ = [
+    "__author__",
+    "__email__",
+    "__version__",
+    "CacheControlAdapter",
+    "CacheController",
+    "CacheControl",
+]
+
+import logging
+
+logging.getLogger(__name__).addHandler(logging.NullHandler())
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e682aec58bd50f95bec399fa8485bba460be45e
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/_cmd.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/_cmd.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96fb8f4ddb22cd34ea48c2139b54fe0e5f2e5d65
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/_cmd.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/adapter.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/adapter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32767184aa03a9fe6c49d453230e6905edb4b5a4
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/adapter.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/controller.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/controller.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5c5192ee4ee7d4b6d871f48b958a625a06b4270
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/controller.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/heuristics.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/heuristics.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05a187db3536803ea8dd6edbdf7e97925b8cce20
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/heuristics.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/wrapper.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/wrapper.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7eff36a9ed3f700f34844bd6bb0b139c98c5cc43
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/wrapper.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/heuristics.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/heuristics.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6e5634e38559efcb0c61e266f73ff7e8d0b1ad9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/heuristics.py
@@ -0,0 +1,154 @@
+# SPDX-FileCopyrightText: 2015 Eric Larson
+#
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import calendar
+import time
+from datetime import datetime, timedelta, timezone
+from email.utils import formatdate, parsedate, parsedate_tz
+from typing import TYPE_CHECKING, Any, Mapping
+
+if TYPE_CHECKING:
+    from pip._vendor.urllib3 import HTTPResponse
+
+TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT"
+
+
+def expire_after(delta: timedelta, date: datetime | None = None) -> datetime:
+    start = date if date else datetime.now(timezone.utc)  # default: now, in UTC
+    return start + delta
+
+
+def datetime_to_header(dt: datetime) -> str:  # RFC 2822 date string; aware values are converted to UTC
+    return formatdate(calendar.timegm(dt.utctimetuple()))
+
+
+class BaseHeuristic:
+    def warning(self, response: HTTPResponse) -> str | None:
+        """
+        Return a valid 1xx warning header value that describes the cache
+        adjustments made by this heuristic.
+
+        The response is passed in to allow warnings such as 113
+        (http://tools.ietf.org/html/rfc7234#section-5.5.4), which must
+        state explicitly that the response is over 24 hours old.
+        """
+        return '110 - "Response is Stale"'
+
+    def update_headers(self, response: HTTPResponse) -> dict[str, str]:
+        """Return the headers to merge into the response headers.
+
+        NOTE: The result SHOULD always include some Warning header to
+              signify that the response was cached by the client, not
+              by way of the provided headers.
+        """
+        return {}
+
+    def apply(self, response: HTTPResponse) -> HTTPResponse:
+        extra_headers = self.update_headers(response)
+        if not extra_headers:
+            return response
+        # Merge the heuristic's headers, then attach its warning (if any).
+        response.headers.update(extra_headers)
+        warning_value = self.warning(response)
+        if warning_value is not None:
+            response.headers.update({"Warning": warning_value})
+        return response
+
+
+class OneDayCache(BaseHeuristic):
+    """
+    Cache the response by adding an Expires header one day after its Date.
+    Responses with an Expires header or without a parseable Date are left as is.
+    """
+
+    def update_headers(self, response: HTTPResponse) -> dict[str, str]:
+        if "expires" in response.headers:
+            return {}
+        # parsedate() returns None for a missing or malformed Date value.
+        date = parsedate(response.headers.get("date", ""))
+        if date is None:
+            return {}
+        expires = expire_after(timedelta(days=1), date=datetime(*date[:6], tzinfo=timezone.utc))  # type: ignore[misc]
+        return {"expires": datetime_to_header(expires), "cache-control": "public"}
+
+
+class ExpiresAfter(BaseHeuristic):
+    """
+    Cache **all** requests for a fixed period, given as ``timedelta`` keywords.
+    """
+
+    def __init__(self, **kw: Any) -> None:
+        self.delta = timedelta(**kw)
+
+    def update_headers(self, response: HTTPResponse) -> dict[str, str]:
+        expiry_header = datetime_to_header(expire_after(self.delta))
+        return {"expires": expiry_header, "cache-control": "public"}
+
+    def warning(self, response: HTTPResponse) -> str | None:
+        # %-formatting renders the timedelta through str(), e.g. "1 day, 0:00:00".
+        return "110 - Automatically cached for %s. Response might be stale" % self.delta
+
+
+class LastModified(BaseHeuristic):
+    """
+    If there is no Expires header already, fall back on Last-Modified
+    using the heuristic from
+    http://tools.ietf.org/html/rfc7234#section-4.2.2
+    to calculate a reasonable value.
+
+    Firefox also does something like this per
+    https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching_FAQ
+    http://lxr.mozilla.org/mozilla-release/source/netwerk/protocol/http/nsHttpResponseHead.cpp#397
+    Unlike mozilla we limit this to 24-hr.
+    """
+
+    cacheable_by_default_statuses = {
+        200,
+        203,
+        204,
+        206,
+        300,
+        301,
+        404,
+        405,
+        410,
+        414,
+        501,
+    }
+
+    def update_headers(self, resp: HTTPResponse) -> dict[str, str]:
+        headers: Mapping[str, str] = resp.headers
+
+        if "expires" in headers:
+            return {}
+
+        if "cache-control" in headers and headers["cache-control"] != "public":
+            return {}
+
+        if resp.status not in self.cacheable_by_default_statuses:
+            return {}
+
+        if "date" not in headers or "last-modified" not in headers:
+            return {}
+
+        time_tuple = parsedate_tz(headers["date"])
+        last_modified = parsedate(headers["last-modified"])
+        if time_tuple is None or last_modified is None:
+            # Either date could not be parsed, so no lifetime can be derived.
+            return {}
+        date = calendar.timegm(time_tuple[:6])
+
+        # Heuristic freshness: 10% of the Date/Last-Modified gap, capped at 24 hours.
+        current_age = max(0, time.time() - date)
+        delta = date - calendar.timegm(last_modified)
+        freshness_lifetime = max(0, min(delta / 10, 24 * 3600))
+        if freshness_lifetime <= current_age:
+            return {}
+
+        expires = date + freshness_lifetime
+        return {"expires": time.strftime(TIME_FMT, time.gmtime(expires))}
+
+    def warning(self, resp: HTTPResponse) -> str | None:
+        return None
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_inspect.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_inspect.py
new file mode 100644
index 0000000000000000000000000000000000000000..30446ceb3f0235721e435f5fbd53f2e306f078cd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_inspect.py
@@ -0,0 +1,270 @@
+from __future__ import absolute_import
+
+import inspect
+from inspect import cleandoc, getdoc, getfile, isclass, ismodule, signature
+from typing import Any, Collection, Iterable, Optional, Tuple, Type, Union
+
+from .console import Group, RenderableType
+from .control import escape_control_codes
+from .highlighter import ReprHighlighter
+from .jupyter import JupyterMixin
+from .panel import Panel
+from .pretty import Pretty
+from .table import Table
+from .text import Text, TextType
+
+
+def _first_paragraph(doc: str) -> str:
+    """Return the part of ``doc`` that precedes its first blank line (all of it if none)."""
+    head = doc.partition("\n\n")[0]
+    return head
+
+
+class Inspect(JupyterMixin):
+    """A renderable to inspect any Python Object.
+
+    Args:
+        obj (Any): An object to inspect.
+        title (str, optional): Title to display over inspect result, or None use type. Defaults to None.
+        help (bool, optional): Show full help text rather than just first paragraph. Defaults to False.
+        methods (bool, optional): Enable inspection of callables. Defaults to False.
+        docs (bool, optional): Also render doc strings. Defaults to True.
+        private (bool, optional): Show private attributes (beginning with underscore). Defaults to False.
+        dunder (bool, optional): Show attributes starting with double underscore. Defaults to False.
+        sort (bool, optional): Sort attributes alphabetically. Defaults to True.
+        all (bool, optional): Show all attributes (turns on methods, private and dunder). Defaults to True.
+        value (bool, optional): Pretty print value of object. Defaults to True.
+    """
+
+    def __init__(
+        self,
+        obj: Any,
+        *,
+        title: Optional[TextType] = None,
+        help: bool = False,
+        methods: bool = False,
+        docs: bool = True,
+        private: bool = False,
+        dunder: bool = False,
+        sort: bool = True,
+        all: bool = True,
+        value: bool = True,
+    ) -> None:
+        self.highlighter = ReprHighlighter()
+        self.obj = obj
+        self.title = title or self._make_title(obj)
+        if all:  # "all" overrides the three individual switches
+            methods = private = dunder = True
+        self.help = help
+        self.methods = methods
+        self.docs = docs or help  # asking for full help implies showing docs
+        self.private = private or dunder  # showing dunders implies showing private names
+        self.dunder = dunder
+        self.sort = sort
+        self.value = value
+
+    def _make_title(self, obj: Any) -> Text:
+        """Make a default title."""
+        title_str = (
+            str(obj)
+            if (isclass(obj) or callable(obj) or ismodule(obj))
+            else str(type(obj))  # plain instances are titled by their type
+        )
+        title_text = self.highlighter(title_str)
+        return title_text
+
+    def __rich__(self) -> Panel:
+        return Panel.fit(  # everything yielded by _render(), grouped inside one panel
+            Group(*self._render()),
+            title=self.title,
+            border_style="scope.border",
+            padding=(0, 1),
+        )
+
+    def _get_signature(self, name: str, obj: Any) -> Optional[Text]:
+        """Get a signature for a callable."""
+        try:
+            _signature = str(signature(obj)) + ":"
+        except ValueError:  # inspect could not provide a signature: show a placeholder
+            _signature = "(...)"
+        except TypeError:  # object type not supported by inspect.signature: no signature text
+            return None
+
+        source_filename: Optional[str] = None
+        try:
+            source_filename = getfile(obj)
+        except (OSError, TypeError):
+            # OSError is raised if obj has no source file, e.g. when defined in REPL.
+            pass
+
+        callable_name = Text(name, style="inspect.callable")  # NOTE(review): never used in the returned Text
+        if source_filename:
+            callable_name.stylize(f"link file://{source_filename}")
+        signature_text = self.highlighter(_signature)
+
+        qualname = name or getattr(obj, "__qualname__", name)  # empty name falls back to __qualname__
+
+        # If obj is a module, there may be classes (which are callable) to display
+        if inspect.isclass(obj):
+            prefix = "class"
+        elif inspect.iscoroutinefunction(obj):
+            prefix = "async def"
+        else:
+            prefix = "def"
+
+        qual_signature = Text.assemble(
+            (f"{prefix} ", f"inspect.{prefix.replace(' ', '_')}"),
+            (qualname, "inspect.callable"),
+            signature_text,
+        )
+
+        return qual_signature
+
+    def _render(self) -> Iterable[RenderableType]:
+        """Render object."""
+
+        def sort_items(item: Tuple[str, Any]) -> Tuple[bool, str]:
+            key, (_error, value) = item
+            return (callable(value), key.strip("_").lower())  # non-callables first, then by bare name
+
+        def safe_getattr(attr_name: str) -> Tuple[Any, Any]:
+            """Get attribute or any exception."""
+            try:
+                return (None, getattr(obj, attr_name))
+            except Exception as error:
+                return (error, None)
+
+        obj = self.obj
+        keys = dir(obj)
+        total_items = len(keys)
+        if not self.dunder:
+            keys = [key for key in keys if not key.startswith("__")]
+        if not self.private:
+            keys = [key for key in keys if not key.startswith("_")]
+        not_shown_count = total_items - len(keys)  # counts only names removed by the two filters above
+        items = [(key, safe_getattr(key)) for key in keys]
+        if self.sort:
+            items.sort(key=sort_items)
+
+        items_table = Table.grid(padding=(0, 1), expand=False)
+        items_table.add_column(justify="right")
+        add_row = items_table.add_row
+        highlighter = self.highlighter
+
+        if callable(obj):
+            signature = self._get_signature("", obj)  # local name shadows inspect.signature in this method
+            if signature is not None:
+                yield signature
+                yield ""
+
+        if self.docs:
+            _doc = self._get_formatted_doc(obj)
+            if _doc is not None:
+                doc_text = Text(_doc, style="inspect.help")
+                doc_text = highlighter(doc_text)
+                yield doc_text
+                yield ""
+
+        if self.value and not (isclass(obj) or callable(obj) or ismodule(obj)):
+            yield Panel(
+                Pretty(obj, indent_guides=True, max_length=10, max_string=60),
+                border_style="inspect.value.border",
+            )
+            yield ""
+
+        for key, (error, value) in items:
+            key_text = Text.assemble(
+                (
+                    key,
+                    "inspect.attr.dunder" if key.startswith("__") else "inspect.attr",
+                ),
+                (" =", "inspect.equals"),
+            )
+            if error is not None:  # getattr raised: show the exception repr in place of a value
+                warning = key_text.copy()
+                warning.stylize("inspect.error")
+                add_row(warning, highlighter(repr(error)))
+                continue
+
+            if callable(value):
+                if not self.methods:  # callables are dropped without being counted as "not shown"
+                    continue
+
+                _signature_text = self._get_signature(key, value)
+                if _signature_text is None:
+                    add_row(key_text, Pretty(value, highlighter=highlighter))
+                else:
+                    if self.docs:
+                        docs = self._get_formatted_doc(value)
+                        if docs is not None:
+                            _signature_text.append("\n" if "\n" in docs else " ")
+                            doc = highlighter(docs)
+                            doc.stylize("inspect.doc")
+                            _signature_text.append(doc)
+
+                    add_row(key_text, _signature_text)
+            else:
+                add_row(key_text, Pretty(value, highlighter=highlighter))
+        if items_table.row_count:
+            yield items_table
+        elif not_shown_count:  # nothing to list, but some names were filtered out: hint at the options
+            yield Text.from_markup(
+                f"[b cyan]{not_shown_count}[/][i] attribute(s) not shown.[/i] "
+                f"Run [b][magenta]inspect[/]([not b]inspect[/])[/b] for options."
+            )
+
+    def _get_formatted_doc(self, object_: Any) -> Optional[str]:
+        """
+        Extract the docstring of an object, process it and return it.
+        The processing consists of cleaning up the docstring's indentation,
+        taking only its 1st paragraph if `self.help` is not True,
+        and escaping its control codes.
+
+        Args:
+            object_ (Any): the object to get the docstring from.
+
+        Returns:
+            Optional[str]: the processed docstring, or None if no docstring was found.
+        """
+        docs = getdoc(object_)
+        if docs is None:
+            return None
+        docs = cleandoc(docs).strip()
+        if not self.help:
+            docs = _first_paragraph(docs)
+        return escape_control_codes(docs)
+
+
+def get_object_types_mro(obj: Union[object, Type[Any]]) -> Tuple[type, ...]:
+    """Return the MRO tuple of `obj` when it is a class, otherwise that of `type(obj)`."""
+    # A class is recognised by having `__mro__`; testing `type(obj) is type` would miss
+    # classes created by another metaclass, such as abc.ABCMeta.
+    is_class_like = hasattr(obj, "__mro__")
+    target = obj if is_class_like else type(obj)
+    return getattr(target, "__mro__", ())
+
+
+def get_object_types_mro_as_strings(obj: object) -> Collection[str]:
+    """
+    Return "<module>.<qualname>" for each class in the MRO of `obj`'s class, or of `obj` itself if it's a class.
+
+    Examples:
+        `get_object_types_mro_as_strings(JSONDecoder)` will return `['json.decoder.JSONDecoder', 'builtins.object']`
+    """
+    names = []
+    for klass in get_object_types_mro(obj):
+        names.append(f'{getattr(klass, "__module__", "")}.{getattr(klass, "__qualname__", "")}')
+    return names
+
+
+def is_object_one_of_types(
+    obj: object, fully_qualified_types_names: Collection[str]
+) -> bool:
+    """
+    Tell whether any class in the MRO of `obj`'s class (or of `obj` itself, if it's a class)
+    has its fully qualified name listed in `fully_qualified_types_names`.
+    """
+    return any(
+        mro_name in fully_qualified_types_names
+        for mro_name in get_object_types_mro_as_strings(obj)
+    )
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_log_render.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_log_render.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc16c84437a8a34231c44d3f0a331459ddcb0f34
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_log_render.py
@@ -0,0 +1,94 @@
+from datetime import datetime
+from typing import Iterable, List, Optional, TYPE_CHECKING, Union, Callable
+
+
+from .text import Text, TextType
+
+if TYPE_CHECKING:
+    from .console import Console, ConsoleRenderable, RenderableType
+    from .table import Table
+
+FormatTimeCallable = Callable[[datetime], Text]  # alternative to a strftime pattern for the time column
+
+
+class LogRender:  # builds the one-row grid (time, level, message, path) for a single log call
+    def __init__(
+        self,
+        show_time: bool = True,
+        show_level: bool = False,
+        show_path: bool = True,
+        time_format: Union[str, FormatTimeCallable] = "[%x %X]",
+        omit_repeated_times: bool = True,
+        level_width: Optional[int] = 8,
+    ) -> None:
+        self.show_time = show_time
+        self.show_level = show_level
+        self.show_path = show_path
+        self.time_format = time_format
+        self.omit_repeated_times = omit_repeated_times
+        self.level_width = level_width
+        self._last_time: Optional[Text] = None  # last time text actually emitted, for repeat detection
+
+    def __call__(
+        self,
+        console: "Console",
+        renderables: Iterable["ConsoleRenderable"],
+        log_time: Optional[datetime] = None,
+        time_format: Optional[Union[str, FormatTimeCallable]] = None,
+        level: TextType = "",
+        path: Optional[str] = None,
+        line_no: Optional[int] = None,
+        link_path: Optional[str] = None,
+    ) -> "Table":
+        from .containers import Renderables
+        from .table import Table
+
+        output = Table.grid(padding=(0, 1))
+        output.expand = True
+        if self.show_time:  # columns are added under the same conditions as the row cells below
+            output.add_column(style="log.time")
+        if self.show_level:
+            output.add_column(style="log.level", width=self.level_width)
+        output.add_column(ratio=1, style="log.message", overflow="fold")
+        if self.show_path and path:
+            output.add_column(style="log.path")
+        row: List["RenderableType"] = []
+        if self.show_time:
+            log_time = log_time or console.get_datetime()
+            time_format = time_format or self.time_format  # per-call format wins over the instance default
+            if callable(time_format):
+                log_time_display = time_format(log_time)
+            else:
+                log_time_display = Text(log_time.strftime(time_format))
+            if log_time_display == self._last_time and self.omit_repeated_times:
+                row.append(Text(" " * len(log_time_display)))  # same text as last time: emit equal-length blanks
+            else:
+                row.append(log_time_display)
+                self._last_time = log_time_display
+        if self.show_level:
+            row.append(level)
+
+        row.append(Renderables(renderables))
+        if self.show_path and path:
+            path_text = Text()
+            path_text.append(
+                path, style=f"link file://{link_path}" if link_path else ""
+            )
+            if line_no:  # None and 0 both suppress the ":<line>" suffix
+                path_text.append(":")
+                path_text.append(
+                    f"{line_no}",
+                    style=f"link file://{link_path}#{line_no}" if link_path else "",
+                )
+            row.append(path_text)
+
+        output.add_row(*row)
+        return output
+
+
+if __name__ == "__main__":  # pragma: no cover
+    from pip._vendor.rich.console import Console
+
+    c = Console()  # manual demo: one print and one log call, both right-justified
+    c.print("[on blue]Hello", justify="right")
+    c.log("[on blue]hello", justify="right")
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_win32_console.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_win32_console.py
new file mode 100644
index 0000000000000000000000000000000000000000..81b1082905338a74b72b9de432ece50a456687bc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_win32_console.py
@@ -0,0 +1,662 @@
+"""Light wrapper around the Win32 Console API - this module should only be imported on Windows
+
+The API that this module wraps is documented at https://docs.microsoft.com/en-us/windows/console/console-functions
+"""
+import ctypes
+import sys
+from typing import Any
+
+windll: Any = None  # rebound to a WinDLL loader on win32; on any other platform the import fails below
+if sys.platform == "win32":
+    windll = ctypes.LibraryLoader(ctypes.WinDLL)
+else:
+    raise ImportError(f"{__name__} can only be imported on Windows")
+
+import time
+from ctypes import Structure, byref, wintypes
+from typing import IO, NamedTuple, Type, cast
+
+from pip._vendor.rich.color import ColorSystem
+from pip._vendor.rich.style import Style
+
+STDOUT = -11  # standard-device identifier used as the default argument of GetStdHandle
+ENABLE_VIRTUAL_TERMINAL_PROCESSING = 4  # NOTE(review): presumably a console-mode flag; not referenced in this part of the file
+
+COORD = wintypes._COORD  # short alias; fields X and Y are read elsewhere in this file
+
+
+class LegacyWindowsError(Exception):  # raised by GetConsoleMode when the console API call fails
+    pass
+
+
+class WindowsCoordinates(NamedTuple):
+    """Coordinates in the Windows Console API are (y, x), not (x, y).
+    This class is intended to prevent that confusion.
+    Rows and columns are indexed from 0.
+    This class can be used in place of wintypes._COORD in arguments and argtypes.
+    """
+
+    row: int
+    col: int
+
+    @classmethod
+    def from_param(cls, value: "WindowsCoordinates") -> COORD:
+        """Converts a WindowsCoordinates into a wintypes _COORD structure.
+        This classmethod is internally called by ctypes to perform the conversion.
+
+        Args:
+            value (WindowsCoordinates): The input coordinates to convert.
+
+        Returns:
+            wintypes._COORD: The converted coordinates struct.
+        """
+        return COORD(value.col, value.row)  # column goes first: it is read back as coord.X elsewhere in this file
+
+
+class CONSOLE_SCREEN_BUFFER_INFO(Structure):  # output struct filled in by GetConsoleScreenBufferInfo
+    _fields_ = [
+        ("dwSize", COORD),  # read by LegacyWindowsTerm.screen_size
+        ("dwCursorPosition", COORD),  # read by LegacyWindowsTerm.cursor_position
+        ("wAttributes", wintypes.WORD),  # read by LegacyWindowsTerm.__init__ as the default text attributes
+        ("srWindow", wintypes.SMALL_RECT),
+        ("dwMaximumWindowSize", COORD),
+    ]
+
+
+class CONSOLE_CURSOR_INFO(ctypes.Structure):  # struct passed to Get/SetConsoleCursorInfo
+    _fields_ = [("dwSize", wintypes.DWORD), ("bVisible", wintypes.BOOL)]
+
+
+_GetStdHandle = windll.kernel32.GetStdHandle  # raw kernel32 binding; use the typed wrapper below
+_GetStdHandle.argtypes = [
+    wintypes.DWORD,
+]
+_GetStdHandle.restype = wintypes.HANDLE
+
+
+def GetStdHandle(handle: int = STDOUT) -> wintypes.HANDLE:
+    """Retrieves a handle to the specified standard device (standard input, standard output, or standard error).
+
+    Args:
+        handle (int): Integer identifier for the handle. Defaults to -11 (stdout).
+
+    Returns:
+        wintypes.HANDLE: The handle
+    """
+    return cast(wintypes.HANDLE, _GetStdHandle(handle))  # cast is for type checkers only; no runtime conversion
+
+
+_GetConsoleMode = windll.kernel32.GetConsoleMode  # raw kernel32 binding; use the wrapper below
+_GetConsoleMode.argtypes = [wintypes.HANDLE, wintypes.LPDWORD]
+_GetConsoleMode.restype = wintypes.BOOL
+
+
+def GetConsoleMode(std_handle: wintypes.HANDLE) -> int:
+    """Retrieves the current input mode of a console's input buffer
+    or the current output mode of a console screen buffer.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+
+    Raises:
+        LegacyWindowsError: If any error occurs while calling the Windows console API.
+
+    Returns:
+        int: Value representing the current console mode as documented at
+            https://docs.microsoft.com/en-us/windows/console/getconsolemode#parameters
+    """
+
+    console_mode = wintypes.DWORD()  # output slot; passed without byref() to the LPDWORD parameter
+    success = bool(_GetConsoleMode(std_handle, console_mode))
+    if not success:
+        raise LegacyWindowsError("Unable to get legacy Windows Console Mode")
+    return console_mode.value
+
+
+_FillConsoleOutputCharacterW = windll.kernel32.FillConsoleOutputCharacterW
+_FillConsoleOutputCharacterW.argtypes = [
+    wintypes.HANDLE,
+    wintypes.WCHAR,  # the W entry point takes a wide character, not a one-byte char
+    wintypes.DWORD,
+    cast(Type[COORD], WindowsCoordinates),
+    ctypes.POINTER(wintypes.DWORD),
+]
+_FillConsoleOutputCharacterW.restype = wintypes.BOOL
+
+
+def FillConsoleOutputCharacter(
+    std_handle: wintypes.HANDLE,
+    char: str,
+    length: int,
+    start: WindowsCoordinates,
+) -> int:
+    """Writes a character to the console screen buffer a specified number of times, beginning at the specified coordinates.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+        char (str): The character to write. Must be a string of length 1.
+        length (int): The number of times to write the character.
+        start (WindowsCoordinates): The coordinates to start writing at.
+
+    Returns:
+        int: The number of characters written (the API's BOOL result itself is not checked).
+    """
+    character = ctypes.c_wchar(char)  # encoding to bytes raised TypeError for any non-ASCII character
+    num_characters = wintypes.DWORD(length)
+    num_written = wintypes.DWORD(0)
+    _FillConsoleOutputCharacterW(
+        std_handle,
+        character,
+        num_characters,
+        start,
+        byref(num_written),
+    )
+    return num_written.value
+
+
+_FillConsoleOutputAttribute = windll.kernel32.FillConsoleOutputAttribute  # raw kernel32 binding
+_FillConsoleOutputAttribute.argtypes = [
+    wintypes.HANDLE,
+    wintypes.WORD,
+    wintypes.DWORD,
+    cast(Type[COORD], WindowsCoordinates),  # ctypes converts via WindowsCoordinates.from_param
+    ctypes.POINTER(wintypes.DWORD),
+]
+_FillConsoleOutputAttribute.restype = wintypes.BOOL
+
+
+def FillConsoleOutputAttribute(
+    std_handle: wintypes.HANDLE,
+    attributes: int,
+    length: int,
+    start: WindowsCoordinates,
+) -> int:
+    """Sets the character attributes for a specified number of character cells,
+    beginning at the specified coordinates in a screen buffer.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+        attributes (int): Integer value representing the foreground and background colours of the cells.
+        length (int): The number of cells to set the output attribute of.
+        start (WindowsCoordinates): The coordinates of the first cell whose attributes are to be set.
+
+    Returns:
+        int: The number of cells whose attributes were actually set.
+    """
+    num_cells = wintypes.DWORD(length)
+    style_attrs = wintypes.WORD(attributes)
+    num_written = wintypes.DWORD(0)  # output slot filled by the API call
+    _FillConsoleOutputAttribute(  # the BOOL result is not checked; only the written count is reported
+        std_handle, style_attrs, num_cells, start, byref(num_written)
+    )
+    return num_written.value
+
+
+_SetConsoleTextAttribute = windll.kernel32.SetConsoleTextAttribute  # raw kernel32 binding
+_SetConsoleTextAttribute.argtypes = [
+    wintypes.HANDLE,
+    wintypes.WORD,
+]
+_SetConsoleTextAttribute.restype = wintypes.BOOL
+
+
+def SetConsoleTextAttribute(
+    std_handle: wintypes.HANDLE, attributes: wintypes.WORD
+) -> bool:
+    """Set the colour attributes for all text written after this function is called.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+        attributes (wintypes.WORD): Value representing the foreground and background colours.
+            Callers in this file pass both ctypes.c_ushort values and plain ints.
+
+    Returns:
+        bool: True if the attribute was set successfully, otherwise False.
+    """
+    return bool(_SetConsoleTextAttribute(std_handle, attributes))
+
+
+_GetConsoleScreenBufferInfo = windll.kernel32.GetConsoleScreenBufferInfo  # raw kernel32 binding
+_GetConsoleScreenBufferInfo.argtypes = [
+    wintypes.HANDLE,
+    ctypes.POINTER(CONSOLE_SCREEN_BUFFER_INFO),
+]
+_GetConsoleScreenBufferInfo.restype = wintypes.BOOL
+
+
+def GetConsoleScreenBufferInfo(
+    std_handle: wintypes.HANDLE,
+) -> CONSOLE_SCREEN_BUFFER_INFO:
+    """Retrieves information about the specified console screen buffer.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+
+    Returns:
+        CONSOLE_SCREEN_BUFFER_INFO: A CONSOLE_SCREEN_BUFFER_INFO ctype struct containing information about
+            screen size, cursor position, colour attributes, and more."""
+    console_screen_buffer_info = CONSOLE_SCREEN_BUFFER_INFO()
+    _GetConsoleScreenBufferInfo(std_handle, byref(console_screen_buffer_info))  # BOOL result ignored: the struct is returned even on failure
+    return console_screen_buffer_info
+
+
+_SetConsoleCursorPosition = windll.kernel32.SetConsoleCursorPosition  # raw kernel32 binding
+_SetConsoleCursorPosition.argtypes = [
+    wintypes.HANDLE,
+    cast(Type[COORD], WindowsCoordinates),  # ctypes converts via WindowsCoordinates.from_param
+]
+_SetConsoleCursorPosition.restype = wintypes.BOOL
+
+
+def SetConsoleCursorPosition(
+    std_handle: wintypes.HANDLE, coords: WindowsCoordinates
+) -> bool:
+    """Set the position of the cursor in the console screen
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+        coords (WindowsCoordinates): The coordinates to move the cursor to.
+
+    Returns:
+        bool: True if the function succeeds, otherwise False.
+    """
+    return bool(_SetConsoleCursorPosition(std_handle, coords))  # no exception on failure, only False
+
+
+_GetConsoleCursorInfo = windll.kernel32.GetConsoleCursorInfo  # raw kernel32 binding
+_GetConsoleCursorInfo.argtypes = [
+    wintypes.HANDLE,
+    ctypes.POINTER(CONSOLE_CURSOR_INFO),
+]
+_GetConsoleCursorInfo.restype = wintypes.BOOL
+
+
+def GetConsoleCursorInfo(
+    std_handle: wintypes.HANDLE, cursor_info: CONSOLE_CURSOR_INFO
+) -> bool:
+    """Get the cursor info - used to get cursor visibility and width
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+        cursor_info (CONSOLE_CURSOR_INFO): CONSOLE_CURSOR_INFO ctype struct that receives information
+            about the console's cursor.
+
+    Returns:
+          bool: True if the function succeeds, otherwise False.
+    """
+    return bool(_GetConsoleCursorInfo(std_handle, byref(cursor_info)))  # result data lands in the caller's cursor_info
+
+
+_SetConsoleCursorInfo = windll.kernel32.SetConsoleCursorInfo  # raw kernel32 binding
+_SetConsoleCursorInfo.argtypes = [
+    wintypes.HANDLE,
+    ctypes.POINTER(CONSOLE_CURSOR_INFO),
+]
+_SetConsoleCursorInfo.restype = wintypes.BOOL
+
+
+def SetConsoleCursorInfo(
+    std_handle: wintypes.HANDLE, cursor_info: CONSOLE_CURSOR_INFO
+) -> bool:
+    """Set the cursor info - used for adjusting cursor visibility and width
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+        cursor_info (CONSOLE_CURSOR_INFO): CONSOLE_CURSOR_INFO ctype struct containing the new cursor info.
+
+    Returns:
+          bool: True if the function succeeds, otherwise False.
+    """
+    return bool(_SetConsoleCursorInfo(std_handle, byref(cursor_info)))  # struct is passed by reference
+
+
+_SetConsoleTitle = windll.kernel32.SetConsoleTitleW  # wide-string variant, so a Python str is passed directly
+_SetConsoleTitle.argtypes = [wintypes.LPCWSTR]
+_SetConsoleTitle.restype = wintypes.BOOL
+
+
+def SetConsoleTitle(title: str) -> bool:
+    """Sets the title of the current console window
+
+    Args:
+        title (str): The new title of the console window.
+
+    Returns:
+        bool: True if the function succeeds, otherwise False.
+    """
+    return bool(_SetConsoleTitle(title))
+
+
+class LegacyWindowsTerm:
+    """This class allows interaction with the legacy Windows Console API. It should only be used in the context
+    of environments where virtual terminal processing is not available. However, if it is used in a Windows environment,
+    the entire API should work.
+
+    Args:
+        file (IO[str]): The file whose write/flush methods are used for text output; the console HANDLE itself always comes from GetStdHandle(STDOUT).
+    """
+
+    BRIGHT_BIT = 8  # OR-ed into / masked out of the ANSI colour number for bold / dim in write_styled
+
+    # Indices are ANSI color numbers, values are the corresponding Windows Console API color numbers
+    ANSI_TO_WINDOWS = [
+        0,  # black                      The Windows colours are defined in wincon.h as follows:
+        4,  # red                         define FOREGROUND_BLUE            0x0001 -- 0000 0001
+        2,  # green                       define FOREGROUND_GREEN           0x0002 -- 0000 0010
+        6,  # yellow                      define FOREGROUND_RED             0x0004 -- 0000 0100
+        1,  # blue                        define FOREGROUND_INTENSITY       0x0008 -- 0000 1000
+        5,  # magenta                     define BACKGROUND_BLUE            0x0010 -- 0001 0000
+        3,  # cyan                        define BACKGROUND_GREEN           0x0020 -- 0010 0000
+        7,  # white                       define BACKGROUND_RED             0x0040 -- 0100 0000
+        8,  # bright black (grey)         define BACKGROUND_INTENSITY       0x0080 -- 1000 0000
+        12,  # bright red
+        10,  # bright green
+        14,  # bright yellow
+        9,  # bright blue
+        13,  # bright magenta
+        11,  # bright cyan
+        15,  # bright white
+    ]
+
+    def __init__(self, file: "IO[str]") -> None:
+        handle = GetStdHandle(STDOUT)  # always the stdout handle, whatever `file` is
+        self._handle = handle
+        default_text = GetConsoleScreenBufferInfo(handle).wAttributes
+        self._default_text = default_text  # full attribute word, restored after each styled write
+
+        self._default_fore = default_text & 7  # low three bits: foreground colour without intensity
+        self._default_back = (default_text >> 4) & 7  # bits 4-6: background colour without intensity
+        self._default_attrs = self._default_fore | (self._default_back << 4)
+
+        self._file = file
+        self.write = file.write  # text output goes through the file object, not the handle
+        self.flush = file.flush
+
+    @property
+    def cursor_position(self) -> WindowsCoordinates:
+        """Returns the current position of the cursor (0-based)
+
+        Returns:
+            WindowsCoordinates: The current cursor position.
+        """
+        coord: COORD = GetConsoleScreenBufferInfo(self._handle).dwCursorPosition  # queried on every access
+        return WindowsCoordinates(row=cast(int, coord.Y), col=cast(int, coord.X))
+
+    @property
+    def screen_size(self) -> WindowsCoordinates:
+        """Returns the current size of the console screen buffer, in character columns and rows
+
+        Returns:
+            WindowsCoordinates: The width and height of the screen as WindowsCoordinates.
+        """
+        screen_size: COORD = GetConsoleScreenBufferInfo(self._handle).dwSize  # queried on every access
+        return WindowsCoordinates(  # height is stored in .row, width in .col
+            row=cast(int, screen_size.Y), col=cast(int, screen_size.X)
+        )
+
+    def write_text(self, text: str) -> None:
+        """Write text directly to the terminal without any modification of styles
+
+        Args:
+            text (str): The text to write to the console
+        """
+        self.write(text)  # bound to file.write in __init__
+        self.flush()  # flushed after every write
+
+    def write_styled(self, text: str, style: Style) -> None:
+        """Write styled text to the terminal.
+
+        Args:
+            text (str): The text to write
+            style (Style): The style of the text
+        """
+        color = style.color
+        bgcolor = style.bgcolor
+        if style.reverse:  # reverse video: swap foreground and background
+            color, bgcolor = bgcolor, color
+
+        if color:
+            fore = color.downgrade(ColorSystem.WINDOWS).number
+            fore = fore if fore is not None else 7  # Default to ANSI 7: White
+            if style.bold:
+                fore = fore | self.BRIGHT_BIT
+            if style.dim:  # applied after bold, so dim wins when both are set
+                fore = fore & ~self.BRIGHT_BIT
+            fore = self.ANSI_TO_WINDOWS[fore]
+        else:
+            fore = self._default_fore
+
+        if bgcolor:
+            back = bgcolor.downgrade(ColorSystem.WINDOWS).number
+            back = back if back is not None else 0  # Default to ANSI 0: Black
+            back = self.ANSI_TO_WINDOWS[back]
+        else:
+            back = self._default_back
+
+        assert fore is not None
+        assert back is not None
+
+        SetConsoleTextAttribute(
+            self._handle, attributes=ctypes.c_ushort(fore | (back << 4))  # background sits four bits above foreground
+        )
+        self.write_text(text)
+        SetConsoleTextAttribute(self._handle, attributes=self._default_text)  # restore the attributes captured in __init__
+
+    def move_cursor_to(self, new_position: WindowsCoordinates) -> None:
+        """Move the cursor to ``new_position``; a negative row or column makes this a no-op.
+
+        Args:
+            new_position (WindowsCoordinates): The row and column to place the cursor at.
+        """
+        in_range = new_position.col >= 0 and new_position.row >= 0
+        if in_range:
+            SetConsoleCursorPosition(self._handle, coords=new_position)
+
+    def erase_line(self) -> None:
+        """Blank the whole row under the cursor and reset its cells to the default attributes."""
+        width = self.screen_size.col
+        line_start = WindowsCoordinates(row=self.cursor_position.row, col=0)
+        handle = self._handle
+        # Two passes over the same span of cells:
+        # first overwrite the characters with spaces ...
+        FillConsoleOutputCharacter(handle, " ", length=width, start=line_start)
+        # ... then repaint the cells with the default colours.
+        FillConsoleOutputAttribute(
+            handle,
+            self._default_attrs,
+            length=width,
+            start=line_start,
+        )
+
+    def erase_end_of_line(self) -> None:
+        """Blank every cell from the cursor (inclusive) to the right edge of its row."""
+        here = self.cursor_position
+        remaining = self.screen_size.col - here.col
+        handle = self._handle
+        # Spaces first, then the default colours over the same span.
+        FillConsoleOutputCharacter(handle, " ", length=remaining, start=here)
+        FillConsoleOutputAttribute(
+            handle,
+            self._default_attrs,
+            length=remaining,
+            start=here,
+        )
+
+    def erase_start_of_line(self) -> None:
+        """Blank the cells left of the cursor on its row; the cell under the cursor is untouched."""
+        here = self.cursor_position
+        line_start = WindowsCoordinates(row=here.row, col=0)
+        cells = here.col
+        # Spaces first, then the default colours over the same span.
+        FillConsoleOutputCharacter(self._handle, " ", length=cells, start=line_start)
+        FillConsoleOutputAttribute(self._handle, self._default_attrs, length=cells, start=line_start)
+
+    def move_cursor_up(self) -> None:
+        """Move the cursor to the same column of the row above."""
+        here = self.cursor_position
+        # The target row is not clamped (it is -1 when the cursor is on row 0);
+        # the call's boolean result is discarded.
+        row_above = WindowsCoordinates(row=here.row - 1, col=here.col)
+        SetConsoleCursorPosition(
+            self._handle, coords=row_above
+        )
+
+    def move_cursor_down(self) -> None:
+        """Move the cursor to the same column of the row below."""
+        here = self.cursor_position
+        # The target row is not checked against the buffer height;
+        # the call's boolean result is discarded.
+        row_below = WindowsCoordinates(row=here.row + 1, col=here.col)
+        SetConsoleCursorPosition(
+            self._handle,
+            coords=row_below,
+        )
+
+    def move_cursor_forward(self) -> None:
+        """Advance the cursor one cell, wrapping to column 0 of the next row if required."""
+        row, col = self.cursor_position
+        last_column = self.screen_size.col - 1
+        if col == last_column:
+            target = WindowsCoordinates(row=row + 1, col=0)
+        else:
+            target = WindowsCoordinates(row=row, col=col + 1)
+        SetConsoleCursorPosition(
+            self._handle, coords=target
+        )
+
+    def move_cursor_to_column(self, column: int) -> None:
+        """Jump to the given zero-based column without leaving the current row.
+
+        Args:
+            column (int): Zero-based index of the destination column.
+        """
+        target = WindowsCoordinates(self.cursor_position.row, column)
+        SetConsoleCursorPosition(self._handle, coords=target)
+
+    def move_cursor_backward(self) -> None:
+        """Step the cursor back one cell, wrapping to the previous row if required."""
+        row, col = self.cursor_position
+        # From column 0 the target is the last column of the row above.
+        if col == 0:
+            target = WindowsCoordinates(row=row - 1, col=self.screen_size.col - 1)
+        else:
+            target = WindowsCoordinates(row=row, col=col - 1)
+        SetConsoleCursorPosition(
+            self._handle, coords=target
+        )
+
+    def hide_cursor(self) -> None:
+        """Make the cursor invisible, re-sending its current size unchanged."""
+        # Only bVisible differs from what the console currently reports.
+        info = CONSOLE_CURSOR_INFO(dwSize=self._get_cursor_size(), bVisible=0)
+        SetConsoleCursorInfo(self._handle, cursor_info=info)
+
+    def show_cursor(self) -> None:
+        """Make the cursor visible, re-sending its current size unchanged."""
+        # Only bVisible differs from what the console currently reports.
+        info = CONSOLE_CURSOR_INFO(dwSize=self._get_cursor_size(), bVisible=1)
+        SetConsoleCursorInfo(self._handle, cursor_info=info)
+
+    def set_title(self, title: str) -> None:
+        """Set the title of the terminal window by calling ``SetConsoleTitle``.
+
+        Args:
+            title (str): The new title of the console window; an ``assert`` rejects titles of 255 or more characters.
+        """
+        assert len(title) < 255, "Console title must be less than 255 characters"
+        SetConsoleTitle(title)
+
+    def _get_cursor_size(self) -> int:
+        """Return the cursor size (percentage of the character cell filled by the cursor)."""
+        info = CONSOLE_CURSOR_INFO()
+        GetConsoleCursorInfo(self._handle, cursor_info=info)
+        return int(info.dwSize)
+
+
+if __name__ == "__main__":
+    handle = GetStdHandle()  # NOTE(review): `handle` is not referenced again in this script
+
+    from pip._vendor.rich.console import Console
+
+    console = Console()
+
+    term = LegacyWindowsTerm(sys.stdout)
+    term.set_title("Win32 Console Examples")
+
+    style = Style(color="black", bgcolor="red")  # NOTE(review): `style` is not used below
+
+    heading = Style.parse("black on green")  # NOTE(review): `heading` is not used below
+
+    # Check colour output: print one sample line per style through the rich Console
+    console.rule("Checking colour output")
+    console.print("[on red]on red!")
+    console.print("[blue]blue!")
+    console.print("[yellow]yellow!")
+    console.print("[bold yellow]bold yellow!")
+    console.print("[bright_yellow]bright_yellow!")
+    console.print("[dim bright_yellow]dim bright_yellow!")
+    console.print("[italic cyan]italic cyan!")
+    console.print("[bold white on blue]bold white on blue!")
+    console.print("[reverse bold white on blue]reverse bold white on blue!")
+    console.print("[bold black on cyan]bold black on cyan!")
+    console.print("[black on green]black on green!")
+    console.print("[blue on green]blue on green!")
+    console.print("[white on black]white on black!")
+    console.print("[black on white]black on white!")
+    console.print("[#1BB152 on #DA812D]#1BB152 on #DA812D!")
+
+    # Check cursor movement: call the LegacyWindowsTerm cursor methods directly, pausing 1s between steps
+    console.rule("Checking cursor movement")
+    console.print()
+    term.move_cursor_backward()
+    term.move_cursor_backward()
+    term.write_text("went back and wrapped to prev line")
+    time.sleep(1)
+    term.move_cursor_up()
+    term.write_text("we go up")
+    time.sleep(1)
+    term.move_cursor_down()
+    term.write_text("and down")
+    time.sleep(1)
+    term.move_cursor_up()
+    term.move_cursor_backward()
+    term.move_cursor_backward()
+    term.write_text("we went up and back 2")
+    time.sleep(1)
+    term.move_cursor_down()
+    term.move_cursor_backward()
+    term.move_cursor_backward()
+    term.write_text("we went down and back 2")
+    time.sleep(1)
+
+    # Check erasing of lines (the cursor is hidden here and shown again at the end of the script)
+    term.hide_cursor()
+    console.print()
+    console.rule("Checking line erasing")
+    console.print("\n...Deleting to the start of the line...")
+    term.write_text("The red arrow shows the cursor location, and direction of erase")
+    time.sleep(1)
+    term.move_cursor_to_column(16)
+    term.write_styled("<", Style.parse("black on red"))
+    term.move_cursor_backward()
+    time.sleep(1)
+    term.erase_start_of_line()
+    time.sleep(1)
+
+    console.print("\n\n...And to the end of the line...")
+    term.write_text("The red arrow shows the cursor location, and direction of erase")
+    time.sleep(1)
+
+    term.move_cursor_to_column(16)
+    term.write_styled(">", Style.parse("black on red"))
+    time.sleep(1)
+    term.erase_end_of_line()
+    time.sleep(1)
+
+    console.print("\n\n...Now the whole line will be erased...")
+    term.write_styled("I'm going to disappear!", style=Style.parse("black on cyan"))
+    time.sleep(1)
+    term.erase_line()
+
+    term.show_cursor()
+    print("\n")
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/abc.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/abc.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6e498efabfab0dcf31cd7731f8f821cc423bc4f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/abc.py
@@ -0,0 +1,33 @@
+from abc import ABC
+
+
+class RichRenderable(ABC):
+    """Marker ABC for objects that implement the Rich renderable protocol.
+
+    There is no need to extend this class; it exists so that ``isinstance`` can be
+    used to check whether an object supports the protocol. For example::
+
+        if isinstance(my_object, RichRenderable):
+            console.print(my_object)
+
+    """
+
+    @classmethod
+    def __subclasshook__(cls, other: type) -> bool:
+        """Return True when ``other`` has a ``__rich_console__`` or ``__rich__`` attribute."""
+        return any(hasattr(other, dunder) for dunder in ("__rich_console__", "__rich__"))
+
+
+if __name__ == "__main__":  # pragma: no cover
+    from pip._vendor.rich.text import Text
+
+    t = Text()
+    print(isinstance(Text, RichRenderable))  # passes the Text class object itself, not an instance
+    print(isinstance(t, RichRenderable))  # passes a Text instance
+
+    class Foo:  # defines neither __rich_console__ nor __rich__
+        pass
+
+    f = Foo()
+    print(isinstance(f, RichRenderable))
+    print(isinstance("", RichRenderable))  # a plain str, for comparison
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/bar.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/bar.py
new file mode 100644
index 0000000000000000000000000000000000000000..022284b57881d8b133aced5b5a843e6447bb4e0b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/bar.py
@@ -0,0 +1,93 @@
+from typing import Optional, Union
+
+from .color import Color
+from .console import Console, ConsoleOptions, RenderResult
+from .jupyter import JupyterMixin
+from .measure import Measurement
+from .segment import Segment
+from .style import Style
+
+# There are left-aligned characters for 1/8 to 7/8, but
+# the right-aligned characters exist only for 1/8 and 4/8.
+BEGIN_BLOCK_ELEMENTS = ["█", "█", "█", "▐", "▐", "▐", "▕", "▕"]
+END_BLOCK_ELEMENTS = [" ", "▏", "▎", "▍", "▌", "▋", "▊", "▉"]
+FULL_BLOCK = "█"
+
+
+class Bar(JupyterMixin):
+    """Renders a solid block bar.
+
+    Args:
+        size (float): Value for the end of the bar.
+        begin (float): Begin point (between 0 and size, inclusive); values below 0 are raised to 0.
+        end (float): End point (between 0 and size, inclusive); values above ``size`` are lowered to ``size``.
+        width (int, optional): Width of the bar, or ``None`` for maximum width. Defaults to None.
+        color (Union[Color, str], optional): Color of the bar. Defaults to "default".
+        bgcolor (Union[Color, str], optional): Color of bar background. Defaults to "default".
+    """
+
+    def __init__(
+        self,
+        size: float,
+        begin: float,
+        end: float,
+        *,
+        width: Optional[int] = None,
+        color: Union[Color, str] = "default",
+        bgcolor: Union[Color, str] = "default",
+    ):
+        self.size = size
+        # ``begin`` is bounded from below by 0 and ``end`` from above by ``size``.
+        self.begin = max(begin, 0)
+        self.end = min(end, size)
+        self.width = width
+        self.style = Style(color=color, bgcolor=bgcolor)
+
+    def __repr__(self) -> str:
+        return f"Bar({self.size}, {self.begin}, {self.end})"
+
+    def __rich_console__(
+        self, console: Console, options: ConsoleOptions
+    ) -> RenderResult:
+        max_width = options.max_width
+        # An explicit width is capped at what the console options allow.
+        width = max_width if self.width is None else min(self.width, max_width)
+
+        if self.begin >= self.end:
+            # Nothing to fill: emit a row of spaces in the bar style.
+            yield Segment(" " * width, self.style)
+            yield Segment.line()
+            return
+
+        # Positions are computed in eighths of a cell, then split into whole
+        # cells and a remainder that selects a partial block character.
+        begin_cells, begin_eighths = divmod(int(width * 8 * self.begin / self.size), 8)
+        end_cells, end_eighths = divmod(int(width * 8 * self.end / self.size), 8)
+
+        # When start and end fall into the same cell, we ideally should render
+        # a symbol that's "center-aligned", but there is no good symbol in Unicode.
+        # In this case, we fall back to right-aligned block symbol for simplicity.
+
+        prefix = " " * begin_cells
+        if begin_eighths:
+            prefix += BEGIN_BLOCK_ELEMENTS[begin_eighths]
+
+        body = FULL_BLOCK * end_cells
+        if end_eighths:
+            body += END_BLOCK_ELEMENTS[end_eighths]
+
+        # Pad with spaces up to the full width, measured against the body.
+        suffix = " " * (width - len(body))
+
+        # The body is built from column 0, so the part under the prefix is sliced off.
+        yield Segment(f"{prefix}{body[len(prefix) :]}{suffix}", self.style)
+        yield Segment.line()
+
+    def __rich_measure__(
+        self, console: Console, options: ConsoleOptions
+    ) -> Measurement:
+        # Without a fixed width the bar accepts anything from 4 cells up to the maximum.
+        if self.width is None:
+            return Measurement(4, options.max_width)
+        # A fixed width is reported as both the minimum and the maximum.
+        return Measurement(self.width, self.width)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/errors.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bcbe53ef59373c608e62ea285536f8b22b47ecb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/errors.py
@@ -0,0 +1,34 @@
+class ConsoleError(Exception):
+    """An error in console operation; parent of most exceptions in this module."""
+
+
+class StyleError(Exception):
+    """An error in styles; parent of MissingStyle."""
+
+
+class StyleSyntaxError(ConsoleError):
+    """Style was badly formatted. Note: derives from ConsoleError, not StyleError."""
+
+
+class MissingStyle(StyleError):
+    """No such style (a StyleError)."""
+
+
+class StyleStackError(ConsoleError):
+    """Style stack is invalid (a ConsoleError)."""
+
+
+class NotRenderableError(ConsoleError):
+    """Object is not renderable (a ConsoleError)."""
+
+
+class MarkupError(ConsoleError):
+    """Markup was badly formatted (a ConsoleError)."""
+
+
+class LiveError(ConsoleError):
+    """Error related to Live display (a ConsoleError)."""
+
+
+class NoAltScreen(ConsoleError):
+    """Alt screen mode was required (a ConsoleError)."""
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/filesize.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/filesize.py
new file mode 100644
index 0000000000000000000000000000000000000000..99f118e20103174993b865cfb43ac6b6e00296a4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/filesize.py
@@ -0,0 +1,89 @@
+# coding: utf-8
+"""Functions for reporting filesizes. Borrowed from https://github.com/PyFilesystem/pyfilesystem2
+
+The functions declared in this module should cover the different
+use cases needed to generate a string representation of a file size
+using several different units. Since there are many standards regarding
+file size units, three different functions have been implemented.
+
+See Also:
+    * `Wikipedia: Binary prefix <https://en.wikipedia.org/wiki/Binary_prefix>`_
+
+"""
+
+__all__ = ["decimal"]
+
+from typing import Iterable, List, Optional, Tuple
+
+
+def _to_str(
+    size: int,
+    suffixes: Iterable[str],
+    base: int,
+    *,
+    precision: Optional[int] = 1,
+    separator: Optional[str] = " ",
+) -> str:
+    """Format ``size`` with the first suffix whose unit exceeds it (else the last suffix)."""
+    if size == 1:
+        return "1 byte"
+    if size < base:
+        return f"{size:,} bytes"
+
+    # The first suffix is paired with base**2, so the value printed for it is
+    # ``size`` divided by ``base`` (see the scaling below).
+    for exponent, suffix in enumerate(suffixes, 2):  # noqa: B007
+        unit = base**exponent
+        if size < unit:
+            break
+
+    scaled = base * size / unit
+    return f"{scaled:,.{precision}f}{separator}{suffix}"
+
+
+def pick_unit_and_suffix(size: int, suffixes: List[str], base: int) -> Tuple[int, str]:
+    """Return ``(unit, suffix)`` for the first suffix whose unit times ``base`` exceeds ``size``, else the last pair."""
+    for power, suffix in enumerate(suffixes):
+        unit = base**power
+        if size < unit * base:
+            return unit, suffix
+    return unit, suffix
+
+
+def decimal(
+    size: int,
+    *,
+    precision: Optional[int] = 1,
+    separator: Optional[str] = " ",
+) -> str:
+    """Convert a filesize in to a string (powers of 1000, SI prefixes).
+
+    In this convention, ``1000 B = 1 kB``.
+
+    This is typically the format used to advertise the storage
+    capacity of USB flash drives and the like (*256 MB* meaning
+    actually a storage capacity of more than *256 000 000 B*),
+    or used by **Mac OS X** since v10.6 to report file sizes.
+
+    Sizes below 1000 are reported in bytes, and sizes beyond the
+    largest unit are still expressed in ``YB``.
+
+    Arguments:
+        size (int): A file size.
+        precision (int): The number of decimal places to include (default = 1).
+        separator (str): The string to separate the value from the units (default = " ").
+
+    Returns:
+        `str`: A string containing an abbreviated file size and units.
+
+    Example:
+        >>> filesize.decimal(30000)
+        '30.0 kB'
+        >>> filesize.decimal(30000, precision=2, separator="")
+        '30.00kB'
+
+    """
+    # SI suffixes in ascending order, handed to the shared formatter together
+    # with the base of 1000.
+    si_suffixes = ("kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
+    return _to_str(size, si_suffixes, 1000, precision=precision, separator=separator)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/markup.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/markup.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6171878f823183ee8f77195b3e944be222006dc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/markup.py
@@ -0,0 +1,251 @@
+import re
+from ast import literal_eval
+from operator import attrgetter
+from typing import Callable, Iterable, List, Match, NamedTuple, Optional, Tuple, Union
+
+from ._emoji_replace import _emoji_replace
+from .emoji import EmojiVariant
+from .errors import MarkupError
+from .style import Style
+from .text import Span, Text
+
+RE_TAGS = re.compile(
+    r"""((\\*)\[([a-z#/@][^[]*?)])""",
+    re.VERBOSE,
+)
+
+RE_HANDLER = re.compile(r"^([\w.]*?)(\(.*?\))?$")
+
+
+class Tag(NamedTuple):
+    """A tag in console markup, split into a name and optional parameters."""
+
+    name: str
+    """The tag name. e.g. 'bold'."""
+    parameters: Optional[str]
+    """Any additional parameters after the name."""
+
+    def __str__(self) -> str:
+        # A tag without parameters is just its name; otherwise the two
+        # parts are joined with a single space.
+        if self.parameters is None:
+            return self.name
+        return f"{self.name} {self.parameters}"
+
+    @property
+    def markup(self) -> str:
+        """Get the bracketed markup form: ``[name]`` or ``[name=parameters]``."""
+        if self.parameters is None:
+            return f"[{self.name}]"
+        return f"[{self.name}={self.parameters}]"
+
+
+_ReStringMatch = Match[str]  # regex match object
+_ReSubCallable = Callable[[_ReStringMatch], str]  # Callable invoked by re.sub
+_EscapeSubMethod = Callable[[_ReSubCallable, str], str]  # Sub method of a compiled re
+
+
+def escape(
+    markup: str,
+    _escape: _EscapeSubMethod = re.compile(r"(\\*)(\[[a-z#/@][^[]*?])").sub,
+) -> str:
+    """Escape text so that it is not interpreted as console markup.
+
+    Args:
+        markup (str): Content to be inserted in to markup.
+
+    Returns:
+        str: Markup with square brackets escaped.
+    """
+
+    def protect_tag(found: Match[str]) -> str:
+        """Double the backslashes before a tag, then add one more in front of it."""
+        slashes, tag = found.groups()
+        return slashes * 2 + "\\" + tag
+
+    escaped = _escape(protect_tag, markup)
+    # A result that ends in a backslash which is not already doubled gets one
+    # more backslash appended.
+    lone_backslash = escaped.endswith("\\") and not escaped.endswith("\\\\")
+    return escaped + "\\" if lone_backslash else escaped
+
+
+def _parse(markup: str) -> Iterable[Tuple[int, Optional[str], Optional[Tag]]]:
+    """Split markup into a stream of ``(position, text, tag)`` tuples.
+
+    Exactly one of ``text`` / ``tag`` is set in every tuple that is yielded.
+
+    Args:
+        markup (str): A string containing console markup
+    """
+    consumed = 0
+    for found in RE_TAGS.finditer(markup):
+        whole, slashes, inner = found.groups()
+        start, stop = found.span()
+        if consumed < start:
+            # Plain text sitting between the previous match and this one.
+            yield start, markup[consumed:start], None
+        if slashes:
+            pairs, odd_one_out = divmod(len(slashes), 2)
+            if pairs:
+                # Each backslash pair is emitted as one literal backslash.
+                yield start, "\\" * pairs, None
+                start += pairs * 2
+            if odd_one_out:
+                # A leftover backslash escapes the tag: emit it as plain text.
+                yield start, whole[len(slashes) :], None
+                consumed = stop
+                continue
+        name, equals, value = inner.partition("=")
+        yield start, None, Tag(name, value if equals else None)
+        consumed = stop
+    if consumed < len(markup):
+        yield consumed, markup[consumed:], None
+
+
+def render(
+    markup: str,
+    style: Union[str, Style] = "",
+    emoji: bool = True,
+    emoji_variant: Optional[EmojiVariant] = None,
+) -> Text:
+    """Render console markup in to a Text instance.
+
+    Args:
+        markup (str): A string containing console markup.
+        style: (Union[str, Style]): The style to use.
+        emoji (bool, optional): Also render emoji code. Defaults to True.
+        emoji_variant (str, optional): Optional emoji variant, either "text" or "emoji", used for every emoji replacement. Defaults to None.
+
+    Raises:
+        MarkupError: If there is a syntax error in the markup.
+
+    Returns:
+        Text: A text instance.
+    """
+    emoji_replace = _emoji_replace
+    if "[" not in markup:
+        return Text(
+            emoji_replace(markup, default_variant=emoji_variant) if emoji else markup,
+            style=style,
+        )
+    text = Text(style=style)
+    append = text.append
+    normalize = Style.normalize
+
+    style_stack: List[Tuple[int, Tag]] = []
+    pop = style_stack.pop
+
+    spans: List[Span] = []
+    append_span = spans.append
+
+    _Span = Span
+
+    def pop_style(style_name: str) -> Tuple[int, Tag]:
+        """Pop tag matching given style name."""
+        for index, (_, tag) in enumerate(reversed(style_stack), 1):
+            if tag.name == style_name:
+                return pop(-index)
+        raise KeyError(style_name)
+
+    for position, plain_text, tag in _parse(markup):
+        if plain_text is not None:
+            # Handle open brace escapes, where the brace is not part of a tag.
+            plain_text = plain_text.replace("\\[", "[")
+            if emoji:
+                plain_text = emoji_replace(plain_text, default_variant=emoji_variant)
+            append(plain_text)
+        elif tag is not None:
+            if tag.name.startswith("/"):  # Closing tag
+                style_name = tag.name[1:].strip()
+
+                if style_name:  # explicit close
+                    style_name = normalize(style_name)
+                    try:
+                        start, open_tag = pop_style(style_name)
+                    except KeyError:
+                        raise MarkupError(
+                            f"closing tag '{tag.markup}' at position {position} doesn't match any open tag"
+                        ) from None
+                else:  # implicit close
+                    try:
+                        start, open_tag = pop()
+                    except IndexError:
+                        raise MarkupError(
+                            f"closing tag '[/]' at position {position} has nothing to close"
+                        ) from None
+
+                # Tags whose name starts with "@" produce a meta Style keyed by the tag name.
+                if open_tag.name.startswith("@"):
+                    if open_tag.parameters:
+                        handler_name = ""
+                        parameters = open_tag.parameters.strip()
+                        handler_match = RE_HANDLER.match(parameters)
+                        if handler_match is not None:
+                            handler_name, match_parameters = handler_match.groups()
+                            parameters = (
+                                "()" if match_parameters is None else match_parameters
+                            )
+
+                        try:
+                            meta_params = literal_eval(parameters)
+                        except SyntaxError as error:
+                            raise MarkupError(
+                                f"error parsing {parameters!r} in {open_tag.parameters!r}; {error.msg}"
+                            )
+                        except Exception as error:
+                            raise MarkupError(
+                                f"error parsing {open_tag.parameters!r}; {error}"
+                            ) from None
+
+                        if handler_name:
+                            if not isinstance(meta_params, tuple):
+                                meta_params = (meta_params,)
+                            meta_params = (handler_name, meta_params)
+
+                    else:
+                        meta_params = ()
+
+                    append_span(
+                        _Span(
+                            start, len(text), Style(meta={open_tag.name: meta_params})
+                        )
+                    )
+                else:
+                    append_span(_Span(start, len(text), str(open_tag)))
+
+            else:  # Opening tag
+                normalized_tag = Tag(normalize(tag.name), tag.parameters)
+                style_stack.append((len(text), normalized_tag))
+
+    # Tags that were never closed are applied through to the end of the text.
+    text_length = len(text)
+    while style_stack:
+        start, tag = style_stack.pop()
+        tag_style = str(tag)
+        if tag_style:
+            append_span(_Span(start, text_length, tag_style))
+
+    # Reverse before the stable sort so that, for equal starts, later-added spans come first.
+    text.spans = sorted(spans[::-1], key=attrgetter("start"))
+    return text
+
+
+if __name__ == "__main__":  # pragma: no cover
+    MARKUP = [  # sample markup strings shown in the demo table below
+        "[red]Hello World[/red]",
+        "[magenta]Hello [b]World[/b]",
+        "[bold]Bold[italic] bold and italic [/bold]italic[/italic]",
+        "Click [link=https://www.willmcgugan.com]here[/link] to visit my Blog",
+        ":warning-emoji: [bold red blink] DANGER![/]",
+    ]
+
+    from pip._vendor.rich import print  # NOTE: rebinds `print` to rich's version for the rest of this script
+    from pip._vendor.rich.table import Table
+
+    grid = Table("Markup", "Result", padding=(0, 1))
+
+    for markup in MARKUP:
+        grid.add_row(Text(markup), markup)  # left: wrapped in Text; right: raw str (presumably rendered as markup)
+
+    print(grid)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/panel.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/panel.py
new file mode 100644
index 0000000000000000000000000000000000000000..95f4c84cf0b21d1bb518a40b22f39716a189f2fa
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/panel.py
@@ -0,0 +1,312 @@
+from typing import TYPE_CHECKING, Optional
+
+from .align import AlignMethod
+from .box import ROUNDED, Box
+from .cells import cell_len
+from .jupyter import JupyterMixin
+from .measure import Measurement, measure_renderables
+from .padding import Padding, PaddingDimensions
+from .segment import Segment
+from .style import Style, StyleType
+from .text import Text, TextType
+
+if TYPE_CHECKING:
+    from .console import Console, ConsoleOptions, RenderableType, RenderResult
+
+
+class Panel(JupyterMixin):
+    """A console renderable that draws a border around its contents.
+
+    Example:
+        >>> console.print(Panel("Hello, World!"))
+
+    Args:
+        renderable (RenderableType): A console renderable object.
+        box (Box, optional): A Box instance that defines the look of the border (see :ref:`appendix_box`). Defaults to box.ROUNDED.
+        title, subtitle (Optional[TextType], optional): Text drawn in the top / bottom border line. Defaults to None.
+        title_align, subtitle_align (AlignMethod, optional): Alignment of the title / subtitle within the border. Defaults to "center".
+        safe_box (bool, optional): Disable box characters that don't display on windows legacy terminal with *raster* fonts. Defaults to None, which uses ``console.safe_box``.
+        expand (bool, optional): If True the panel will stretch to fill the console width, otherwise it will be sized to fit the contents. Defaults to True.
+        style (str, optional): The style of the panel (border and contents). Defaults to "none".
+        border_style (str, optional): The style of the border. Defaults to "none".
+        width (Optional[int], optional): Optional width of panel. Defaults to None to auto-detect.
+        height (Optional[int], optional): Optional height of panel. Defaults to None to auto-detect.
+        padding (PaddingDimensions, optional): Optional padding around renderable. Defaults to (0, 1).
+        highlight (bool, optional): Enable automatic highlighting of panel title (if str). Defaults to False.
+    """
+
+    def __init__(
+        self,
+        renderable: "RenderableType",
+        box: Box = ROUNDED,
+        *,
+        title: Optional[TextType] = None,
+        title_align: AlignMethod = "center",
+        subtitle: Optional[TextType] = None,
+        subtitle_align: AlignMethod = "center",
+        safe_box: Optional[bool] = None,
+        expand: bool = True,
+        style: StyleType = "none",
+        border_style: StyleType = "none",
+        width: Optional[int] = None,
+        height: Optional[int] = None,
+        padding: PaddingDimensions = (0, 1),
+        highlight: bool = False,
+    ) -> None:
+        self.renderable = renderable
+        self.box = box
+        self.title = title
+        self.title_align: AlignMethod = title_align
+        self.subtitle = subtitle
+        self.subtitle_align = subtitle_align
+        self.safe_box = safe_box
+        self.expand = expand
+        self.style = style
+        self.border_style = border_style
+        self.width = width
+        self.height = height
+        self.padding = padding
+        self.highlight = highlight
+
+    @classmethod
+    def fit(
+        cls,
+        renderable: "RenderableType",
+        box: Box = ROUNDED,
+        *,
+        title: Optional[TextType] = None,
+        title_align: AlignMethod = "center",
+        subtitle: Optional[TextType] = None,
+        subtitle_align: AlignMethod = "center",
+        safe_box: Optional[bool] = None,
+        style: StyleType = "none",
+        border_style: StyleType = "none",
+        width: Optional[int] = None,
+        height: Optional[int] = None,
+        padding: PaddingDimensions = (0, 1),
+        highlight: bool = False,
+    ) -> "Panel":
+        """An alternative constructor that sets expand=False; all other arguments are passed through."""
+        return cls(
+            renderable,
+            box,
+            title=title,
+            title_align=title_align,
+            subtitle=subtitle,
+            subtitle_align=subtitle_align,
+            safe_box=safe_box,
+            style=style,
+            border_style=border_style,
+            width=width,
+            height=height,
+            padding=padding,
+            highlight=highlight,
+            expand=False,
+        )
+
+    @property
+    def _title(self) -> Optional[Text]:
+        if self.title:  # None and empty titles both yield None
+            title_text = (
+                Text.from_markup(self.title)
+                if isinstance(self.title, str)
+                else self.title.copy()  # copy so the caller's Text is not modified below
+            )
+            title_text.end = ""
+            title_text.plain = title_text.plain.replace("\n", " ")  # keep the title on one line
+            title_text.no_wrap = True
+            title_text.expand_tabs()
+            title_text.pad(1)
+            return title_text
+        return None
+
+    @property
+    def _subtitle(self) -> Optional[Text]:
+        if self.subtitle:  # None and empty subtitles both yield None
+            subtitle_text = (
+                Text.from_markup(self.subtitle)
+                if isinstance(self.subtitle, str)
+                else self.subtitle.copy()  # copy so the caller's Text is not modified below
+            )
+            subtitle_text.end = ""
+            subtitle_text.plain = subtitle_text.plain.replace("\n", " ")  # keep the subtitle on one line
+            subtitle_text.no_wrap = True
+            subtitle_text.expand_tabs()
+            subtitle_text.pad(1)
+            return subtitle_text
+        return None
+
+    def __rich_console__(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> "RenderResult":
+        _padding = Padding.unpack(self.padding)
+        renderable = (
+            Padding(self.renderable, _padding) if any(_padding) else self.renderable
+        )
+        style = console.get_style(self.style)
+        border_style = style + console.get_style(self.border_style)
+        width = (
+            options.max_width
+            if self.width is None
+            else min(options.max_width, self.width)
+        )
+
+        safe_box: bool = console.safe_box if self.safe_box is None else self.safe_box
+        box = self.box.substitute(options, safe=safe_box)
+
+        def align_text(
+            text: Text, width: int, align: str, character: str, style: Style
+        ) -> Text:
+            """Gets new aligned text.
+
+            Args:
+                text (Text): Title or subtitle text.
+                width (int): Desired width.
+                align (str): Alignment.
+                character (str): Character for alignment.
+                style (Style): Border style
+
+            Returns:
+                Text: New text instance
+            """
+            text = text.copy()
+            text.truncate(width)
+            excess_space = width - cell_len(text.plain)
+            if excess_space:
+                if align == "left":
+                    return Text.assemble(
+                        text,
+                        (character * excess_space, style),
+                        no_wrap=True,
+                        end="",
+                    )
+                elif align == "center":
+                    left = excess_space // 2
+                    return Text.assemble(
+                        (character * left, style),
+                        text,
+                        (character * (excess_space - left), style),
+                        no_wrap=True,
+                        end="",
+                    )
+                else:  # any other value is treated as right alignment
+                    return Text.assemble(
+                        (character * excess_space, style),
+                        text,
+                        no_wrap=True,
+                        end="",
+                    )
+            return text
+
+        title_text = self._title
+        if title_text is not None:
+            title_text.stylize_before(border_style)
+
+        child_width = (
+            width - 2
+            if self.expand
+            else console.measure(
+                renderable, options=options.update_width(width - 2)
+            ).maximum
+        )
+        child_height = self.height or options.height or None
+        if child_height:
+            child_height -= 2  # the top and bottom border lines take one row each
+        if title_text is not None:
+            child_width = min(
+                options.max_width - 2, max(child_width, title_text.cell_len + 2)
+            )
+
+        width = child_width + 2
+        child_options = options.update(
+            width=child_width, height=child_height, highlight=self.highlight
+        )
+        lines = console.render_lines(renderable, child_options, style=style)
+
+        line_start = Segment(box.mid_left, border_style)
+        line_end = Segment(f"{box.mid_right}", border_style)
+        new_line = Segment.line()
+        if title_text is None or width <= 4:  # no title, or no room for one: plain top edge
+            yield Segment(box.get_top([width - 2]), border_style)
+        else:
+            title_text = align_text(
+                title_text,
+                width - 4,
+                self.title_align,
+                box.top,
+                border_style,
+            )
+            yield Segment(box.top_left + box.top, border_style)
+            yield from console.render(title_text, child_options.update_width(width - 4))
+            yield Segment(box.top + box.top_right, border_style)
+
+        yield new_line
+        for line in lines:
+            yield line_start
+            yield from line
+            yield line_end
+            yield new_line
+
+        subtitle_text = self._subtitle
+        if subtitle_text is not None:
+            subtitle_text.stylize_before(border_style)
+
+        if subtitle_text is None or width <= 4:  # no subtitle, or no room for one: plain bottom edge
+            yield Segment(box.get_bottom([width - 2]), border_style)
+        else:
+            subtitle_text = align_text(
+                subtitle_text,
+                width - 4,
+                self.subtitle_align,
+                box.bottom,
+                border_style,
+            )
+            yield Segment(box.bottom_left + box.bottom, border_style)
+            yield from console.render(
+                subtitle_text, child_options.update_width(width - 4)
+            )
+            yield Segment(box.bottom + box.bottom_right, border_style)
+
+        yield new_line
+
+    def __rich_measure__(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> "Measurement":
+        _title = self._title
+        _, right, _, left = Padding.unpack(self.padding)
+        padding = left + right
+        renderables = [self.renderable, _title] if _title else [self.renderable]
+
+        if self.width is None:
+            width = (
+                measure_renderables(
+                    console,
+                    options.update_width(options.max_width - padding - 2),
+                    renderables,
+                ).maximum
+                + padding
+                + 2
+            )
+        else:
+            width = self.width
+        return Measurement(width, width)  # minimum and maximum are the same value
+
+
+if __name__ == "__main__":  # pragma: no cover
+    from .console import Console
+
+    c = Console()
+
+    from .box import DOUBLE, ROUNDED  # NOTE(review): only DOUBLE is used in this demo
+    from .padding import Padding  # NOTE(review): not used in this demo
+
+    p = Panel(
+        "Hello, World!",
+        title="rich.Panel",
+        style="white on blue",
+        box=DOUBLE,
+        padding=1,
+    )
+
+    c.print()
+    c.print(p)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/table.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/table.py
new file mode 100644
index 0000000000000000000000000000000000000000..43c718ebf5906c7aaca9fb14ac34dd80ae8bc01e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/table.py
@@ -0,0 +1,1000 @@
+from dataclasses import dataclass, field, replace
+from typing import (
+    TYPE_CHECKING,
+    Dict,
+    Iterable,
+    List,
+    NamedTuple,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+)
+
+from . import box, errors
+from ._loop import loop_first_last, loop_last
+from ._pick import pick_bool
+from ._ratio import ratio_distribute, ratio_reduce
+from .align import VerticalAlignMethod
+from .jupyter import JupyterMixin
+from .measure import Measurement
+from .padding import Padding, PaddingDimensions
+from .protocol import is_renderable
+from .segment import Segment
+from .style import Style, StyleType
+from .text import Text, TextType
+
+if TYPE_CHECKING:
+    from .console import (
+        Console,
+        ConsoleOptions,
+        JustifyMethod,
+        OverflowMethod,
+        RenderableType,
+        RenderResult,
+    )
+
+
+@dataclass
+class Column:
+    """Defines a column within a ~Table.
+
+    Columns are created by ``Table.add_column``, or may be passed to the ``Table``
+    constructor in place of a header string.
+
+    Args:
+        header (RenderableType, optional): Renderable for the header (typically a string). Defaults to "".
+        footer (RenderableType, optional): Renderable for the footer (typically a string). Defaults to "".
+        header_style (Union[str, Style], optional): Style of the header. Defaults to "".
+        footer_style (Union[str, Style], optional): Style of the footer. Defaults to "".
+        style (Union[str, Style], optional): Style of the column. Defaults to "".
+        justify (JustifyMethod, optional): How to justify text within the column ("left", "center", "right", or "full"). Defaults to "left".
+        vertical (VerticalAlignMethod, optional): How to vertically align content ("top", "middle", or "bottom"). Defaults to "top".
+        overflow (OverflowMethod, optional): Overflow method. Defaults to "ellipsis".
+        width (Optional[int], optional): Width of the column, or ``None`` to auto calculate width. Defaults to None.
+        min_width (Optional[int], optional): Minimum width of column, or ``None`` for no minimum. Defaults to None.
+        max_width (Optional[int], optional): Maximum width of column, or ``None`` for no maximum. Defaults to None.
+        ratio (Optional[int], optional): Ratio to use when calculating column width, or ``None`` to adapt to column contents. Defaults to None.
+        no_wrap (bool, optional): Prevent wrapping of text within the column. Defaults to False.
+
+    Attributes:
+        _index (int): Index of the column. ``Table`` assigns it when the column is added.
+        _cells (List[RenderableType]): Cells of the column, not including header or footer.
+
+    Note:
+        A column is "flexible" (see ``flexible``) when ``ratio`` is not None; the
+        table then sizes it from the ratio rather than from its contents
+        (requires ``Table.expand`` or ``Table.width``).
+    """
+
+    header: "RenderableType" = ""
+    """RenderableType: Renderable for the header (typically a string)"""
+
+    footer: "RenderableType" = ""
+    """RenderableType: Renderable for the footer (typically a string)"""
+
+    header_style: StyleType = ""
+    """StyleType: The style of the header."""
+
+    footer_style: StyleType = ""
+    """StyleType: The style of the footer."""
+
+    style: StyleType = ""
+    """StyleType: The style of the column."""
+
+    justify: "JustifyMethod" = "left"
+    """str: How to justify text within the column ("left", "center", "right", or "full")"""
+
+    vertical: "VerticalAlignMethod" = "top"
+    """str: How to vertically align content ("top", "middle", or "bottom")"""
+
+    overflow: "OverflowMethod" = "ellipsis"
+    """str: Overflow method."""
+
+    width: Optional[int] = None
+    """Optional[int]: Width of the column, or ``None`` (default) to auto calculate width."""
+
+    min_width: Optional[int] = None
+    """Optional[int]: Minimum width of column, or ``None`` for no minimum. Defaults to None."""
+
+    max_width: Optional[int] = None
+    """Optional[int]: Maximum width of column, or ``None`` for no maximum. Defaults to None."""
+
+    ratio: Optional[int] = None
+    """Optional[int]: Ratio to use when calculating column width, or ``None`` (default) to adapt to column contents."""
+
+    no_wrap: bool = False
+    """bool: Prevent wrapping of text within the column. Defaults to ``False``."""
+
+    _index: int = 0
+    """Index of column."""
+
+    _cells: List["RenderableType"] = field(default_factory=list)
+
+    def copy(self) -> "Column":
+        """Return a copy of this Column with the same settings and an empty cell list."""
+        return replace(self, _cells=[])
+
+    @property
+    def cells(self) -> Iterable["RenderableType"]:
+        """Get all cells in the column, not including header (yields from ``_cells`` in order)."""
+        yield from self._cells
+
+    @property
+    def flexible(self) -> bool:
+        """Check if this column is flexible, i.e. ``ratio`` has been set."""
+        return self.ratio is not None
+
+
+@dataclass
+class Row:
+    """Information regarding a row: its optional style and whether it ends a section."""
+
+    style: Optional[StyleType] = None
+    """Style to apply to row."""
+
+    end_section: bool = False
+    """Indicated end of section, which will force a line beneath the row."""
+
+
+class _Cell(NamedTuple):
+    """A single cell in a table, as yielded by ``Table._get_cells`` (style, renderable, vertical)."""
+
+    style: StyleType
+    """Style to apply to cell."""
+    renderable: "RenderableType"
+    """Cell renderable."""
+    vertical: VerticalAlignMethod
+    """Cell vertical alignment."""
+
+
+class Table(JupyterMixin):
+    """A console renderable to draw a table.
+
+    Args:
+        *headers (Union[Column, str]): Column headers, either as a string, or :class:`~rich.table.Column` instance.
+        title (Union[str, Text], optional): The title of the table rendered at the top. Defaults to None.
+        caption (Union[str, Text], optional): The table caption rendered below. Defaults to None.
+        width (int, optional): The width in characters of the table, or ``None`` to automatically fit. Defaults to None.
+        min_width (Optional[int], optional): The minimum width of the table, or ``None`` for no minimum. Defaults to None.
+        box (box.Box, optional): One of the constants in box.py used to draw the edges (see :ref:`appendix_box`), or ``None`` for no box lines. Defaults to box.HEAVY_HEAD.
+        safe_box (Optional[bool], optional): Disable box characters that don't display on windows legacy terminal with *raster* fonts. Defaults to True.
+        padding (PaddingDimensions, optional): Padding for cells (top, right, bottom, left). Defaults to (0, 1).
+        collapse_padding (bool, optional): Enable collapsing of padding around cells. Defaults to False.
+        pad_edge (bool, optional): Enable padding of edge cells. Defaults to True.
+        expand (bool, optional): Expand the table to fit the available space if ``True``, otherwise the table width will be auto-calculated. Defaults to False.
+        show_header (bool, optional): Show a header row. Defaults to True.
+        show_footer (bool, optional): Show a footer row. Defaults to False.
+        show_edge (bool, optional): Draw a box around the outside of the table. Defaults to True.
+        show_lines (bool, optional): Draw lines between every row. Defaults to False.
+        leading (int, optional): Number of blank lines between rows (precludes ``show_lines``). Defaults to 0.
+        style (Union[str, Style], optional): Default style for the table. Defaults to "none".
+        row_styles (Iterable[Union[str, Style]], optional): Optional list of row styles, if more than one style is given then the styles will alternate. Defaults to None.
+        header_style (Union[str, Style], optional): Style of the header. Defaults to "table.header".
+        footer_style (Union[str, Style], optional): Style of the footer. Defaults to "table.footer".
+        border_style (Union[str, Style], optional): Style of the border. Defaults to None.
+        title_style (Union[str, Style], optional): Style of the title. Defaults to None.
+        caption_style (Union[str, Style], optional): Style of the caption. Defaults to None.
+        title_justify (str, optional): Justify method for title. Defaults to "center".
+        caption_justify (str, optional): Justify method for caption. Defaults to "center".
+        highlight (bool, optional): Highlight cell contents (if str). Defaults to False.
+    """
+
+    columns: List[Column]
+    rows: List[Row]
+
+    def __init__(
+        self,
+        *headers: Union[Column, str],
+        title: Optional[TextType] = None,
+        caption: Optional[TextType] = None,
+        width: Optional[int] = None,
+        min_width: Optional[int] = None,
+        box: Optional[box.Box] = box.HEAVY_HEAD,
+        safe_box: Optional[bool] = None,
+        padding: PaddingDimensions = (0, 1),
+        collapse_padding: bool = False,
+        pad_edge: bool = True,
+        expand: bool = False,
+        show_header: bool = True,
+        show_footer: bool = False,
+        show_edge: bool = True,
+        show_lines: bool = False,
+        leading: int = 0,
+        style: StyleType = "none",
+        row_styles: Optional[Iterable[StyleType]] = None,
+        header_style: Optional[StyleType] = "table.header",
+        footer_style: Optional[StyleType] = "table.footer",
+        border_style: Optional[StyleType] = None,
+        title_style: Optional[StyleType] = None,
+        caption_style: Optional[StyleType] = None,
+        title_justify: "JustifyMethod" = "center",
+        caption_justify: "JustifyMethod" = "center",
+        highlight: bool = False,
+    ) -> None:
+        self.columns: List[Column] = []
+        self.rows: List[Row] = []
+        self.title = title
+        self.caption = caption
+        self.width = width
+        self.min_width = min_width
+        self.box = box
+        self.safe_box = safe_box
+        self._padding = Padding.unpack(padding)  # unpacked once; read back through the ``padding`` property
+        self.pad_edge = pad_edge
+        self._expand = expand  # read through the ``expand`` property, which also looks at ``width``
+        self.show_header = show_header
+        self.show_footer = show_footer
+        self.show_edge = show_edge
+        self.show_lines = show_lines
+        self.leading = leading
+        self.collapse_padding = collapse_padding
+        self.style = style
+        self.header_style = header_style or ""  # None is stored as ""
+        self.footer_style = footer_style or ""  # None is stored as ""
+        self.border_style = border_style
+        self.title_style = title_style
+        self.caption_style = caption_style
+        self.title_justify: "JustifyMethod" = title_justify
+        self.caption_justify: "JustifyMethod" = caption_justify
+        self.highlight = highlight
+        self.row_styles: Sequence[StyleType] = list(row_styles or [])  # copied into a list so it can be indexed
+        append_column = self.columns.append
+        for header in headers:
+            if isinstance(header, str):
+                self.add_column(header=header)  # plain strings become columns with default settings
+            else:
+                header._index = len(self.columns)  # Column instances are adopted as-is and re-indexed
+                append_column(header)
+
+    @classmethod
+    def grid(
+        cls,
+        *headers: Union[Column, str],
+        padding: PaddingDimensions = 0,
+        collapse_padding: bool = True,
+        pad_edge: bool = False,
+        expand: bool = False,
+    ) -> "Table":
+        """Get a table with no lines, headers, or footer (box, header, footer and edge are all disabled).
+
+        Args:
+            *headers (Union[Column, str]): Column headers, either as a string, or :class:`~rich.table.Column` instance.
+            padding (PaddingDimensions, optional): Padding around cells. Defaults to 0.
+            collapse_padding (bool, optional): Enable collapsing of padding around cells. Defaults to True.
+            pad_edge (bool, optional): Enable padding around edges of table. Defaults to False.
+            expand (bool, optional): Expand the table to fit the available space if ``True``, otherwise the table width will be auto-calculated. Defaults to False.
+
+        Returns:
+            Table: A table instance (created through ``cls``, so subclasses get their own type).
+        """
+        return cls(
+            *headers,
+            box=None,  # no box lines at all
+            padding=padding,
+            collapse_padding=collapse_padding,
+            show_header=False,
+            show_footer=False,
+            show_edge=False,
+            pad_edge=pad_edge,
+            expand=expand,
+        )
+
+    @property
+    def expand(self) -> bool:
+        """Whether the table expands; setting a non-None self.width implies expand."""
+        return self._expand or self.width is not None
+
+    @expand.setter
+    def expand(self, expand: bool) -> None:
+        """Set the expand flag (the getter still returns True while ``self.width`` is set)."""
+        self._expand = expand
+
+    @property
+    def _extra_width(self) -> int:
+        """Get extra width to add to cell content: the columns taken up by box lines (0 without a box)."""
+        width = 0
+        if self.box and self.show_edge:
+            width += 2  # left and right outer edge
+        if self.box:
+            width += len(self.columns) - 1  # one divider between each pair of adjacent columns
+        return width
+
+    @property
+    def row_count(self) -> int:
+        """Get the current number of rows (the length of ``self.rows``)."""
+        return len(self.rows)
+
+    def get_row_style(self, console: "Console", index: int) -> StyleType:
+        """Get the style for row ``index``: the cycling ``row_styles`` entry plus the row's own style."""
+        style = Style.null()
+        if self.row_styles:
+            style += console.get_style(self.row_styles[index % len(self.row_styles)])  # styles repeat in order
+        row_style = self.rows[index].style
+        if row_style is not None:
+            style += console.get_style(row_style)  # the per-row style is added after the alternating style
+        return style
+
+    def __rich_measure__(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> Measurement:
+        max_width = options.max_width
+        if self.width is not None:
+            max_width = self.width  # an explicit table width replaces options.max_width
+        if max_width < 0:
+            return Measurement(0, 0)
+        # Column widths exclude box lines, so calculate them against the width left after those.
+        extra_width = self._extra_width
+        max_width = sum(
+            self._calculate_column_widths(
+                console, options.update_width(max_width - extra_width)
+            )
+        )
+        _measure_column = self._measure_column
+        # Measure every column again, this time against the sum of the calculated widths.
+        measurements = [
+            _measure_column(console, options.update_width(max_width), column)
+            for column in self.columns
+        ]
+        minimum_width = (
+            sum(measurement.minimum for measurement in measurements) + extra_width
+        )
+        maximum_width = (
+            sum(measurement.maximum for measurement in measurements) + extra_width
+            if (self.width is None)
+            else self.width  # with an explicit width, that width is the maximum
+        )
+        measurement = Measurement(minimum_width, maximum_width)
+        measurement = measurement.clamp(self.min_width)  # apply the table's min_width (may be None)
+        return measurement
+
+    @property
+    def padding(self) -> Tuple[int, int, int, int]:
+        """Get cell padding as a 4-tuple; it is unpacked elsewhere as (top, right, bottom, left)."""
+        return self._padding
+
+    @padding.setter
+    def padding(self, padding: PaddingDimensions) -> "Table":
+        """Set cell padding from any PaddingDimensions value; it is stored via ``Padding.unpack``."""
+        self._padding = Padding.unpack(padding)
+        return self  # NOTE(review): Python discards the return value of a property setter
+
+    def add_column(
+        self,
+        header: "RenderableType" = "",
+        footer: "RenderableType" = "",
+        *,
+        header_style: Optional[StyleType] = None,
+        footer_style: Optional[StyleType] = None,
+        style: Optional[StyleType] = None,
+        justify: "JustifyMethod" = "left",
+        vertical: "VerticalAlignMethod" = "top",
+        overflow: "OverflowMethod" = "ellipsis",
+        width: Optional[int] = None,
+        min_width: Optional[int] = None,
+        max_width: Optional[int] = None,
+        ratio: Optional[int] = None,
+        no_wrap: bool = False,
+    ) -> None:
+        """Add a column to the table (appended after any existing columns).
+
+        Args:
+            header (RenderableType, optional): Text or renderable for the header.
+                Defaults to "".
+            footer (RenderableType, optional): Text or renderable for the footer.
+                Defaults to "".
+            header_style (Union[str, Style], optional): Style for the header, or None for default. Defaults to None.
+            footer_style (Union[str, Style], optional): Style for the footer, or None for default. Defaults to None.
+            style (Union[str, Style], optional): Style for the column cells, or None for default. Defaults to None.
+            justify (JustifyMethod, optional): Alignment for cells. Defaults to "left".
+            vertical (VerticalAlignMethod, optional): Vertical alignment, one of "top", "middle", or "bottom". Defaults to "top".
+            overflow (OverflowMethod): Overflow method: "crop", "fold", "ellipsis". Defaults to "ellipsis".
+            width (int, optional): Desired width of column in characters, or None to fit to contents. Defaults to None.
+            min_width (Optional[int], optional): Minimum width of column, or ``None`` for no minimum. Defaults to None.
+            max_width (Optional[int], optional): Maximum width of column, or ``None`` for no maximum. Defaults to None.
+            ratio (int, optional): Flexible ratio for the column (requires ``Table.expand`` or ``Table.width``). Defaults to None.
+            no_wrap (bool, optional): Set to ``True`` to disable wrapping of this column. Defaults to False.
+        """
+
+        column = Column(
+            _index=len(self.columns),  # the new column's position in self.columns
+            header=header,
+            footer=footer,
+            header_style=header_style or "",  # None is stored as ""
+            footer_style=footer_style or "",  # None is stored as ""
+            style=style or "",  # None is stored as ""
+            justify=justify,
+            vertical=vertical,
+            overflow=overflow,
+            width=width,
+            min_width=min_width,
+            max_width=max_width,
+            ratio=ratio,
+            no_wrap=no_wrap,
+        )
+        self.columns.append(column)
+
+    def add_row(
+        self,
+        *renderables: Optional["RenderableType"],
+        style: Optional[StyleType] = None,
+        end_section: bool = False,
+    ) -> None:
+        """Add a row of renderables.
+
+        Args:
+            *renderables (None or renderable): Each cell in a row must be a renderable object (including str),
+                or ``None`` for a blank cell. Missing trailing cells are left blank; extra cells create new columns.
+            style (StyleType, optional): An optional style to apply to the entire row. Defaults to None.
+            end_section (bool, optional): End a section and draw a line. Defaults to False.
+
+        Raises:
+            errors.NotRenderableError: If you add something that can't be rendered.
+        """
+
+        def add_cell(column: Column, renderable: "RenderableType") -> None:
+            column._cells.append(renderable)
+
+        cell_renderables: List[Optional["RenderableType"]] = list(renderables)
+        # Pad a short row with None so that every existing column receives a cell.
+        columns = self.columns
+        if len(cell_renderables) < len(columns):
+            cell_renderables = [
+                *cell_renderables,
+                *[None] * (len(columns) - len(cell_renderables)),
+            ]
+        for index, renderable in enumerate(cell_renderables):
+            if index == len(columns):  # more cells than columns: create a column with default settings
+                column = Column(_index=index)
+                for _ in self.rows:  # give the new column an empty cell for each earlier row
+                    add_cell(column, Text(""))
+                self.columns.append(column)  # same list object as ``columns``, so its length grows too
+            else:
+                column = columns[index]
+            if renderable is None:
+                add_cell(column, "")  # None is stored as an empty string
+            elif is_renderable(renderable):
+                add_cell(column, renderable)
+            else:
+                raise errors.NotRenderableError(
+                    f"unable to render {type(renderable).__name__}; a string or other renderable object is required"
+                )
+        self.rows.append(Row(style=style, end_section=end_section))
+
+    def add_section(self) -> None:
+        """Add a new section (draw a line after current row). Does nothing when there are no rows yet."""
+
+        if self.rows:
+            self.rows[-1].end_section = True  # flag the most recently added row
+
+    def __rich_console__(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> "RenderResult":
+        if not self.columns:
+            yield Segment("\n")  # a table with no columns renders as a single newline
+            return
+
+        max_width = options.max_width
+        if self.width is not None:
+            max_width = self.width  # an explicit table width replaces options.max_width
+
+        extra_width = self._extra_width
+        widths = self._calculate_column_widths(
+            console, options.update_width(max_width - extra_width)
+        )
+        table_width = sum(widths) + extra_width  # columns plus box lines
+
+        render_options = options.update(
+            width=table_width, highlight=self.highlight, height=None
+        )
+        # Helper shared by the title and the caption: both are rendered at the table width.
+        def render_annotation(
+            text: TextType, style: StyleType, justify: "JustifyMethod" = "center"
+        ) -> "RenderResult":
+            render_text = (
+                console.render_str(text, style=style, highlight=False)
+                if isinstance(text, str)
+                else text  # non-str text is rendered as given; ``style`` is not applied to it here
+            )
+            return console.render(
+                render_text, options=render_options.update(justify=justify)
+            )
+
+        if self.title:
+            yield from render_annotation(
+                self.title,
+                style=Style.pick_first(self.title_style, "table.title"),
+                justify=self.title_justify,
+            )
+        yield from self._render(console, render_options, widths)  # the table body itself
+        if self.caption:
+            yield from render_annotation(
+                self.caption,
+                style=Style.pick_first(self.caption_style, "table.caption"),
+                justify=self.caption_justify,
+            )
+
+    def _calculate_column_widths(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> List[int]:
+        """Calculate the widths of each column, including padding, not including borders."""
+        max_width = options.max_width
+        columns = self.columns
+        width_ranges = [
+            self._measure_column(console, options, column) for column in columns
+        ]
+        widths = [_range.maximum or 1 for _range in width_ranges]  # start from each maximum, at least 1
+        get_padding_width = self._get_padding_width
+        extra_width = self._extra_width
+        if self.expand:
+            ratios = [col.ratio or 0 for col in columns if col.flexible]
+            if any(ratios):
+                fixed_widths = [
+                    0 if column.flexible else _range.maximum
+                    for _range, column in zip(width_ranges, columns)
+                ]
+                flex_minimum = [
+                    (column.width or 1) + get_padding_width(column._index)
+                    for column in columns
+                    if column.flexible
+                ]
+                flexible_width = max_width - sum(fixed_widths)  # what is left for the flexible columns
+                flex_widths = ratio_distribute(flexible_width, ratios, flex_minimum)
+                iter_flex_widths = iter(flex_widths)
+                for index, column in enumerate(columns):
+                    if column.flexible:
+                        widths[index] = fixed_widths[index] + next(iter_flex_widths)  # fixed part is 0 here
+        table_width = sum(widths)
+
+        if table_width > max_width:  # too wide: shrink columns that are allowed to wrap first
+            widths = self._collapse_widths(
+                widths,
+                [(column.width is None and not column.no_wrap) for column in columns],
+                max_width,
+            )
+            table_width = sum(widths)
+            # last resort, reduce columns evenly
+            if table_width > max_width:
+                excess_width = table_width - max_width
+                widths = ratio_reduce(excess_width, [1] * len(widths), widths, widths)
+                table_width = sum(widths)
+            # Re-measure each column at its reduced width.
+            width_ranges = [
+                self._measure_column(console, options.update_width(width), column)
+                for width, column in zip(widths, columns)
+            ]
+            widths = [_range.maximum or 0 for _range in width_ranges]  # NOTE(review): table_width is not recomputed after this
+        # Pad the columns out when expanding, or when the table is narrower than min_width.
+        if (table_width < max_width and self.expand) or (
+            self.min_width is not None and table_width < (self.min_width - extra_width)
+        ):
+            _max_width = (
+                max_width
+                if self.min_width is None
+                else min(self.min_width - extra_width, max_width)
+            )
+            pad_widths = ratio_distribute(_max_width - table_width, widths)
+            widths = [_width + pad for _width, pad in zip(widths, pad_widths)]
+
+        return widths
+
+    @classmethod
+    def _collapse_widths(
+        cls, widths: List[int], wrapable: List[bool], max_width: int
+    ) -> List[int]:
+        """Reduce widths so that the total is under max_width.
+
+        Args:
+            widths (List[int]): List of widths.
+            wrapable (List[bool]): List of booleans that indicate if a column may shrink.
+            max_width (int): Maximum width to reduce to.
+
+        Returns:
+            List[int]: A new list of widths (may still exceed max_width if nothing more can shrink).
+        """
+        total_width = sum(widths)
+        excess_width = total_width - max_width
+        if any(wrapable):  # with no shrinkable column the widths are returned unchanged
+            while total_width and excess_width > 0:
+                max_column = max(
+                    width for width, allow_wrap in zip(widths, wrapable) if allow_wrap
+                )
+                second_max_column = max(
+                    width if allow_wrap and width != max_column else 0
+                    for width, allow_wrap in zip(widths, wrapable)
+                )
+                column_difference = max_column - second_max_column
+                ratios = [
+                    (1 if (width == max_column and allow_wrap) else 0)
+                    for width, allow_wrap in zip(widths, wrapable)
+                ]  # only the widest shrinkable column(s) are reduced in this pass
+                if not any(ratios) or not column_difference:
+                    break
+                max_reduce = [min(excess_width, column_difference)] * len(widths)  # at most down to the next widest
+                widths = ratio_reduce(excess_width, ratios, max_reduce, widths)
+
+                total_width = sum(widths)
+                excess_width = total_width - max_width
+        return widths
+
+    def _get_cells(
+        self, console: "Console", column_index: int, column: Column
+    ) -> Iterable[_Cell]:
+        """Get all the cells with padding and optional header and footer."""
+
+        collapse_padding = self.collapse_padding
+        pad_edge = self.pad_edge
+        padding = self.padding
+        any_padding = any(padding)  # False when all four padding values are 0
+
+        first_column = column_index == 0
+        last_column = column_index == len(self.columns) - 1
+
+        _padding_cache: Dict[Tuple[bool, bool], Tuple[int, int, int, int]] = {}
+        # Padding depends only on (first_row, last_row) within one column, so results are cached.
+        def get_padding(first_row: bool, last_row: bool) -> Tuple[int, int, int, int]:
+            cached = _padding_cache.get((first_row, last_row))
+            if cached:
+                return cached
+            top, right, bottom, left = padding
+
+            if collapse_padding:
+                if not first_column:
+                    left = max(0, left - right)
+                if not last_row:
+                    bottom = max(0, top - bottom)  # NOTE(review): operand order differs from ``left - right`` above; confirm intended
+
+            if not pad_edge:  # strip padding on whichever sides touch the table edge
+                if first_column:
+                    left = 0
+                if last_column:
+                    right = 0
+                if first_row:
+                    top = 0
+                if last_row:
+                    bottom = 0
+            _padding = (top, right, bottom, left)
+            _padding_cache[(first_row, last_row)] = _padding
+            return _padding
+
+        raw_cells: List[Tuple[StyleType, "RenderableType"]] = []
+        _append = raw_cells.append
+        get_style = console.get_style
+        if self.show_header:
+            header_style = get_style(self.header_style or "") + get_style(
+                column.header_style
+            )  # table-level header style combined with the column's own
+            _append((header_style, column.header))
+        cell_style = get_style(column.style or "")
+        for cell in column.cells:
+            _append((cell_style, cell))
+        if self.show_footer:
+            footer_style = get_style(self.footer_style or "") + get_style(
+                column.footer_style
+            )  # table-level footer style combined with the column's own
+            _append((footer_style, column.footer))
+
+        if any_padding:
+            _Padding = Padding
+            for first, last, (style, renderable) in loop_first_last(raw_cells):
+                yield _Cell(
+                    style,
+                    _Padding(renderable, get_padding(first, last)),
+                    getattr(renderable, "vertical", None) or column.vertical,  # a renderable's own ``vertical`` wins
+                )
+        else:
+            for style, renderable in raw_cells:
+                yield _Cell(
+                    style,
+                    renderable,  # no padding configured, so the renderable is not wrapped
+                    getattr(renderable, "vertical", None) or column.vertical,
+                )
+
+    def _get_padding_width(self, column_index: int) -> int:
+        """Get extra width from padding: left plus right padding for the column at ``column_index``."""
+        _, pad_right, _, pad_left = self.padding
+        if self.collapse_padding:
+            if column_index > 0:  # the first column keeps its full left padding
+                pad_left = max(0, pad_left - pad_right)
+        return pad_left + pad_right
+
+    def _measure_column(
+        self,
+        console: "Console",
+        options: "ConsoleOptions",
+        column: Column,
+    ) -> Measurement:
+        """Get the minimum and maximum width of the column (padding included)."""
+
+        max_width = options.max_width
+        if max_width < 1:
+            return Measurement(0, 0)  # no room at all
+
+        padding_width = self._get_padding_width(column._index)
+
+        if column.width is not None:
+            # Fixed width column
+            return Measurement(
+                column.width + padding_width, column.width + padding_width
+            ).with_maximum(max_width)
+        # Flexible column, we need to measure contents
+        min_widths: List[int] = []
+        max_widths: List[int] = []
+        append_min = min_widths.append
+        append_max = max_widths.append
+        get_render_width = Measurement.get
+        for cell in self._get_cells(console, column._index, column):
+            _min, _max = get_render_width(console, options, cell.renderable)
+            append_min(_min)
+            append_max(_max)
+        # With no cells to measure, fall back to a minimum of 1 and a maximum of max_width.
+        measurement = Measurement(
+            max(min_widths) if min_widths else 1,
+            max(max_widths) if max_widths else max_width,
+        ).with_maximum(max_width)
+        measurement = measurement.clamp(
+            None if column.min_width is None else column.min_width + padding_width,
+            None if column.max_width is None else column.max_width + padding_width,
+        )  # the column's min_width/max_width exclude padding, so it is added back here
+        return measurement
+
+    def _render(
+        self, console: "Console", options: "ConsoleOptions", widths: List[int]
+    ) -> "RenderResult":
+        """Yield the segments that draw the table, one row at a time.
+
+        Args:
+            console: Console used to resolve styles and to render cell contents.
+            options: Base render options; updated per cell with the column's
+                width, justify, no_wrap and overflow settings.
+            widths: One width per column, paired positionally with ``self.columns``.
+
+        Yields:
+            Segments for the optional top edge, each row (with dividers and
+            separator lines), and the optional bottom edge.
+        """
+        table_style = console.get_style(self.style or "")
+
+        border_style = table_style + console.get_style(self.border_style or "")
+        # Cells are produced column by column; zip(*...) transposes them so
+        # that each entry of row_cells holds one cell per column.
+        _column_cells = (
+            self._get_cells(console, column_index, column)
+            for column_index, column in enumerate(self.columns)
+        )
+        row_cells: List[Tuple[_Cell, ...]] = list(zip(*_column_cells))
+        # No box at all when self.box is falsy; otherwise let the box adapt
+        # itself to the render options (safe_box falls back to the console's).
+        _box = (
+            self.box.substitute(
+                options, safe=pick_bool(self.safe_box, console.safe_box)
+            )
+            if self.box
+            else None
+        )
+        # Without a header row, switch to the box's plain-headed variant.
+        _box = _box.get_plain_headed_box() if _box and not self.show_header else _box
+
+        new_line = Segment.line()
+
+        # Bind frequently used attributes to locals for the row loop below.
+        columns = self.columns
+        show_header = self.show_header
+        show_footer = self.show_footer
+        show_edge = self.show_edge
+        show_lines = self.show_lines
+        leading = self.leading
+
+        _Segment = Segment
+        if _box:
+            # (left edge, right edge, column divider) triples.
+            # NOTE(review): index 1 holds the *foot* glyphs and index 2 the
+            # *mid* glyphs, while the lookup in the row loop uses index 1 for
+            # middle rows and index 2 for the last row -- confirm this pairing
+            # is intended.
+            box_segments = [
+                (
+                    _Segment(_box.head_left, border_style),
+                    _Segment(_box.head_right, border_style),
+                    _Segment(_box.head_vertical, border_style),
+                ),
+                (
+                    _Segment(_box.foot_left, border_style),
+                    _Segment(_box.foot_right, border_style),
+                    _Segment(_box.foot_vertical, border_style),
+                ),
+                (
+                    _Segment(_box.mid_left, border_style),
+                    _Segment(_box.mid_right, border_style),
+                    _Segment(_box.mid_vertical, border_style),
+                ),
+            ]
+            if show_edge:
+                yield _Segment(_box.get_top(widths), border_style)
+                yield new_line
+        else:
+            box_segments = []
+
+        get_row_style = self.get_row_style
+        get_style = console.get_style
+
+        for index, (first, last, row_cell) in enumerate(loop_first_last(row_cells)):
+            header_row = first and show_header
+            footer_row = last and show_footer
+            # show_header is used as 0/1 here to skip the header when
+            # indexing into the data rows; header/footer rows have no Row.
+            row = (
+                self.rows[index - show_header]
+                if (not header_row and not footer_row)
+                else None
+            )
+            max_height = 1
+            cells: List[List[List[Segment]]] = []
+            if header_row or footer_row:
+                row_style = Style.null()
+            else:
+                row_style = get_style(
+                    get_row_style(console, index - 1 if show_header else index)
+                )
+            # Render every cell of the row to lines, tracking the tallest one.
+            for width, cell, column in zip(widths, row_cell, columns):
+                render_options = options.update(
+                    width=width,
+                    justify=column.justify,
+                    no_wrap=column.no_wrap,
+                    overflow=column.overflow,
+                    height=None,
+                )
+                lines = console.render_lines(
+                    cell.renderable,
+                    render_options,
+                    style=get_style(cell.style) + row_style,
+                )
+                max_height = max(max_height, len(lines))
+                cells.append(lines)
+
+            # Height that align_cell pads to (max() raises if the row has no cells).
+            row_height = max(len(cell) for cell in cells)
+
+            def align_cell(
+                cell: List[List[Segment]],
+                vertical: "VerticalAlignMethod",
+                width: int,
+                style: Style,
+            ) -> List[List[Segment]]:
+                # Header cells are always bottom-aligned and footer cells
+                # top-aligned, regardless of the cell's own setting.
+                if header_row:
+                    vertical = "bottom"
+                elif footer_row:
+                    vertical = "top"
+
+                if vertical == "top":
+                    return _Segment.align_top(cell, width, row_height, style)
+                elif vertical == "middle":
+                    return _Segment.align_middle(cell, width, row_height, style)
+                return _Segment.align_bottom(cell, width, row_height, style)
+
+            # Replace each cell's lines with an aligned version shaped to
+            # (width, max_height).
+            cells[:] = [
+                _Segment.set_shape(
+                    align_cell(
+                        cell,
+                        _cell.vertical,
+                        width,
+                        get_style(_cell.style) + row_style,
+                    ),
+                    width,
+                    max_height,
+                )
+                for width, _cell, cell, column in zip(widths, row_cell, cells, columns)
+            ]
+
+            if _box:
+                # The footer separator is drawn *before* the footer row.
+                if last and show_footer:
+                    yield _Segment(
+                        _box.get_row(widths, "foot", edge=show_edge), border_style
+                    )
+                    yield new_line
+                left, right, _divider = box_segments[0 if first else (2 if last else 1)]
+
+                # If the column divider is whitespace also style it with the row background
+                divider = (
+                    _divider
+                    if _divider.text.strip()
+                    else _Segment(
+                        _divider.text, row_style.background_style + _divider.style
+                    )
+                )
+                # Emit the row line by line: edge, cells joined by dividers, edge.
+                for line_no in range(max_height):
+                    if show_edge:
+                        yield left
+                    for last_cell, rendered_cell in loop_last(cells):
+                        yield from rendered_cell[line_no]
+                        if not last_cell:
+                            yield divider
+                    if show_edge:
+                        yield right
+                    yield new_line
+            else:
+                # No box: cells are emitted back to back with no dividers.
+                for line_no in range(max_height):
+                    for rendered_cell in cells:
+                        yield from rendered_cell[line_no]
+                    yield new_line
+            # The header separator is drawn *after* the header row.
+            if _box and first and show_header:
+                yield _Segment(
+                    _box.get_row(widths, "head", edge=show_edge), border_style
+                )
+                yield new_line
+            end_section = row and row.end_section
+            if _box and (show_lines or leading or end_section):
+                # Skip the separator after the last row, after the row that
+                # precedes a footer, and after the header row (each of those
+                # is handled elsewhere in this loop or by the bottom edge).
+                if (
+                    not last
+                    and not (show_footer and index >= len(row_cells) - 2)
+                    and not (show_header and header_row)
+                ):
+                    if leading:
+                        # The "mid" row is repeated `leading` times.
+                        yield _Segment(
+                            _box.get_row(widths, "mid", edge=show_edge) * leading,
+                            border_style,
+                        )
+                    else:
+                        yield _Segment(
+                            _box.get_row(widths, "row", edge=show_edge), border_style
+                        )
+                    yield new_line
+
+        if _box and show_edge:
+            yield _Segment(_box.get_bottom(widths), border_style)
+            yield new_line
+
+
+# Manual demo: builds one sample table and prints it several times, changing
+# a few Table attributes between prints. Statement order matters because every
+# step mutates the same `table` object.
+if __name__ == "__main__":  # pragma: no cover
+    from pip._vendor.rich.console import Console
+    from pip._vendor.rich.highlighter import ReprHighlighter
+    from pip._vendor.rich.table import Table as Table
+
+    from ._timer import timer
+
+    # The whole demo, including printing, runs inside the timer context.
+    with timer("Table render"):
+        table = Table(
+            title="Star Wars Movies",
+            caption="Rich example table",
+            caption_justify="right",
+        )
+
+        table.add_column(
+            "Released", header_style="bright_cyan", style="cyan", no_wrap=True
+        )
+        table.add_column("Title", style="magenta")
+        table.add_column("Box Office", justify="right", style="green")
+
+        table.add_row(
+            "Dec 20, 2019",
+            "Star Wars: The Rise of Skywalker",
+            "$952,110,690",
+        )
+        table.add_row("May 25, 2018", "Solo: A Star Wars Story", "$393,151,347")
+        # This row carries its own style and ends a section.
+        table.add_row(
+            "Dec 15, 2017",
+            "Star Wars Ep. V111: The Last Jedi",
+            "$1,332,539,889",
+            style="on black",
+            end_section=True,
+        )
+        table.add_row(
+            "Dec 16, 2016",
+            "Rogue One: A Star Wars Story",
+            "$1,332,439,889",
+        )
+
+        def header(text: str) -> None:
+            # Print a highlighted rule with a blank line above and below.
+            # `console` and `highlight` are bound below, before the first call.
+            console.print()
+            console.rule(highlight(text))
+            console.print()
+
+        console = Console()
+        highlight = ReprHighlighter()
+        header("Example Table")
+        console.print(table, justify="center")
+
+        # Variation: expand=True.
+        table.expand = True
+        header("expand=True")
+        console.print(table)
+
+        # Variation: fixed width (expand is still True from the step above).
+        table.width = 50
+        header("width=50")
+
+        console.print(table, justify="center")
+
+        # Variation: reset width/expand and alternate row styles.
+        table.width = None
+        table.expand = False
+        table.row_styles = ["dim", "none"]
+        header("row_styles=['dim', 'none']")
+
+        console.print(table, justify="center")
+
+        # Variation: leading=1 (the first three assignments repeat the
+        # previous state).
+        table.width = None
+        table.expand = False
+        table.row_styles = ["dim", "none"]
+        table.leading = 1
+        header("leading=1, row_styles=['dim', 'none']")
+        console.print(table, justify="center")
+
+        # Variation: show_lines=True with leading reset to 0.
+        table.width = None
+        table.expand = False
+        table.row_styles = ["dim", "none"]
+        table.show_lines = True
+        table.leading = 0
+        header("show_lines=True, row_styles=['dim', 'none']")
+        console.print(table, justify="center")
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/theme.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/theme.py
new file mode 100644
index 0000000000000000000000000000000000000000..471dfb2f9271c073f0713ca98f8db2f89c975071
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/theme.py
@@ -0,0 +1,115 @@
+import configparser
+from typing import Dict, List, IO, Mapping, Optional
+
+from .default_styles import DEFAULT_STYLES
+from .style import Style, StyleType
+
+
+class Theme:
+    """A container for style information, used by :class:`~rich.console.Console`.
+
+    Args:
+        styles (Dict[str, Style], optional): A mapping of style names on to styles. Defaults to None for a theme with no styles.
+        inherit (bool, optional): Inherit default styles. Defaults to True.
+    """
+
+    styles: Dict[str, Style]
+
+    def __init__(
+        self, styles: Optional[Mapping[str, StyleType]] = None, inherit: bool = True
+    ):
+        # Start from a copy of the defaults so the shared DEFAULT_STYLES
+        # mapping is never mutated by this theme.
+        self.styles = DEFAULT_STYLES.copy() if inherit else {}
+        if styles is not None:
+            # Values that are not already Style instances are parsed from
+            # their style definition.
+            self.styles.update(
+                {
+                    name: style if isinstance(style, Style) else Style.parse(style)
+                    for name, style in styles.items()
+                }
+            )
+
+    @property
+    def config(self) -> str:
+        """Get contents of a config file for this theme.
+
+        Returns:
+            str: A ``[styles]`` section with one ``name = style`` line per
+                style, sorted by style name.
+        """
+        config = "[styles]\n" + "\n".join(
+            f"{name} = {style}" for name, style in sorted(self.styles.items())
+        )
+        return config
+
+    @classmethod
+    def from_file(
+        cls, config_file: IO[str], source: Optional[str] = None, inherit: bool = True
+    ) -> "Theme":
+        """Load a theme from a text mode file.
+
+        Args:
+            config_file (IO[str]): An open conf file.
+            source (str, optional): The filename of the open file. Defaults to None.
+            inherit (bool, optional): Inherit default styles. Defaults to True.
+
+        Returns:
+            Theme: A new instance of the class this method was called on.
+        """
+        config = configparser.ConfigParser()
+        config.read_file(config_file, source=source)
+        styles = {name: Style.parse(value) for name, value in config.items("styles")}
+        # Build through ``cls`` rather than ``Theme`` so that subclasses calling
+        # ``from_file`` / ``read`` get an instance of their own type.
+        return cls(styles, inherit=inherit)
+
+    @classmethod
+    def read(
+        cls, path: str, inherit: bool = True, encoding: Optional[str] = None
+    ) -> "Theme":
+        """Read a theme from a path.
+
+        Args:
+            path (str): Path to a config file readable by Python configparser module.
+            inherit (bool, optional): Inherit default styles. Defaults to True.
+            encoding (str, optional): Encoding of the config file. Defaults to None.
+
+        Returns:
+            Theme: A new theme instance.
+        """
+        with open(path, "rt", encoding=encoding) as config_file:
+            return cls.from_file(config_file, source=path, inherit=inherit)
+
+
+class ThemeStackError(Exception):
+    """Base exception for errors related to the theme stack.
+
+    Raised by :meth:`ThemeStack.pop_theme` when only the base theme is left.
+    """
+
+
+class ThemeStack:
+    """A stack of style mappings; lookups are answered by the top entry.
+
+    Args:
+        theme (Theme): The base theme. Its ``styles`` mapping becomes the
+            bottom entry of the stack.
+    """
+
+    def __init__(self, theme: Theme) -> None:
+        base_styles = theme.styles
+        self._entries: List[Dict[str, Style]] = [base_styles]
+        # ``get`` is the bound ``dict.get`` of whichever entry is on top.
+        self.get = base_styles.get
+
+    def push_theme(self, theme: Theme, inherit: bool = True) -> None:
+        """Place a new set of styles on top of the stack.
+
+        Args:
+            theme (Theme): The theme whose styles are pushed.
+            inherit (boolean, optional): When true, start from the styles
+                currently on top and overlay the new theme's styles on them.
+        """
+        layered: Dict[str, Style]
+        if inherit:
+            layered = {**self._entries[-1], **theme.styles}
+        else:
+            layered = theme.styles.copy()
+        self._entries.append(layered)
+        self.get = layered.get
+
+    def pop_theme(self) -> None:
+        """Discard the top-most entry; the base theme can never be removed."""
+        if len(self._entries) == 1:
+            raise ThemeStackError("Unable to pop base theme")
+        del self._entries[-1]
+        self.get = self._entries[-1].get
+
+
+# When run as a script, print the config text of a default Theme().
+if __name__ == "__main__":  # pragma: no cover
+    theme = Theme()
+    print(theme.config)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e468bf8cebda1f507a30ff65584c13f6ba59d389
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__init__.py
@@ -0,0 +1,36 @@
+"""Verify certificates using native system trust stores"""
+
+# Imported under a private alias; the name is deleted again at the bottom of
+# this module.
+import sys as _sys
+
+if _sys.version_info < (3, 10):
+    raise ImportError("truststore requires Python 3.10 or later")
+
+# Detect Python runtimes which don't implement SSLObject.get_unverified_chain() API
+# This API only became public in Python 3.13 but was available in CPython and PyPy since 3.10.
+if _sys.version_info < (3, 13):
+    try:
+        import ssl as _ssl
+    except ImportError:
+        raise ImportError("truststore requires the 'ssl' module")
+    else:
+        # Build a throwaway in-memory SSL object purely to probe for the API;
+        # the same MemoryBIO is passed for both BIO arguments.
+        _sslmem = _ssl.MemoryBIO()
+        _sslobj = _ssl.create_default_context().wrap_bio(
+            _sslmem,
+            _sslmem,
+        )
+        try:
+            # Follow the chain of wrapped ``_sslobj`` attributes until an
+            # object exposing get_unverified_chain is found. Running out of
+            # ``_sslobj`` attributes raises AttributeError, handled below.
+            while not hasattr(_sslobj, "get_unverified_chain"):
+                _sslobj = _sslobj._sslobj  # type: ignore[attr-defined]
+        except AttributeError:
+            raise ImportError(
+                "truststore requires peer certificate chain APIs to be available"
+            ) from None
+
+        # Remove the probe objects from the module namespace.
+        del _ssl, _sslobj, _sslmem  # noqa: F821
+
+from ._api import SSLContext, extract_from_ssl, inject_into_ssl  # noqa: E402
+
+# Importing from the ``_api`` submodule also binds ``_api`` in this package
+# namespace; drop it and the ``sys`` alias so only the public names remain.
+del _api, _sys  # type: ignore[name-defined] # noqa: F821
+
+__all__ = ["SSLContext", "inject_into_ssl", "extract_from_ssl"]
+__version__ = "0.10.0"
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b57d409718ab5277e8bfb470cec40461727c906
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_guards.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_guards.py
new file mode 100644
index 0000000000000000000000000000000000000000..d25a759988d4b3e81308aaf122df32d333f2a964
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_guards.py
@@ -0,0 +1,879 @@
+from __future__ import annotations
+
+import contextlib
+
+import dataclasses
+import enum
+import functools
+import logging
+import threading
+import traceback
+import unittest.mock
+import weakref
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    NamedTuple,
+    Optional,
+    Set,
+    Tuple,
+    TYPE_CHECKING,
+    TypeVar,
+)
+
+import torch
+from torch.utils import _pytree as pytree
+from torch.utils._traceback import CapturedTraceback
+from torch.utils.weak import WeakTensorKeyDictionary
+
+log = logging.getLogger(__name__)
+
+
+if TYPE_CHECKING:
+    # Import the following modules during type checking to enable code intelligence features,
+    # such as auto-completion in tools like pylance, even when these modules are not explicitly
+    # imported in user code.
+
+    import sympy
+
+
+"""
+torch._guards is the definitional source of truth for general purpose guard structures.
+
+An important thing to keep in mind here is the preservation of layering. There should be no dynamo notions,
+and no guard installation notions here.
+"""
+
+
+class CompileId(NamedTuple):
+    """Identifier of one compilation, rendered as ``frame_id/frame_compile_id``."""
+
+    frame_id: int
+    # Counts compilations of this particular frame. A per-frame counter (as
+    # opposed to one global id) makes it easier to see at a glance how many
+    # recompiles a given frame has gone through.
+    frame_compile_id: int
+    # TODO: consider also tracking the recompilation count
+
+    def __str__(self):
+        frame, nth_compile = self
+        return f"{frame}/{nth_compile}"
+
+
+class TraceId(NamedTuple):
+    """A compile id plus the number of the analysis attempt within it."""
+
+    compile_id: CompileId
+    # Begins at 0 and is incremented by one each time analysis is restarted.
+    attempt: int
+
+    def __str__(self):
+        # Only attempts after the first get an ``_<attempt>`` suffix.
+        if self.attempt != 0:
+            return f"{self.compile_id}_{self.attempt}"
+        return str(self.compile_id)
+
+
+class GuardSource(enum.Enum):
+    """Enumerates the kinds of source a guard can be attached to."""
+
+    LOCAL = 0
+    GLOBAL = 1
+    LOCAL_NN_MODULE = 2
+    GLOBAL_NN_MODULE = 3
+    CONSTANT = 4
+    RANDOM_VALUE = 5
+    SHAPE_ENV = 6
+    LOCAL_FSDP_MODULE = 7
+    GLOBAL_FSDP_MODULE = 8
+    BACKWARD_STATE = 9
+    EPHEMERAL = 10
+    SYNTHETIC_LOCAL = 11
+
+    def is_fsdp_module(self) -> bool:
+        """Return True for either of the two FSDP module sources."""
+        fsdp_sources = (
+            GuardSource.GLOBAL_FSDP_MODULE,
+            GuardSource.LOCAL_FSDP_MODULE,
+        )
+        return self in fsdp_sources
+
+    def is_nn_module(self) -> bool:
+        """Return True for the NN module sources; FSDP modules count too."""
+        nn_sources = (
+            GuardSource.GLOBAL_NN_MODULE,
+            GuardSource.LOCAL_NN_MODULE,
+        )
+        if self in nn_sources:
+            return True
+        return self.is_fsdp_module()
+
+    def is_local(self):
+        """Return True for the three ``LOCAL*`` sources listed below."""
+        local_sources = (
+            GuardSource.LOCAL,
+            GuardSource.LOCAL_NN_MODULE,
+            GuardSource.LOCAL_FSDP_MODULE,
+        )
+        return self in local_sources
+
+
+"""
+Base class for a "GuardBuilder" role.
+
+The GuardBuilderBase role is to represent a scope within which to build a guard. The name is a little
+confusing, as it's not a builder, but it is kept for the sake of avoiding a lot of renames and keeping the
+original reference to torchdynamo's GuardBuilder.
+
+Note: create_fn is invoked with a GuardBuilderBase and a Guard. A GuardBuilder is chosen based
+on GuardSource's select function.
+
+There is value in keeping this GuardBuilderBase empty to keep layering clean.
+"""
+
+
+class GuardBuilderBase:
+    """Intentionally empty base type for guard builders."""
+
+
+# Pairs a sympy expression with a captured traceback.
+# NOTE(review): presumably `stack` records where the guard on `expr` was
+# created -- confirm against the code that constructs ShapeGuard.
+class ShapeGuard(NamedTuple):
+    expr: sympy.Expr
+    stack: CapturedTraceback
+
+
+# A single guard: the source it originates from plus the function that
+# creates the guard, along with export/debug bookkeeping that is filled in
+# later (see set_export_info and the stack fields).
+@dataclasses.dataclass
+class Guard:
+    # originating_source is the source that called the make_guard method to
+    # construct this guard object. The property name specifies what exactly it
+    # is the guard is guarding on.  The meaning of the name is dependent on the
+    # create_fn; you must look at the use-site inside create_fn to know what
+    # name means.
+    #
+    # That being said, although you might think this is just a "name", name is
+    # usually an arbitrary Python expression that will be evaluated with all
+    # globals (and locals, if you create a LOCAL guard) to extract the Python
+    # object that we want to perform guard tests on.  This evaluation
+    # typically happens in GuardBuilder.eval.  In these cases, name is
+    # typically produced by originating_source.name() (not to be confused with
+    # GuardSource - the property source).
+    #
+    # Occasionally, name is not a valid Python expression; sometimes
+    # it is meaningless.  Example create_fns that are like this include
+    # GRAD_MODE and SHAPE_ENV.
+    originating_source: Source
+    create_fn: Callable[[GuardBuilderBase, Guard], None]
+
+    # Export only. These values are written to at time of guard check_fn creation.
+    guard_types: Optional[List[str]] = None
+    code_list: Optional[List[str]] = None
+    obj_weakref: Optional[object] = None
+    guarded_class_weakref: Optional[type] = None
+
+    # Debug stacks; GuardsSet.add fills these in when they are still None.
+    stack: Optional[CapturedTraceback] = None
+    user_stack: Optional[traceback.StackSummary] = None
+    # Lazily computed cache for __hash__.
+    _hash: Optional[int] = None
+
+    def __hash__(self):
+        """Hash on (name, source, id(create_fn)); computed once and cached."""
+        if self._hash is None:
+            self._hash = hash((self.name, self.source, id(self.create_fn)))
+        return self._hash
+
+    def sort_key(self):
+        """Return the tuple used by __lt__ to order guards.
+
+        Ordering is by source value (-1 when the source is falsy), then name
+        length, then name, then the first line number of the create function.
+        """
+        return (
+            self.source.value if self.source else -1,
+            len(self.name),
+            self.name,
+            self.inner_create_fn().__code__.co_firstlineno,
+        )
+
+    def __lt__(self, other):
+        return self.sort_key() < other.sort_key()
+
+    def inner_create_fn(self):
+        """Return create_fn, unwrapping one level of functools.partial."""
+        if isinstance(self.create_fn, functools.partial):
+            return self.create_fn.func
+        else:
+            return self.create_fn
+
+    @property
+    def name(self) -> str:
+        # Delegates to the originating source.
+        return self.originating_source.name()
+
+    @property
+    def source(self) -> GuardSource:
+        # Delegates to the originating source.
+        return self.originating_source.guard_source()
+
+    @staticmethod
+    def weakref_to_str(obj_weakref):
+        """
+        This is a workaround of a Python weakref bug.
+
+        `obj_weakref` is instance returned by `weakref.ref`,
+        `str(obj_weakref)` is buggy if the original obj overrides __getattr__, e.g:
+
+            class MyConfig(dict):
+                def __getattr__(self, x):
+                    return self[x]
+
+            obj = MyConfig(offset=5)
+            obj_weakref = weakref.ref(obj)
+            str(obj_weakref)  # raise error: KeyError: '__name__'
+        """
+        if isinstance(obj_weakref, weakref.ReferenceType):
+            obj = obj_weakref()
+            if obj is not None:
+                # Format by hand, using the class name rather than str().
+                return f"<weakref at {hex(id(obj_weakref))}; to '{obj.__class__.__name__}' at {hex(id(obj))}>"
+            else:
+                return f"<weakref at {hex(id(obj_weakref))}; dead>"
+        else:
+            return str(obj_weakref)
+
+    def __repr__(self):
+        # NOTE(review): the 'obj_weakref' line in the text below has no
+        # trailing comma, unlike the entries above it -- confirm whether the
+        # output is meant to be dict-like before relying on its format.
+        s = f"""
+        {self.source.name.lower() if self.source else ""} {repr(self.name)} {self.inner_create_fn().__name__}
+        {{
+            'guard_types': {self.guard_types},
+            'code': {self.code_list},
+            'obj_weakref': {self.weakref_to_str(self.obj_weakref)}
+            'guarded_class': {self.guarded_class_weakref}
+        }}
+        """
+        return s
+
+    def __str__(self):
+        """Return a multi-line, human-readable description of the guard."""
+        output = f"Name: {repr(self.name)}\n"
+        source = self.source.name.lower() if self.source else ""
+        output += f"    Source: {source}\n"
+        output += f"    Create Function: {self.inner_create_fn().__name__}\n"
+        output += f"    Guard Types: {self.guard_types}\n"
+        output += f"    Code List: {self.code_list}\n"
+        output += f"    Object Weakref: {self.weakref_to_str(self.obj_weakref)}\n"
+        output += f"    Guarded Class Weakref: {self.guarded_class_weakref}\n"
+        return output
+
+    def create(self, builder: GuardBuilderBase):
+        """Invoke create_fn(builder, self) and return its result.
+
+        Any exception is logged (with the last frames of the creation stack,
+        when one was recorded) and then re-raised unchanged.
+        """
+        try:
+            return self.create_fn(builder, self)
+        except Exception:
+            log.error("Error while creating guard:\n%s", str(self).rstrip())
+            if self.stack:
+                log.error("Created at:\n%s", "".join(self.stack.format()[-4:]).rstrip())
+            raise
+
+    def is_nn_module(self):
+        return self.source.is_nn_module()
+
+    def is_fsdp_module(self):
+        return self.source.is_fsdp_module()
+
+    def is_local(self):
+        return self.source.is_local()
+
+    def set_export_info(self, guard_type, guarded_class, code_list, obj_weakref):
+        """Record export information on this guard.
+
+        May be called more than once: guard types and code accumulate, while
+        the guarded class and object must match what was recorded earlier
+        (or nothing may have been recorded yet).
+        """
+        if not self.guard_types:
+            self.guard_types = list()
+
+        self.guard_types.append(guard_type)
+
+        assert self.guarded_class_weakref in (
+            guarded_class,
+            None,
+        ), "Guarded class id must be identical, or None"
+        self.guarded_class_weakref = guarded_class
+
+        # The first non-empty code_list is stored as-is (same list object as
+        # the caller's); later calls extend that stored list in place.
+        if not self.code_list:
+            self.code_list = code_list
+        else:
+            self.code_list.extend(code_list)
+
+        assert self.obj_weakref in (
+            obj_weakref,
+            None,
+        ), "Guarded object must be identical, or None"
+        self.obj_weakref = obj_weakref
+
+
+T = TypeVar("T")
+
+"""
+Parent structure for guard env expressions.
+A GuardEnvExpr can have any subtype.
+Note: All subtypes must be handled exhaustively in
+torch._dynamo.guards._parse_guard_env_guards to avoid a RuntimeError.
+"""
+
+
+# Field-less dataclass base; DuplicateInputs below derives from it.
+@dataclasses.dataclass
+class GuardEnvExpr:
+    pass
+
+
+"""
+A class representing a pair of duplicate inputs.
+input_pos_a and input_pos_b are input positions we have deduped.
+"""
+
+
+# Records two input sources; construction asserts that they are not equal.
+@dataclasses.dataclass
+class DuplicateInputs(GuardEnvExpr):
+    input_source_a: Source
+    input_source_b: Source
+
+    def __post_init__(self):
+        # The two sources must compare unequal to each other.
+        assert self.input_source_a != self.input_source_b
+
+
+"""
+Checkpointable is an interface for driving state snapshotting, left purposely vague for now.
+
+copy_graphstate() -> T, a somewhat legacy name, is expected to emit a snapshot of any type that
+can also be taken in at restore_graphstate(T) calls.
+
+When to snapshot, is, at the moment, an implementation detail of upstream callers. Checkpointable
+does not provide any guarantees around consistency, idempotency, or safety of calling its APIs, yet.
+
+In the future, it will have a closer coupling to a generic Checkpoint management system.
+"""
+
+
+class Checkpointable(ABC, Generic[T]):
+    """Interface for objects whose state can be snapshotted as a ``T`` and restored."""
+
+    @abstractmethod
+    def copy_graphstate(self) -> T:
+        """Return a snapshot of the current state."""
+
+    @abstractmethod
+    def restore_graphstate(self, state: T):
+        """Restore the state from a snapshot produced by copy_graphstate."""
+
+
+class GuardsCheckpointState:
+    """
+    The GuardsCheckpointState - it is the T of Checkpointable[T] for GuardsContext
+    """
+
+    dynamo_guards: Set[Guard] = set()
+
+    def __init__(self, dynamo_guards):
+        self.dynamo_guards = dynamo_guards
+
+    def diff(self, other):
+        """
+        Compute the guards held by this state that ``other`` does not hold.
+
+        Returns None when there are no such guards, otherwise the set() of
+        Guard objects that are only present on this side.
+        """
+        only_here = self.dynamo_guards.difference(other.dynamo_guards)
+        return only_here if only_here else None
+
+    def __eq__(self, other):
+        # Equality is defined as "diff against other finds nothing".
+        delta = self.diff(other)
+        return delta is None
+
+
+class ModuleContextCheckpointState:
+    """Snapshot of a ModuleContext's ``nn_modules`` mapping."""
+
+    nn_modules: Dict[str, torch.nn.Module] = {}
+
+    def __init__(self, nn_modules):
+        self.nn_modules = nn_modules
+
+    def diff(self, other):
+        """
+        Compute the module keys held by this state that ``other`` does not hold.
+
+        Returns None when there are no such keys, otherwise the set() of
+        module key names that are only present on this side.
+        """
+        own_keys = set(self.nn_modules.keys())
+        other_keys = set(other.nn_modules.keys())
+        only_here = own_keys.difference(other_keys)
+        return only_here if only_here else None
+
+    def __eq__(self, other):
+        # Equality is defined as "diff against other finds nothing".
+        delta = self.diff(other)
+        return delta is None
+
+
+class ModuleContext(Checkpointable[ModuleContextCheckpointState]):
+    """Checkpointable holder of a name -> module mapping."""
+
+    def __init__(self):
+        self.nn_modules: Dict[str, Any] = {}
+
+    def copy_graphstate(self):
+        """Return a checkpoint holding a shallow copy of ``nn_modules``."""
+        snapshot = dict(self.nn_modules)
+        return ModuleContextCheckpointState(snapshot)
+
+    def restore_graphstate(self, state):
+        """Adopt the mapping stored in ``state`` (no copy is made)."""
+        assert isinstance(state, ModuleContextCheckpointState)
+        restored = state.nn_modules
+        self.nn_modules = restored
+
+
+class GlobalContextCheckpointState:
+    """Snapshot of a GlobalContext's ``global_state`` mapping."""
+
+    global_state: Dict[str, Tuple[Callable, ...]] = {}
+
+    def __init__(self, global_states):
+        self.global_state = global_states
+
+    def diff(self, other):
+        """
+        Compute the global keys held by this state that ``other`` does not hold.
+
+        Returns None when there are no such keys, otherwise the set() of
+        global key names that are only present on this side.
+        """
+        own_keys = set(self.global_state.keys())
+        other_keys = set(other.global_state.keys())
+        only_here = own_keys.difference(other_keys)
+        return only_here if only_here else None
+
+    def __eq__(self, other):
+        # Equality is defined as "diff against other finds nothing".
+        delta = self.diff(other)
+        return delta is None
+
+
+class GlobalContext(Checkpointable[GlobalContextCheckpointState]):
+    """
+    Tracks global torch state while a function is being traced,
+    for instance torch.is_grad_enabled.
+    """
+
+    _supported_global_states = {
+        "grad_enabled",
+        "torch_function_enabled",
+        "autocast_enabled",
+        "autocast_cpu_enabled",
+        "autocast_gpu_dtype",
+        "autocast_cpu_dtype",
+        "autocast_cache_enabled",
+    }
+
+    def __init__(self):
+        self.global_state: Dict[str, Tuple[Callable, ...]] = {}
+
+    def copy_graphstate(self):
+        """Return a checkpoint holding a shallow copy of ``global_state``."""
+        snapshot = dict(self.global_state)
+        return GlobalContextCheckpointState(snapshot)
+
+    def restore_graphstate(self, state):
+        """Adopt the checkpointed mapping and re-apply every recorded setting.
+
+        The mapping must contain exactly the supported keys; each value is a
+        ``(func, args)`` pair and ``func(args)`` is called for every entry.
+        """
+        assert isinstance(state, GlobalContextCheckpointState)
+        self.global_state = state.global_state
+        supported = self._supported_global_states
+        assert (
+            len(self.global_state) == len(supported)
+            and set(self.global_state.keys()) == supported
+        ), "Global state mismatch"
+        for setter, value in self.global_state.values():
+            setter(value)
+
+
+"""
+A GuardsContext is a checkpointable representation of all the guards in the current tracing
+context. Its lifecycle is bound 1:1 to the tracing context, and it should never be instantiated
+directly outside of it. For passing around internal state representations of this object,
+prefer to extract them with copy_graphstate to produce a GuardsCheckpointState.
+"""
+
+
+# Like a Set[Guard] but will record the user stack on all guards at the
+# time they were installed at their destination
+class GuardsSet:
+    """Set-like wrapper around guards that stamps debug stacks on insertion."""
+
+    def __init__(self, inner=None):
+        # A supplied set is used as-is (not copied).
+        self.inner = set() if inner is None else inner
+
+    def __iter__(self):
+        return iter(self.inner)
+
+    def __len__(self):
+        return len(self.inner)
+
+    # Subtraction along with bool is typically used to determine the delta of
+    # added guards between checkpoints for higher order ops
+    def __sub__(self, other):
+        remaining = self.inner - other.inner
+        return GuardsSet(remaining)
+
+    def __bool__(self):
+        return bool(self.inner)
+
+    def add(self, guard: Guard, *, collect_debug_stack=True, skip=0):
+        """Insert ``guard`` unless it is already present.
+
+        With ``collect_debug_stack`` set, a missing ``stack`` / ``user_stack``
+        on the guard is filled in before insertion; ``skip`` is added to the
+        number of frames skipped when the traceback is captured.
+        """
+        if guard in self.inner:
+            return
+        if collect_debug_stack and guard.stack is None:
+            guard.stack = CapturedTraceback.extract(skip=1 + skip)
+        if collect_debug_stack and guard.user_stack is None:
+            guard.user_stack = TracingContext.extract_stack()
+        self.inner.add(guard)
+
+    def update(self, *others: Set[Guard]):
+        """Add every guard from each of the given collections."""
+        for collection in others:
+            for guard in collection:
+                # skip=1 accounts for this extra frame between caller and add().
+                self.add(guard, skip=1)
+
+    def remove_guards_with_source(self, source):
+        """Delete all guards with a given source"""
+        kept = {
+            guard for guard in self.inner if guard.originating_source != source
+        }
+        self.inner = kept
+
+
+class GuardsContext(Checkpointable[GuardsCheckpointState]):
+    """Holds the dynamo guard set and the list of aotautograd guard expressions."""
+
+    def __init__(self):
+        self.dynamo_guards: GuardsSet = GuardsSet()
+        self.aotautograd_guards: List[GuardEnvExpr] = []
+
+    def copy_graphstate(self):
+        """Return a checkpoint holding a copy of the current dynamo guards."""
+        snapshot = set(self.dynamo_guards.inner)
+        return GuardsCheckpointState(snapshot)
+
+    def restore_graphstate(self, state):
+        """Replace the dynamo guards with those stored in ``state``."""
+        # NB: "steals" the passed in state
+        assert isinstance(state, GuardsCheckpointState)
+        restored = GuardsSet(state.dynamo_guards)
+        self.dynamo_guards = restored
+
+
+_TLS = threading.local()
+
+"""
+TracingContext is the source of truth for all currently accumulated information
+needed to trace. Its lifecycle is kept 1:1 when using TorchDynamo, but other systems
+are open to managing their own TracingContext with that in mind.
+
+The purpose of TracingContext is not to be a dumping ground, or god object, but rather to avoid
+having to plumb complex subsystems across multiple verticals.
+
+Ex: A common example is guard accumulation between dynamo, shape_env, aot_autograd, and inductor.
+Accessing the current tracing context via
+TracingContext.get() allows users to accumulate their own guards for processing, without needing to know how
+to plumb objects back up to where frame interpretation happened.
+
+Note that you can end up with multiple TracingContext for a single compilation
+of a frame, as we reset the TracingContext whenever we restart analysis.
+CompileContext is a more overarching context that encompasses multiple restarts.
+"""
+
+
+class CompileContext:
+    """
+    Per-compilation state: the ``compile_id`` of the compilation and an
+    ``attempt`` counter that starts at 0.
+
+    The current instance is stored on the thread-local ``_TLS`` and is
+    reached through the static accessors below.
+    """
+
+    @staticmethod
+    def get() -> CompileContext:
+        """Return the installed CompileContext; asserts that one is installed."""
+        # Go through try_get() so that a thread which never installed a
+        # compile context trips this assertion rather than an AttributeError
+        # from the thread-local attribute lookup.
+        ctx = CompileContext.try_get()
+        assert ctx is not None
+        return ctx
+
+    @staticmethod
+    def try_get() -> Optional[CompileContext]:
+        """Return the installed CompileContext, or None if there is none."""
+        return getattr(_TLS, "compile_context", None)
+
+    def __init__(self, compile_id):
+        assert compile_id is None or isinstance(compile_id, CompileId)
+        self.compile_id: Optional[CompileId] = compile_id
+        self.attempt = 0
+
+    @staticmethod
+    def current_compile_id():
+        """Return the installed context's ``compile_id``, or None without a context."""
+        ctx = CompileContext.try_get()
+        if ctx is None:
+            return None
+        return ctx.compile_id
+
+    @staticmethod
+    def current_trace_id():
+        """
+        Return ``TraceId(compile_id, attempt)`` for the installed context, or
+        None when there is no context or its ``compile_id`` is None.
+        """
+        ctx = CompileContext.try_get()
+        if ctx is None or ctx.compile_id is None:
+            return None
+        return TraceId(ctx.compile_id, ctx.attempt)
+
+
+class TracingContext:
+    """
+    Bundle of state accumulated while tracing: guards, module and global
+    contexts, the fake mode, and the user frame stack used for error
+    reporting.
+
+    The current instance is stored on the thread-local ``_TLS`` and is
+    reached through the static ``try_get()`` / ``get()`` accessors.
+    """
+
+    @staticmethod
+    def try_get() -> Optional[TracingContext]:
+        """
+        Provides the currently installed TracingContext, or None.
+
+        Invocations outside of `with tracing()` (see below) are valid but
+        will return None.
+        """
+        return getattr(_TLS, "tracing_context", None)
+
+    @staticmethod
+    def get() -> TracingContext:
+        """
+        Return the currently installed TracingContext.
+
+        Raises:
+            RuntimeError: if ``try_get()`` yields nothing truthy.
+        """
+        if ctx := TracingContext.try_get():
+            return ctx
+        raise RuntimeError(
+            "TracingContext.get() must be called within an ongoing trace."
+        )
+
+    def __init__(self, fake_mode):
+        self.guards_context = GuardsContext()
+        self.module_context = ModuleContext()
+        self.global_context = GlobalContext()
+        self.fake_mode = fake_mode
+        self.frame_summary_stack = []
+        # This is morally part of frame_summary_stack, but it is kept separate
+        # for clarity.  As we process a frame, this variable gets updated
+        # to keep track of what line we are in the function.  We make a
+        # function call, this gets cleared and the frame location is pushed
+        # to frame_summary_stack (prepping this variable for the inner frame's
+        # progress)
+        self.loc_in_frame = None
+        # this is only set after aot_autograd
+        self.fw_metadata = None
+        self.params_flat = None
+        # this is for extended return calling convention from backend
+        # compiler to aot_autograd
+        # Per output, what the compiler specified stride of the output is,
+        # or None if no stride is known.  This is always the HINT, it
+        # is never a SymInt (it would be better if it was a SymInt, but
+        # I can't conveniently get this from Inductor atm.  Also, be
+        # careful not to accidentally induce guards on the SymInt if
+        # you ever do change this in aot_autograd.py; you should check
+        # on permutations preferentially.)
+        self.output_strides: Optional[List[Optional[List[int]]]] = None
+        # When this is True, whenever we encounter an int in Dynamo tracing,
+        # we will (1) force unspec it and (2) force it as a size-like unbacked
+        # integer.  This is currently used when processing certain lists of
+        # ints that are known to be size-like and may have 0/1 entries that we
+        # must not specialize on.
+        self.force_unspec_int_unbacked_size_like = False
+        # See note [Tensor Fakification and Symbol Caching]
+        self.tensor_to_context = WeakTensorKeyDictionary()
+
+        # If this is true, AOT Autograd will return output Fake Tensors with
+        # appropriate meta on the first invocation
+        # see note: [Returning Fake Tensors on First AOT Autograd Call]
+        self.fakify_first_call = False
+
+    def clear(self):
+        """Reset ``global_context.global_state`` to an empty dict."""
+        # Look at the note in output_graph.py in function `save_global_state`
+        # for the context on clearing global context.
+        self.global_context.global_state = {}
+
+    @staticmethod
+    @contextmanager
+    def patch(**kwargs):
+        """
+        Temporarily set attributes on the current TracingContext.
+
+        Each keyword names an existing attribute; the previous values are
+        captured before anything is changed and are put back when the
+        ``with`` block exits, including on error.
+        """
+        prior = {}
+        ctx = TracingContext.get()
+
+        for key in kwargs.keys():
+            # AttributeError on invalid entry (getattr has no default here)
+            prior[key] = getattr(ctx, key)
+        for key, val in kwargs.items():
+            setattr(ctx, key, val)
+        try:
+            yield
+        finally:
+            for key, val in prior.items():
+                setattr(ctx, key, val)
+
+    @staticmethod
+    def extract_stack():
+        """
+        Return the current user stack as a ``traceback.StackSummary``.
+
+        The summary is ``frame_summary_stack`` followed by ``loc_in_frame``
+        when that is set; with no installed context it is empty.
+        """
+        self = TracingContext.try_get()
+        if self is None:
+            return traceback.StackSummary()
+        stack = self.frame_summary_stack
+        if self.loc_in_frame is not None:
+            # `+` builds a new list, frame_summary_stack itself is not mutated
+            stack = stack + [self.loc_in_frame]
+        return traceback.StackSummary.from_list(stack)
+
+    # Call this when you want to call into some code that isn't necessarily
+    # associated with the current frame state
+    @staticmethod
+    @contextlib.contextmanager
+    def clear_frame():
+        """
+        Run the ``with`` body with an empty ``frame_summary_stack`` and no
+        ``loc_in_frame``; both are patched back afterwards.  An exception
+        leaving the body without a ``real_stack`` attribute gets
+        ``real_stack = None`` before it is re-raised.
+        """
+        tc = TracingContext.get()
+        with unittest.mock.patch.object(
+            tc, "frame_summary_stack", []
+        ), unittest.mock.patch.object(tc, "loc_in_frame", None):
+            try:
+                yield
+            except Exception as e:
+                # Prevent real_stack from getting attached
+                #
+                # The invariant is that if an Exception has real_stack, we've
+                # appropriately attached a user stack and we no longer need to
+                # attach anything. Because we cannot conveniently interpose
+                # when an exception is thrown, we instead interpose everywhere
+                # we set what the user stack is set (using the context
+                # manager). However, our compiler stack does "tail calls"
+                # (when it calls into user compiler), at which point the
+                # parent exception frames would incorrectly attach an
+                # incorrect frame.
+                #
+                # However, if, somehow, someone raised an exception with this
+                # scope that had a stack (for example, because they are
+                # restoring the user stack state appropriately as they process
+                # node by node), we should respect it. Thus, we cannot
+                # unconditionally set None.
+                if not hasattr(e, "real_stack"):
+                    e.real_stack = None  # type: ignore[attr-defined]
+                raise
+
+    @staticmethod
+    @contextlib.contextmanager
+    def current_frame(frame_summary):
+        """
+        Push ``frame_summary`` (if not None) onto ``frame_summary_stack`` for
+        the duration of the ``with`` body and clear ``loc_in_frame``; both
+        are restored on exit.  An exception leaving the body without a
+        ``real_stack`` attribute gets the current ``extract_stack()`` result.
+        """
+        # frame_summary can be None to solely take advantage of real_stack
+        # attachment to thrown exceptions
+        tc = TracingContext.get()
+        if frame_summary is not None:
+            tc.frame_summary_stack.append(frame_summary)
+        old = tc.loc_in_frame
+        tc.loc_in_frame = None
+        try:
+            yield
+        except Exception as e:
+            # Runs before the finally below, so the pushed frame is still on
+            # the stack when it is extracted
+            if not hasattr(e, "real_stack"):
+                e.real_stack = tc.extract_stack()  # type: ignore[attr-defined]
+            raise
+        finally:
+            if frame_summary is not None:
+                tc.frame_summary_stack.pop()
+            tc.loc_in_frame = old
+
+    @staticmethod
+    @contextlib.contextmanager
+    def report_output_strides():
+        """
+        Yield a fresh list installed as ``output_strides`` on the current
+        context, restoring the previous value on exit.  Yields None (and
+        touches nothing) when no context is installed.
+        """
+        tc = TracingContext.try_get()
+        if tc is None:
+            yield None
+            return
+        old_output_strides = tc.output_strides
+        tc.output_strides = []
+        try:
+            yield tc.output_strides
+        finally:
+            tc.output_strides = old_output_strides
+
+    @staticmethod
+    def set_current_loc(filename, lineno, frame_name):
+        """Record the given location as ``loc_in_frame`` of the current context."""
+        TracingContext.get().loc_in_frame = traceback.FrameSummary(
+            filename, lineno, frame_name
+        )
+
+
+@contextmanager
+def compile_context(context: CompileContext):
+    """
+    Install ``context`` as the thread-local compile context for the duration
+    of the ``with`` block and yield it; whatever was installed before (or
+    None) is put back on exit, including on error.
+    """
+    previous = getattr(_TLS, "compile_context", None)
+    _TLS.compile_context = context
+    try:
+        yield context
+    finally:
+        _TLS.compile_context = previous
+
+
+@contextmanager
+def tracing(context: Optional[TracingContext]):
+    """
+    Install ``context`` as the thread-local (dynamically scoped) tracing
+    context for the duration of the ``with`` block, and yield it.
+
+    While no context is installed, ``TracingContext.try_get()`` returns None
+    and ``TracingContext.get()`` raises ``RuntimeError``.
+
+    On exit the previously installed context is restored and, when the
+    context carries a fake mode with a shape env, ``shape_env.cleanup()`` is
+    called.  An exception escaping the block that has no ``real_stack``
+    attribute gets the context's extracted stack attached before it is
+    re-raised.
+    """
+    previous = getattr(_TLS, "tracing_context", None)
+    _TLS.tracing_context = context
+    try:
+        yield context
+    except Exception as e:
+        if context is not None and not hasattr(e, "real_stack"):
+            e.real_stack = context.extract_stack()  # type: ignore[attr-defined]
+        raise
+    finally:
+        fake_mode = None if context is None else context.fake_mode
+        shape_env = None if fake_mode is None else fake_mode.shape_env
+        if shape_env is not None:
+            shape_env.cleanup()
+        _TLS.tracing_context = previous
+
+
+# Subclasses can be found in torch/_dynamo/source.py
+# TODO(voz): Consider a toplevel torch/_source.py
+@dataclasses.dataclass(frozen=True)
+class Source:
+    """
+    Base class for sources.
+
+    The boolean queries default to ``False``; ``reconstruct``,
+    ``guard_source`` and ``name`` raise ``NotImplementedError`` here and are
+    left for subclasses to provide.
+    """
+
+    def is_dict_key(self):
+        return False
+
+    def is_ephemeral(self):
+        return False
+
+    def reconstruct(self, codegen):
+        raise NotImplementedError
+
+    def guard_source(self) -> GuardSource:
+        raise NotImplementedError
+
+    def name(self) -> str:
+        raise NotImplementedError
+
+    def make_guard(self, fn) -> Guard:
+        """
+        Return ``Guard(self, fn)``.
+
+        Raises:
+            NotImplementedError: if this source's guard source is
+                ``GuardSource.CONSTANT``.
+        """
+        origin = self.guard_source()
+        if origin is GuardSource.CONSTANT:
+            raise NotImplementedError
+        return Guard(self, fn)
+
+    def is_nn_module(self) -> bool:
+        origin = self.guard_source()
+        return origin.is_nn_module()
+
+    def subguards_allowed(self):
+        """True if you can guard on attributes of this"""
+        origin = self.guard_source()
+        return origin != GuardSource.SYNTHETIC_LOCAL
+
+
+# Subclasses can be found in torch/_dynamo/source.py
+@dataclasses.dataclass(frozen=True)
+class ChainedSource(Source):
+    """A source built on top of another source, held in ``base``."""
+
+    base: Source
+
+    def is_dict_key(self):
+        # Recurse until you either hit a ConstDictKey or a Source
+        parent = self.base
+        return parent.is_dict_key()
+
+    def is_ephemeral(self):
+        # Same delegation: the answer comes from the underlying base source
+        parent = self.base
+        return parent.is_ephemeral()
+
+
+def detect_fake_mode(inputs: Any = None):
+    """
+    Attempts to "detect" what the current fake mode is.
+
+    Candidates are gathered from these places, in order of priority:
+
+        - The fake mode of the ambient TracingContext, if there is one
+        - Fake modes currently active on the dispatch mode stack
+        - Fake modes associated with passed in tensors (inputs does not
+          have to be flattened)
+
+    The first candidate is returned, after asserting that every other
+    candidate is the very same object.  Returns None without candidates.
+    """
+    from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
+    from torch.utils._python_dispatch import _get_current_dispatch_mode_stack
+
+    # Each entry is (mode, description of where it came from, index there)
+    candidates = []
+
+    context = TracingContext.try_get()
+    ambient = context.fake_mode if context else None
+    if ambient is not None:
+        candidates.append((ambient, "tracing context", 0))
+
+    candidates.extend(
+        (mode, "active fake mode", depth)
+        for depth, mode in enumerate(reversed(_get_current_dispatch_mode_stack()))
+        if isinstance(mode, FakeTensorMode)
+    )
+
+    candidates.extend(
+        (leaf.fake_mode, "fake tensor input", position)
+        for position, leaf in enumerate(pytree.tree_leaves(inputs))
+        if isinstance(leaf, FakeTensor)
+    )
+
+    if not candidates:
+        return None
+
+    fake_mode, desc1, i1 = candidates[0]
+    for m, desc2, i2 in candidates[1:]:
+        assert fake_mode is m, (
+            f"fake mode ({fake_mode}) from {desc1} {i1} doesn't match mode ({m}) from {desc2} {i2}\n\n"
+            f"fake mode from {desc1} {i1} allocated at:\n{fake_mode.stack}\n"
+            f"fake mode from {desc2} {i2} allocated at:\n{m.stack}"
+        )
+    return fake_mode
+
+
+def active_fake_mode():
+    """
+    Inspects the dispatch mode stack for an active fake mode and returns it.
+    Returns None if no fake mode is active.
+    """
+    from torch._subclasses.fake_tensor import FakeTensorMode
+    from torch.utils._python_dispatch import _get_current_dispatch_mode_stack
+
+    # Scan the stack in reverse order and hand back the first FakeTensorMode
+    # found; the position is not needed, so no enumerate().
+    for mode in reversed(_get_current_dispatch_mode_stack()):
+        if isinstance(mode, FakeTensorMode):
+            return mode
+
+    return None
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_sources.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_sources.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f56bd8ef2473aa9c35ad6232448c9d5d44b8056
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_sources.py
@@ -0,0 +1,137 @@
+import ast
+import functools
+import inspect
+from textwrap import dedent
+from typing import Any, List, NamedTuple, Optional, Tuple
+
+from torch._C import ErrorReport
+from torch._C._jit_tree_views import SourceRangeFactory
+
+
+def get_source_lines_and_file(
+    obj: Any,
+    error_msg: Optional[str] = None,
+) -> Tuple[List[str], int, Optional[str]]:
+    """
+    Wrapper around inspect.getsourcelines and inspect.getsourcefile.
+
+    Args:
+        obj: the object whose source is looked up.
+        error_msg: optional extra text appended (on its own line) to the
+            error message when the lookup fails.
+
+    Returns: (sourcelines, file_lineno, filename)
+
+    Raises:
+        OSError: if ``inspect`` raises ``OSError``; the original error is
+            chained as the cause.
+    """
+    try:
+        filename = inspect.getsourcefile(obj)
+        sourcelines, file_lineno = inspect.getsourcelines(obj)
+    except OSError as e:
+        parts = [
+            f"Can't get source for {obj}. TorchScript requires source access in "
+            "order to carry out compilation, make sure original .py files are "
+            "available."
+        ]
+        if error_msg:
+            parts.append(error_msg)
+        raise OSError("\n".join(parts)) from e
+    else:
+        return sourcelines, file_lineno, filename
+
+
+def normalize_source_lines(sourcelines: List[str]) -> List[str]:
+    """
+    Re-indent source lines so that none of them sits to the left of the
+    function definition.
+
+    The indentation of the first line starting with ``def`` is taken as the
+    reference.  On every other line one leading copy of that indentation is
+    dropped if present, and the indentation is then prepended again, so
+    lines that already were at or beyond that level are unchanged while
+    comments and continued string literals at a lower indentation get
+    pushed right.
+
+    Args:
+        sourcelines: function source code, separated into lines
+    Returns:
+        A new list of aligned source lines, or ``sourcelines`` itself when
+        no ``def`` line is found.
+    """
+    # Find the line number of the function definition
+    def_index = next(
+        (
+            number
+            for number, line in enumerate(sourcelines)
+            if line.lstrip().startswith("def")
+        ),
+        None,
+    )
+
+    # This will happen when the function is a lambda- we won't find "def" anywhere in the source
+    # lines in that case. Currently trying to JIT compile a lambda will throw an error up in
+    # `parse_def()`, but we might want to handle this case in the future.
+    if def_index is None:
+        return sourcelines
+
+    # Everything before the first "def" on that line is its leading whitespace
+    def_line = sourcelines[def_index]
+    indent = def_line.partition("def")[0]
+
+    def realign(line):
+        if line.startswith(indent):
+            line = line[len(indent) :]
+        return indent + line
+
+    # The `def` line itself is carried over untouched
+    before = [realign(line) for line in sourcelines[:def_index]]
+    after = [realign(line) for line in sourcelines[def_index + 1 :]]
+    return before + [def_line] + after
+
+
+# Thin wrapper around SourceRangeFactory to store extra metadata
+# about the function-to-be-compiled.
+class SourceContext(SourceRangeFactory):
+    """
+    ``SourceRangeFactory`` that additionally remembers ``filename``,
+    ``funcname`` and ``uses_true_division`` as plain attributes.
+    """
+
+    def __init__(
+        self,
+        source,
+        filename,
+        file_lineno,
+        leading_whitespace_len,
+        uses_true_division=True,
+        funcname=None,
+    ):
+        # The base factory only receives the four positional pieces
+        super().__init__(source, filename, file_lineno, leading_whitespace_len)
+        self.funcname = funcname
+        self.filename = filename
+        self.uses_true_division = uses_true_division
+
+
+@functools.lru_cache(maxsize=None)
+def make_source_context(*ctx_args):
+    """
+    Memoized ``SourceContext`` constructor: calls with equal positional
+    arguments share one instance.  The cache is unbounded.
+    """
+    context = SourceContext(*ctx_args)
+    return context
+
+
+def fake_range():
+    """Return the raw range (0, 1) of a ``SourceContext`` over an empty source with no filename."""
+    empty_context = SourceContext("", None, 0, 0)
+    return empty_context.make_raw_range(0, 1)
+
+
+# Result record produced by `parse_def()` for a single function.
+class ParsedDef(NamedTuple):
+    # AST obtained by parsing the dedented source
+    ast: ast.Module
+    # SourceContext built for the (non-dedented) source
+    ctx: SourceContext
+    # the normalized source lines joined into one string
+    source: str
+    # file the source was read from, as reported by `inspect` (may be None)
+    filename: Optional[str]
+    # line number of the source within that file, as reported by `inspect`
+    file_lineno: int
+
+
+def parse_def(fn):
+    """
+    Read, normalize and parse the source of ``fn``.
+
+    Returns:
+        A ``ParsedDef`` holding the AST of the dedented source, a
+        ``SourceContext`` for it, the normalized source text, and the file
+        name / line number the source was found at.
+
+    Raises:
+        RuntimeError: if the parsed source is anything other than exactly
+            one top-level ``def``.
+    """
+    raw_lines, file_lineno, filename = get_source_lines_and_file(
+        fn, ErrorReport.call_stack()
+    )
+    source = "".join(normalize_source_lines(raw_lines))
+    dedent_src = dedent(source)
+    py_ast = ast.parse(dedent_src)
+
+    top_level = py_ast.body
+    is_single_function = len(top_level) == 1 and isinstance(
+        top_level[0], ast.FunctionDef
+    )
+    if not is_single_function:
+        raise RuntimeError(
+            f"Expected a single top-level function: {filename}:{file_lineno}"
+        )
+
+    # How much indentation `dedent` stripped, measured on the first line
+    first_line_raw = source.split("\n", 1)[0]
+    first_line_dedented = dedent_src.split("\n", 1)[0]
+    leading_whitespace_len = len(first_line_raw) - len(first_line_dedented)
+
+    ctx = make_source_context(
+        source, filename, file_lineno, leading_whitespace_len, True, fn.__name__
+    )
+    return ParsedDef(py_ast, ctx, source, filename, file_lineno)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_tensor_docs.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_tensor_docs.py
new file mode 100644
index 0000000000000000000000000000000000000000..2543177fdd4615f2afdea3ece5916639ff6dc0a6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_tensor_docs.py
@@ -0,0 +1,6976 @@
+"""Adds docstrings to Tensor functions"""
+
+import torch._C
+from torch._C import _add_docstr as add_docstr
+from torch._torch_docs import parse_kwargs, reproducibility_notes
+
+
+def add_docstr_all(method, docstr):
+    """Attach ``docstr`` to the attribute named ``method`` of ``torch._C.TensorBase``."""
+    target = getattr(torch._C.TensorBase, method)
+    add_docstr(target, docstr)
+
+
+# Shared argument description for `memory_format`, run through
+# `parse_kwargs`.  NOTE(review): presumably expanded into docstrings further
+# down the file, like `new_common_args` - its uses are not visible here.
+common_args = parse_kwargs(
+    """
+    memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+        returned Tensor. Default: ``torch.preserve_format``.
+"""
+)
+
+# Shared argument descriptions for the `new_*` factory methods.  The result
+# is expanded with `.format(**new_common_args)` into the `{dtype}`,
+# `{device}`, `{requires_grad}`, `{layout}` and `{pin_memory}` placeholders
+# of the docstrings registered below.
+new_common_args = parse_kwargs(
+    """
+    size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+        shape of the output tensor.
+    dtype (:class:`torch.dtype`, optional): the desired type of returned tensor.
+        Default: if None, same :class:`torch.dtype` as this tensor.
+    device (:class:`torch.device`, optional): the desired device of returned tensor.
+        Default: if None, same :class:`torch.device` as this tensor.
+    requires_grad (bool, optional): If autograd should record operations on the
+        returned tensor. Default: ``False``.
+    pin_memory (bool, optional): If set, returned tensor would be allocated in
+        the pinned memory. Works only for CPU tensors. Default: ``False``.
+    layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+        Default: ``torch.strided``.
+"""
+)
+
+add_docstr_all(
+    "new_tensor",
+    """
+new_tensor(data, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \
+pin_memory=False) -> Tensor
+"""
+    + r"""
+
+Returns a new Tensor with :attr:`data` as the tensor data.
+By default, the returned Tensor has the same :class:`torch.dtype` and
+:class:`torch.device` as this tensor.
+
+.. warning::
+
+    :func:`new_tensor` always copies :attr:`data`. If you have a Tensor
+    ``data`` and want to avoid a copy, use :func:`torch.Tensor.requires_grad_`
+    or :func:`torch.Tensor.detach`.
+    If you have a numpy array and want to avoid a copy, use
+    :func:`torch.from_numpy`.
+
+.. warning::
+
+    When data is a tensor `x`, :func:`new_tensor()` reads out 'the data' from whatever it is passed,
+    and constructs a leaf variable. Therefore ``tensor.new_tensor(x)`` is equivalent to ``x.clone().detach()``
+    and ``tensor.new_tensor(x, requires_grad=True)`` is equivalent to ``x.clone().detach().requires_grad_(True)``.
+    The equivalents using ``clone()`` and ``detach()`` are recommended.
+
+Args:
+    data (array_like): The returned Tensor copies :attr:`data`.
+
+Keyword args:
+    {dtype}
+    {device}
+    {requires_grad}
+    {layout}
+    {pin_memory}
+
+Example::
+
+    >>> tensor = torch.ones((2,), dtype=torch.int8)
+    >>> data = [[0, 1], [2, 3]]
+    >>> tensor.new_tensor(data)
+    tensor([[ 0,  1],
+            [ 2,  3]], dtype=torch.int8)
+
+""".format(
+        **new_common_args
+    ),
+)
+
+add_docstr_all(
+    "new_full",
+    """
+new_full(size, fill_value, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \
+pin_memory=False) -> Tensor
+"""
+    + r"""
+
+Returns a Tensor of size :attr:`size` filled with :attr:`fill_value`.
+By default, the returned Tensor has the same :class:`torch.dtype` and
+:class:`torch.device` as this tensor.
+
+Args:
+    fill_value (scalar): the number to fill the output tensor with.
+
+Keyword args:
+    {dtype}
+    {device}
+    {requires_grad}
+    {layout}
+    {pin_memory}
+
+Example::
+
+    >>> tensor = torch.ones((2,), dtype=torch.float64)
+    >>> tensor.new_full((3, 4), 3.141592)
+    tensor([[ 3.1416,  3.1416,  3.1416,  3.1416],
+            [ 3.1416,  3.1416,  3.1416,  3.1416],
+            [ 3.1416,  3.1416,  3.1416,  3.1416]], dtype=torch.float64)
+
+""".format(
+        **new_common_args
+    ),
+)
+
+add_docstr_all(
+    "new_empty",
+    """
+new_empty(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \
+pin_memory=False) -> Tensor
+"""
+    + r"""
+
+Returns a Tensor of size :attr:`size` filled with uninitialized data.
+By default, the returned Tensor has the same :class:`torch.dtype` and
+:class:`torch.device` as this tensor.
+
+Args:
+    size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+        shape of the output tensor.
+
+Keyword args:
+    {dtype}
+    {device}
+    {requires_grad}
+    {layout}
+    {pin_memory}
+
+Example::
+
+    >>> tensor = torch.ones(())
+    >>> tensor.new_empty((2, 3))
+    tensor([[ 5.8182e-18,  4.5765e-41, -1.0545e+30],
+            [ 3.0949e-41,  4.4842e-44,  0.0000e+00]])
+
+""".format(
+        **new_common_args
+    ),
+)
+
+add_docstr_all(
+    "new_empty_strided",
+    """
+new_empty_strided(size, stride, dtype=None, device=None, requires_grad=False, layout=torch.strided, \
+pin_memory=False) -> Tensor
+"""
+    + r"""
+
+Returns a Tensor of size :attr:`size` and strides :attr:`stride` filled with
+uninitialized data. By default, the returned Tensor has the same
+:class:`torch.dtype` and :class:`torch.device` as this tensor.
+
+Args:
+    size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+        shape of the output tensor.
+
+Keyword args:
+    {dtype}
+    {device}
+    {requires_grad}
+    {layout}
+    {pin_memory}
+
+Example::
+
+    >>> tensor = torch.ones(())
+    >>> tensor.new_empty_strided((2, 3), (3, 1))
+    tensor([[ 5.8182e-18,  4.5765e-41, -1.0545e+30],
+            [ 3.0949e-41,  4.4842e-44,  0.0000e+00]])
+
+""".format(
+        **new_common_args
+    ),
+)
+
+add_docstr_all(
+    "new_ones",
+    """
+new_ones(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \
+pin_memory=False) -> Tensor
+"""
+    + r"""
+
+Returns a Tensor of size :attr:`size` filled with ``1``.
+By default, the returned Tensor has the same :class:`torch.dtype` and
+:class:`torch.device` as this tensor.
+
+Args:
+    size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+        shape of the output tensor.
+
+Keyword args:
+    {dtype}
+    {device}
+    {requires_grad}
+    {layout}
+    {pin_memory}
+
+Example::
+
+    >>> tensor = torch.tensor((), dtype=torch.int32)
+    >>> tensor.new_ones((2, 3))
+    tensor([[ 1,  1,  1],
+            [ 1,  1,  1]], dtype=torch.int32)
+
+""".format(
+        **new_common_args
+    ),
+)
+
+add_docstr_all(
+    "new_zeros",
+    """
+new_zeros(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \
+pin_memory=False) -> Tensor
+"""
+    + r"""
+
+Returns a Tensor of size :attr:`size` filled with ``0``.
+By default, the returned Tensor has the same :class:`torch.dtype` and
+:class:`torch.device` as this tensor.
+
+Args:
+    size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+        shape of the output tensor.
+
+Keyword args:
+    {dtype}
+    {device}
+    {requires_grad}
+    {layout}
+    {pin_memory}
+
+Example::
+
+    >>> tensor = torch.tensor((), dtype=torch.float64)
+    >>> tensor.new_zeros((2, 3))
+    tensor([[ 0.,  0.,  0.],
+            [ 0.,  0.,  0.]], dtype=torch.float64)
+
+""".format(
+        **new_common_args
+    ),
+)
+
+add_docstr_all(
+    "abs",
+    r"""
+abs() -> Tensor
+
+See :func:`torch.abs`
+""",
+)
+
+add_docstr_all(
+    "abs_",
+    r"""
+abs_() -> Tensor
+
+In-place version of :meth:`~Tensor.abs`
+""",
+)
+
+add_docstr_all(
+    "absolute",
+    r"""
+absolute() -> Tensor
+
+Alias for :func:`abs`
+""",
+)
+
+add_docstr_all(
+    "absolute_",
+    r"""
+absolute_() -> Tensor
+
+In-place version of :meth:`~Tensor.absolute`
+Alias for :func:`abs_`
+""",
+)
+
+add_docstr_all(
+    "acos",
+    r"""
+acos() -> Tensor
+
+See :func:`torch.acos`
+""",
+)
+
+add_docstr_all(
+    "acos_",
+    r"""
+acos_() -> Tensor
+
+In-place version of :meth:`~Tensor.acos`
+""",
+)
+
+add_docstr_all(
+    "arccos",
+    r"""
+arccos() -> Tensor
+
+See :func:`torch.arccos`
+""",
+)
+
+add_docstr_all(
+    "arccos_",
+    r"""
+arccos_() -> Tensor
+
+In-place version of :meth:`~Tensor.arccos`
+""",
+)
+
+add_docstr_all(
+    "acosh",
+    r"""
+acosh() -> Tensor
+
+See :func:`torch.acosh`
+""",
+)
+
+add_docstr_all(
+    "acosh_",
+    r"""
+acosh_() -> Tensor
+
+In-place version of :meth:`~Tensor.acosh`
+""",
+)
+
+add_docstr_all(
+    "arccosh",
+    r"""
+arccosh() -> Tensor
+
+See :func:`torch.arccosh`
+""",
+)
+
+add_docstr_all(
+    "arccosh_",
+    r"""
+arccosh_() -> Tensor
+
+In-place version of :meth:`~Tensor.arccosh`
+""",
+)
+
+add_docstr_all(
+    "add",
+    r"""
+add(other, *, alpha=1) -> Tensor
+
+Add a scalar or tensor to :attr:`self` tensor. If both :attr:`alpha`
+and :attr:`other` are specified, each element of :attr:`other` is scaled by
+:attr:`alpha` before being used.
+
+When :attr:`other` is a tensor, the shape of :attr:`other` must be
+:ref:`broadcastable <broadcasting-semantics>` with the shape of the underlying
+tensor
+
+See :func:`torch.add`
+""",
+)
+
+add_docstr_all(
+    "add_",
+    r"""
+add_(other, *, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.add`
+""",
+)
+
+add_docstr_all(
+    "addbmm",
+    r"""
+addbmm(batch1, batch2, *, beta=1, alpha=1) -> Tensor
+
+See :func:`torch.addbmm`
+""",
+)
+
+add_docstr_all(
+    "addbmm_",
+    r"""
+addbmm_(batch1, batch2, *, beta=1, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.addbmm`
+""",
+)
+
+add_docstr_all(
+    "addcdiv",
+    r"""
+addcdiv(tensor1, tensor2, *, value=1) -> Tensor
+
+See :func:`torch.addcdiv`
+""",
+)
+
+add_docstr_all(
+    "addcdiv_",
+    r"""
+addcdiv_(tensor1, tensor2, *, value=1) -> Tensor
+
+In-place version of :meth:`~Tensor.addcdiv`
+""",
+)
+
+add_docstr_all(
+    "addcmul",
+    r"""
+addcmul(tensor1, tensor2, *, value=1) -> Tensor
+
+See :func:`torch.addcmul`
+""",
+)
+
+add_docstr_all(
+    "addcmul_",
+    r"""
+addcmul_(tensor1, tensor2, *, value=1) -> Tensor
+
+In-place version of :meth:`~Tensor.addcmul`
+""",
+)
+
+add_docstr_all(
+    "addmm",
+    r"""
+addmm(mat1, mat2, *, beta=1, alpha=1) -> Tensor
+
+See :func:`torch.addmm`
+""",
+)
+
+add_docstr_all(
+    "addmm_",
+    r"""
+addmm_(mat1, mat2, *, beta=1, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.addmm`
+""",
+)
+
+add_docstr_all(
+    "addmv",
+    r"""
+addmv(mat, vec, *, beta=1, alpha=1) -> Tensor
+
+See :func:`torch.addmv`
+""",
+)
+
+add_docstr_all(
+    "addmv_",
+    r"""
+addmv_(mat, vec, *, beta=1, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.addmv`
+""",
+)
+
+add_docstr_all(
+    "sspaddmm",
+    r"""
+sspaddmm(mat1, mat2, *, beta=1, alpha=1) -> Tensor
+
+See :func:`torch.sspaddmm`
+""",
+)
+
+add_docstr_all(
+    "smm",
+    r"""
+smm(mat) -> Tensor
+
+See :func:`torch.smm`
+""",
+)
+
+add_docstr_all(
+    "addr",
+    r"""
+addr(vec1, vec2, *, beta=1, alpha=1) -> Tensor
+
+See :func:`torch.addr`
+""",
+)
+
+add_docstr_all(
+    "addr_",
+    r"""
+addr_(vec1, vec2, *, beta=1, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.addr`
+""",
+)
+
+add_docstr_all(
+    "align_as",
+    r"""
+align_as(other) -> Tensor
+
+Permutes the dimensions of the :attr:`self` tensor to match the dimension order
+in the :attr:`other` tensor, adding size-one dims for any new names.
+
+This operation is useful for explicit broadcasting by names (see examples).
+
+All of the dims of :attr:`self` must be named in order to use this method.
+The resulting tensor is a view on the original tensor.
+
+All dimension names of :attr:`self` must be present in ``other.names``.
+:attr:`other` may contain named dimensions that are not in ``self.names``;
+the output tensor has a size-one dimension for each of those new names.
+
+To align a tensor to a specific order, use :meth:`~Tensor.align_to`.
+
+Examples::
+
+    # Example 1: Applying a mask
+    >>> mask = torch.randint(2, [127, 128], dtype=torch.bool).refine_names('W', 'H')
+    >>> imgs = torch.randn(32, 128, 127, 3, names=('N', 'H', 'W', 'C'))
+    >>> imgs.masked_fill_(mask.align_as(imgs), 0)
+
+
+    # Example 2: Applying a per-channel-scale
+    >>> def scale_channels(input, scale):
+    >>>    scale = scale.refine_names('C')
+    >>>    return input * scale.align_as(input)
+
+    >>> num_channels = 3
+    >>> scale = torch.randn(num_channels, names=('C',))
+    >>> imgs = torch.rand(32, 128, 128, num_channels, names=('N', 'H', 'W', 'C'))
+    >>> more_imgs = torch.rand(32, num_channels, 128, 128, names=('N', 'C', 'H', 'W'))
+    >>> videos = torch.randn(3, num_channels, 128, 128, 128, names=('N', 'C', 'H', 'W', 'D'))
+
+    # scale_channels is agnostic to the dimension order of the input
+    >>> scale_channels(imgs, scale)
+    >>> scale_channels(more_imgs, scale)
+    >>> scale_channels(videos, scale)
+
+.. warning::
+    The named tensor API is experimental and subject to change.
+
+""",
+)
+
+add_docstr_all(
+    "all",
+    r"""
+all(dim=None, keepdim=False) -> Tensor
+
+See :func:`torch.all`
+""",
+)
+
+add_docstr_all(
+    "allclose",
+    r"""
+allclose(other, rtol=1e-05, atol=1e-08, equal_nan=False) -> bool
+
+See :func:`torch.allclose`
+""",
+)
+
+add_docstr_all(
+    "angle",
+    r"""
+angle() -> Tensor
+
+See :func:`torch.angle`
+""",
+)
+
+add_docstr_all(
+    "any",
+    r"""
+any(dim=None, keepdim=False) -> Tensor
+
+See :func:`torch.any`
+""",
+)
+
+add_docstr_all(
+    "apply_",
+    r"""
+apply_(callable) -> Tensor
+
+Applies the function :attr:`callable` to each element in the tensor, replacing
+each element with the value returned by :attr:`callable`.
+
+.. note::
+
+    This function only works with CPU tensors and should not be used in code
+    sections that require high performance.
+""",
+)
+
+add_docstr_all(
+    "asin",
+    r"""
+asin() -> Tensor
+
+See :func:`torch.asin`
+""",
+)
+
+add_docstr_all(
+    "asin_",
+    r"""
+asin_() -> Tensor
+
+In-place version of :meth:`~Tensor.asin`
+""",
+)
+
+add_docstr_all(
+    "arcsin",
+    r"""
+arcsin() -> Tensor
+
+See :func:`torch.arcsin`
+""",
+)
+
+add_docstr_all(
+    "arcsin_",
+    r"""
+arcsin_() -> Tensor
+
+In-place version of :meth:`~Tensor.arcsin`
+""",
+)
+
+add_docstr_all(
+    "asinh",
+    r"""
+asinh() -> Tensor
+
+See :func:`torch.asinh`
+""",
+)
+
+add_docstr_all(
+    "asinh_",
+    r"""
+asinh_() -> Tensor
+
+In-place version of :meth:`~Tensor.asinh`
+""",
+)
+
+add_docstr_all(
+    "arcsinh",
+    r"""
+arcsinh() -> Tensor
+
+See :func:`torch.arcsinh`
+""",
+)
+
+add_docstr_all(
+    "arcsinh_",
+    r"""
+arcsinh_() -> Tensor
+
+In-place version of :meth:`~Tensor.arcsinh`
+""",
+)
+
+add_docstr_all(
+    "as_strided",
+    r"""
+as_strided(size, stride, storage_offset=None) -> Tensor
+
+See :func:`torch.as_strided`
+""",
+)
+
+add_docstr_all(
+    "as_strided_",
+    r"""
+as_strided_(size, stride, storage_offset=None) -> Tensor
+
+In-place version of :meth:`~Tensor.as_strided`
+""",
+)
+
+add_docstr_all(
+    "atan",
+    r"""
+atan() -> Tensor
+
+See :func:`torch.atan`
+""",
+)
+
+add_docstr_all(
+    "atan_",
+    r"""
+atan_() -> Tensor
+
+In-place version of :meth:`~Tensor.atan`
+""",
+)
+
+add_docstr_all(
+    "arctan",
+    r"""
+arctan() -> Tensor
+
+See :func:`torch.arctan`
+""",
+)
+
+add_docstr_all(
+    "arctan_",
+    r"""
+arctan_() -> Tensor
+
+In-place version of :meth:`~Tensor.arctan`
+""",
+)
+
+add_docstr_all(
+    "atan2",
+    r"""
+atan2(other) -> Tensor
+
+See :func:`torch.atan2`
+""",
+)
+
+add_docstr_all(
+    "atan2_",
+    r"""
+atan2_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.atan2`
+""",
+)
+
+add_docstr_all(
+    "arctan2",
+    r"""
+arctan2(other) -> Tensor
+
+See :func:`torch.arctan2`
+""",
+)
+
+add_docstr_all(
+    "arctan2_",
+    r"""
+arctan2_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.arctan2`
+""",
+)
+
+add_docstr_all(
+    "atanh",
+    r"""
+atanh() -> Tensor
+
+See :func:`torch.atanh`
+""",
+)
+
+add_docstr_all(
+    "atanh_",
+    r"""
+atanh_() -> Tensor
+
+In-place version of :meth:`~Tensor.atanh`
+""",
+)
+
+add_docstr_all(
+    "arctanh",
+    r"""
+arctanh() -> Tensor
+
+See :func:`torch.arctanh`
+""",
+)
+
+add_docstr_all(
+    "arctanh_",
+    r"""
+arctanh_() -> Tensor
+
+In-place version of :meth:`~Tensor.arctanh`
+""",
+)
+
+add_docstr_all(
+    "baddbmm",
+    r"""
+baddbmm(batch1, batch2, *, beta=1, alpha=1) -> Tensor
+
+See :func:`torch.baddbmm`
+""",
+)
+
+add_docstr_all(
+    "baddbmm_",
+    r"""
+baddbmm_(batch1, batch2, *, beta=1, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.baddbmm`
+""",
+)
+
+add_docstr_all(
+    "bernoulli",
+    r"""
+bernoulli(*, generator=None) -> Tensor
+
+Returns a result tensor where each :math:`\texttt{result[i]}` is independently
+sampled from :math:`\text{Bernoulli}(\texttt{self[i]})`. :attr:`self` must have
+floating point ``dtype``, and the result will have the same ``dtype``.
+
+See :func:`torch.bernoulli`
+""",
+)
+
+add_docstr_all(
+    "bernoulli_",
+    r"""
+bernoulli_(p=0.5, *, generator=None) -> Tensor
+
+Fills each location of :attr:`self` with an independent sample from
+:math:`\text{Bernoulli}(\texttt{p})`. :attr:`self` can have integral
+``dtype``.
+
+:attr:`p` should either be a scalar or tensor containing probabilities to be
+used for drawing the binary random number.
+
+If it is a tensor, the :math:`\text{i}^{th}` element of :attr:`self` tensor
+will be set to a value sampled from
+:math:`\text{Bernoulli}(\texttt{p\_tensor[i]})`. In this case `p` must have
+floating point ``dtype``.
+
+See also :meth:`~Tensor.bernoulli` and :func:`torch.bernoulli`
+""",
+)
+
+add_docstr_all(
+    "bincount",
+    r"""
+bincount(weights=None, minlength=0) -> Tensor
+
+See :func:`torch.bincount`
+""",
+)
+
+add_docstr_all(
+    "bitwise_not",
+    r"""
+bitwise_not() -> Tensor
+
+See :func:`torch.bitwise_not`
+""",
+)
+
+add_docstr_all(
+    "bitwise_not_",
+    r"""
+bitwise_not_() -> Tensor
+
+In-place version of :meth:`~Tensor.bitwise_not`
+""",
+)
+
+add_docstr_all(
+    "bitwise_and",
+    r"""
+bitwise_and(other) -> Tensor
+
+See :func:`torch.bitwise_and`
+""",
+)
+
+add_docstr_all(
+    "bitwise_and_",
+    r"""
+bitwise_and_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.bitwise_and`
+""",
+)
+
+add_docstr_all(
+    "bitwise_or",
+    r"""
+bitwise_or(other) -> Tensor
+
+See :func:`torch.bitwise_or`
+""",
+)
+
+add_docstr_all(
+    "bitwise_or_",
+    r"""
+bitwise_or_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.bitwise_or`
+""",
+)
+
+add_docstr_all(
+    "bitwise_xor",
+    r"""
+bitwise_xor(other) -> Tensor
+
+See :func:`torch.bitwise_xor`
+""",
+)
+
+add_docstr_all(
+    "bitwise_xor_",
+    r"""
+bitwise_xor_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.bitwise_xor`
+""",
+)
+
+add_docstr_all(
+    "bitwise_left_shift",
+    r"""
+bitwise_left_shift(other) -> Tensor
+
+See :func:`torch.bitwise_left_shift`
+""",
+)
+
+add_docstr_all(
+    "bitwise_left_shift_",
+    r"""
+bitwise_left_shift_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.bitwise_left_shift`
+""",
+)
+
+add_docstr_all(
+    "bitwise_right_shift",
+    r"""
+bitwise_right_shift(other) -> Tensor
+
+See :func:`torch.bitwise_right_shift`
+""",
+)
+
+add_docstr_all(
+    "bitwise_right_shift_",
+    r"""
+bitwise_right_shift_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.bitwise_right_shift`
+""",
+)
+
+add_docstr_all(
+    "broadcast_to",
+    r"""
+broadcast_to(shape) -> Tensor
+
+See :func:`torch.broadcast_to`.
+""",
+)
+
+add_docstr_all(
+    "logical_and",
+    r"""
+logical_and(other) -> Tensor
+
+See :func:`torch.logical_and`
+""",
+)
+
+add_docstr_all(
+    "logical_and_",
+    r"""
+logical_and_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.logical_and`
+""",
+)
+
+add_docstr_all(
+    "logical_not",
+    r"""
+logical_not() -> Tensor
+
+See :func:`torch.logical_not`
+""",
+)
+
+add_docstr_all(
+    "logical_not_",
+    r"""
+logical_not_() -> Tensor
+
+In-place version of :meth:`~Tensor.logical_not`
+""",
+)
+
+add_docstr_all(
+    "logical_or",
+    r"""
+logical_or(other) -> Tensor
+
+See :func:`torch.logical_or`
+""",
+)
+
+add_docstr_all(
+    "logical_or_",
+    r"""
+logical_or_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.logical_or`
+""",
+)
+
+add_docstr_all(
+    "logical_xor",
+    r"""
+logical_xor(other) -> Tensor
+
+See :func:`torch.logical_xor`
+""",
+)
+
+add_docstr_all(
+    "logical_xor_",
+    r"""
+logical_xor_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.logical_xor`
+""",
+)
+
+add_docstr_all(
+    "bmm",
+    r"""
+bmm(batch2) -> Tensor
+
+See :func:`torch.bmm`
+""",
+)
+
+add_docstr_all(
+    "cauchy_",
+    r"""
+cauchy_(median=0, sigma=1, *, generator=None) -> Tensor
+
+Fills the tensor with numbers drawn from the Cauchy distribution:
+
+.. math::
+
+    f(x) = \dfrac{1}{\pi} \dfrac{\sigma}{(x - \text{median})^2 + \sigma^2}
+
+.. note::
+  Sigma (:math:`\sigma`) is used to denote the scale parameter in Cauchy distribution.
+""",
+)
+
+add_docstr_all(
+    "ceil",
+    r"""
+ceil() -> Tensor
+
+See :func:`torch.ceil`
+""",
+)
+
+add_docstr_all(
+    "ceil_",
+    r"""
+ceil_() -> Tensor
+
+In-place version of :meth:`~Tensor.ceil`
+""",
+)
+
+add_docstr_all(
+    "cholesky",
+    r"""
+cholesky(upper=False) -> Tensor
+
+See :func:`torch.cholesky`
+""",
+)
+
+add_docstr_all(
+    "cholesky_solve",
+    r"""
+cholesky_solve(input2, upper=False) -> Tensor
+
+See :func:`torch.cholesky_solve`
+""",
+)
+
+add_docstr_all(
+    "cholesky_inverse",
+    r"""
+cholesky_inverse(upper=False) -> Tensor
+
+See :func:`torch.cholesky_inverse`
+""",
+)
+
+add_docstr_all(
+    "clamp",
+    r"""
+clamp(min=None, max=None) -> Tensor
+
+See :func:`torch.clamp`
+""",
+)
+
+add_docstr_all(
+    "clamp_",
+    r"""
+clamp_(min=None, max=None) -> Tensor
+
+In-place version of :meth:`~Tensor.clamp`
+""",
+)
+
+add_docstr_all(
+    "clip",
+    r"""
+clip(min=None, max=None) -> Tensor
+
+Alias for :meth:`~Tensor.clamp`.
+""",
+)
+
+add_docstr_all(
+    "clip_",
+    r"""
+clip_(min=None, max=None) -> Tensor
+
+Alias for :meth:`~Tensor.clamp_`.
+""",
+)
+
+add_docstr_all(
+    "clone",
+    r"""
+clone(*, memory_format=torch.preserve_format) -> Tensor
+
+See :func:`torch.clone`
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "coalesce",
+    r"""
+coalesce() -> Tensor
+
+Returns a coalesced copy of :attr:`self` if :attr:`self` is an
+:ref:`uncoalesced tensor <sparse-uncoalesced-coo-docs>`.
+
+Returns :attr:`self` if :attr:`self` is a coalesced tensor.
+
+.. warning::
+  Throws an error if :attr:`self` is not a sparse COO tensor.
+""",
+)
+
+add_docstr_all(
+    "contiguous",
+    r"""
+contiguous(memory_format=torch.contiguous_format) -> Tensor
+
+Returns a contiguous in memory tensor containing the same data as :attr:`self` tensor. If
+:attr:`self` tensor is already in the specified memory format, this function returns the
+:attr:`self` tensor.
+
+Args:
+    memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+        returned Tensor. Default: ``torch.contiguous_format``.
+""",
+)
+
+add_docstr_all(
+    "copy_",
+    r"""
+copy_(src, non_blocking=False) -> Tensor
+
+Copies the elements from :attr:`src` into :attr:`self` tensor and returns
+:attr:`self`.
+
+The :attr:`src` tensor must be :ref:`broadcastable <broadcasting-semantics>`
+with the :attr:`self` tensor. It may be of a different data type or reside on a
+different device.
+
+Args:
+    src (Tensor): the source tensor to copy from
+    non_blocking (bool): if ``True`` and this copy is between CPU and GPU,
+        the copy may occur asynchronously with respect to the host. For other
+        cases, this argument has no effect.
+""",
+)
+
+add_docstr_all(
+    "conj",
+    r"""
+conj() -> Tensor
+
+See :func:`torch.conj`
+""",
+)
+
+add_docstr_all(
+    "conj_physical",
+    r"""
+conj_physical() -> Tensor
+
+See :func:`torch.conj_physical`
+""",
+)
+
+add_docstr_all(
+    "conj_physical_",
+    r"""
+conj_physical_() -> Tensor
+
+In-place version of :meth:`~Tensor.conj_physical`
+""",
+)
+
+add_docstr_all(
+    "resolve_conj",
+    r"""
+resolve_conj() -> Tensor
+
+See :func:`torch.resolve_conj`
+""",
+)
+
+add_docstr_all(
+    "resolve_neg",
+    r"""
+resolve_neg() -> Tensor
+
+See :func:`torch.resolve_neg`
+""",
+)
+
+add_docstr_all(
+    "copysign",
+    r"""
+copysign(other) -> Tensor
+
+See :func:`torch.copysign`
+""",
+)
+
+add_docstr_all(
+    "copysign_",
+    r"""
+copysign_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.copysign`
+""",
+)
+
+add_docstr_all(
+    "cos",
+    r"""
+cos() -> Tensor
+
+See :func:`torch.cos`
+""",
+)
+
+add_docstr_all(
+    "cos_",
+    r"""
+cos_() -> Tensor
+
+In-place version of :meth:`~Tensor.cos`
+""",
+)
+
+add_docstr_all(
+    "cosh",
+    r"""
+cosh() -> Tensor
+
+See :func:`torch.cosh`
+""",
+)
+
+add_docstr_all(
+    "cosh_",
+    r"""
+cosh_() -> Tensor
+
+In-place version of :meth:`~Tensor.cosh`
+""",
+)
+
+add_docstr_all(
+    "cpu",
+    r"""
+cpu(memory_format=torch.preserve_format) -> Tensor
+
+Returns a copy of this object in CPU memory.
+
+If this object is already in CPU memory and on the correct device,
+then no copy is performed and the original object is returned.
+
+Args:
+    {memory_format}
+
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "count_nonzero",
+    r"""
+count_nonzero(dim=None) -> Tensor
+
+See :func:`torch.count_nonzero`
+""",
+)
+
+add_docstr_all(
+    "cov",
+    r"""
+cov(*, correction=1, fweights=None, aweights=None) -> Tensor
+
+See :func:`torch.cov`
+""",
+)
+
+add_docstr_all(
+    "corrcoef",
+    r"""
+corrcoef() -> Tensor
+
+See :func:`torch.corrcoef`
+""",
+)
+
+add_docstr_all(
+    "cross",
+    r"""
+cross(other, dim=None) -> Tensor
+
+See :func:`torch.cross`
+""",
+)
+
+add_docstr_all(
+    "cuda",
+    r"""
+cuda(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor
+
+Returns a copy of this object in CUDA memory.
+
+If this object is already in CUDA memory and on the correct device,
+then no copy is performed and the original object is returned.
+
+Args:
+    device (:class:`torch.device`): The destination GPU device.
+        Defaults to the current CUDA device.
+    non_blocking (bool): If ``True`` and the source is in pinned memory,
+        the copy will be asynchronous with respect to the host.
+        Otherwise, the argument has no effect. Default: ``False``.
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "ipu",
+    r"""
+ipu(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor
+
+Returns a copy of this object in IPU memory.
+
+If this object is already in IPU memory and on the correct device,
+then no copy is performed and the original object is returned.
+
+Args:
+    device (:class:`torch.device`): The destination IPU device.
+        Defaults to the current IPU device.
+    non_blocking (bool): If ``True`` and the source is in pinned memory,
+        the copy will be asynchronous with respect to the host.
+        Otherwise, the argument has no effect. Default: ``False``.
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "xpu",
+    r"""
+xpu(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor
+
+Returns a copy of this object in XPU memory.
+
+If this object is already in XPU memory and on the correct device,
+then no copy is performed and the original object is returned.
+
+Args:
+    device (:class:`torch.device`): The destination XPU device.
+        Defaults to the current XPU device.
+    non_blocking (bool): If ``True`` and the source is in pinned memory,
+        the copy will be asynchronous with respect to the host.
+        Otherwise, the argument has no effect. Default: ``False``.
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "logcumsumexp",
+    r"""
+logcumsumexp(dim) -> Tensor
+
+See :func:`torch.logcumsumexp`
+""",
+)
+
+add_docstr_all(
+    "cummax",
+    r"""
+cummax(dim) -> (Tensor, Tensor)
+
+See :func:`torch.cummax`
+""",
+)
+
+add_docstr_all(
+    "cummin",
+    r"""
+cummin(dim) -> (Tensor, Tensor)
+
+See :func:`torch.cummin`
+""",
+)
+
+add_docstr_all(
+    "cumprod",
+    r"""
+cumprod(dim, dtype=None) -> Tensor
+
+See :func:`torch.cumprod`
+""",
+)
+
+add_docstr_all(
+    "cumprod_",
+    r"""
+cumprod_(dim, dtype=None) -> Tensor
+
+In-place version of :meth:`~Tensor.cumprod`
+""",
+)
+
+add_docstr_all(
+    "cumsum",
+    r"""
+cumsum(dim, dtype=None) -> Tensor
+
+See :func:`torch.cumsum`
+""",
+)
+
+add_docstr_all(
+    "cumsum_",
+    r"""
+cumsum_(dim, dtype=None) -> Tensor
+
+In-place version of :meth:`~Tensor.cumsum`
+""",
+)
+
+add_docstr_all(
+    "data_ptr",
+    r"""
+data_ptr() -> int
+
+Returns the address of the first element of :attr:`self` tensor.
+""",
+)
+
+add_docstr_all(
+    "dequantize",
+    r"""
+dequantize() -> Tensor
+
+Given a quantized Tensor, dequantize it and return the dequantized float Tensor.
+""",
+)
+
+add_docstr_all(
+    "dense_dim",
+    r"""
+dense_dim() -> int
+
+Return the number of dense dimensions in a :ref:`sparse tensor <sparse-docs>` :attr:`self`.
+
+.. note::
+  Returns ``len(self.shape)`` if :attr:`self` is not a sparse tensor.
+
+See also :meth:`Tensor.sparse_dim` and :ref:`hybrid tensors <sparse-hybrid-coo-docs>`.
+""",
+)
+
+add_docstr_all(
+    "diag",
+    r"""
+diag(diagonal=0) -> Tensor
+
+See :func:`torch.diag`
+""",
+)
+
+add_docstr_all(
+    "diag_embed",
+    r"""
+diag_embed(offset=0, dim1=-2, dim2=-1) -> Tensor
+
+See :func:`torch.diag_embed`
+""",
+)
+
+add_docstr_all(
+    "diagflat",
+    r"""
+diagflat(offset=0) -> Tensor
+
+See :func:`torch.diagflat`
+""",
+)
+
+add_docstr_all(
+    "diagonal",
+    r"""
+diagonal(offset=0, dim1=0, dim2=1) -> Tensor
+
+See :func:`torch.diagonal`
+""",
+)
+
+add_docstr_all(
+    "diagonal_scatter",
+    r"""
+diagonal_scatter(src, offset=0, dim1=0, dim2=1) -> Tensor
+
+See :func:`torch.diagonal_scatter`
+""",
+)
+
+add_docstr_all(
+    "as_strided_scatter",
+    r"""
+as_strided_scatter(src, size, stride, storage_offset=None) -> Tensor
+
+See :func:`torch.as_strided_scatter`
+""",
+)
+
+add_docstr_all(
+    "fill_diagonal_",
+    r"""
+fill_diagonal_(fill_value, wrap=False) -> Tensor
+
+Fill the main diagonal of a tensor that has at least 2-dimensions.
+When dims>2, all dimensions of input must be of equal length.
+This function modifies the input tensor in-place, and returns the input tensor.
+
+Arguments:
+    fill_value (Scalar): the fill value
+    wrap (bool): if ``True``, the diagonal is 'wrapped' after N columns for tall matrices.
+
+Example::
+
+    >>> a = torch.zeros(3, 3)
+    >>> a.fill_diagonal_(5)
+    tensor([[5., 0., 0.],
+            [0., 5., 0.],
+            [0., 0., 5.]])
+    >>> b = torch.zeros(7, 3)
+    >>> b.fill_diagonal_(5)
+    tensor([[5., 0., 0.],
+            [0., 5., 0.],
+            [0., 0., 5.],
+            [0., 0., 0.],
+            [0., 0., 0.],
+            [0., 0., 0.],
+            [0., 0., 0.]])
+    >>> c = torch.zeros(7, 3)
+    >>> c.fill_diagonal_(5, wrap=True)
+    tensor([[5., 0., 0.],
+            [0., 5., 0.],
+            [0., 0., 5.],
+            [0., 0., 0.],
+            [5., 0., 0.],
+            [0., 5., 0.],
+            [0., 0., 5.]])
+
+""",
+)
+
+add_docstr_all(
+    "floor_divide",
+    r"""
+floor_divide(value) -> Tensor
+
+See :func:`torch.floor_divide`
+""",
+)
+
+add_docstr_all(
+    "floor_divide_",
+    r"""
+floor_divide_(value) -> Tensor
+
+In-place version of :meth:`~Tensor.floor_divide`
+""",
+)
+
+add_docstr_all(
+    "diff",
+    r"""
+diff(n=1, dim=-1, prepend=None, append=None) -> Tensor
+
+See :func:`torch.diff`
+""",
+)
+
+add_docstr_all(
+    "digamma",
+    r"""
+digamma() -> Tensor
+
+See :func:`torch.digamma`
+""",
+)
+
+add_docstr_all(
+    "digamma_",
+    r"""
+digamma_() -> Tensor
+
+In-place version of :meth:`~Tensor.digamma`
+""",
+)
+
+add_docstr_all(
+    "dim",
+    r"""
+dim() -> int
+
+Returns the number of dimensions of :attr:`self` tensor.
+""",
+)
+
+add_docstr_all(
+    "dist",
+    r"""
+dist(other, p=2) -> Tensor
+
+See :func:`torch.dist`
+""",
+)
+
+add_docstr_all(
+    "div",
+    r"""
+div(value, *, rounding_mode=None) -> Tensor
+
+See :func:`torch.div`
+""",
+)
+
+add_docstr_all(
+    "div_",
+    r"""
+div_(value, *, rounding_mode=None) -> Tensor
+
+In-place version of :meth:`~Tensor.div`
+""",
+)
+
+add_docstr_all(
+    "divide",
+    r"""
+divide(value, *, rounding_mode=None) -> Tensor
+
+See :func:`torch.divide`
+""",
+)
+
+add_docstr_all(
+    "divide_",
+    r"""
+divide_(value, *, rounding_mode=None) -> Tensor
+
+In-place version of :meth:`~Tensor.divide`
+""",
+)
+
+add_docstr_all(
+    "dot",
+    r"""
+dot(other) -> Tensor
+
+See :func:`torch.dot`
+""",
+)
+
+add_docstr_all(
+    "element_size",
+    r"""
+element_size() -> int
+
+Returns the size in bytes of an individual element.
+
+Example::
+
+    >>> torch.tensor([]).element_size()
+    4
+    >>> torch.tensor([], dtype=torch.uint8).element_size()
+    1
+
+""",
+)
+
+add_docstr_all(
+    "eq",
+    r"""
+eq(other) -> Tensor
+
+See :func:`torch.eq`
+""",
+)
+
+add_docstr_all(
+    "eq_",
+    r"""
+eq_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.eq`
+""",
+)
+
+add_docstr_all(
+    "equal",
+    r"""
+equal(other) -> bool
+
+See :func:`torch.equal`
+""",
+)
+
+add_docstr_all(
+    "erf",
+    r"""
+erf() -> Tensor
+
+See :func:`torch.erf`
+""",
+)
+
+add_docstr_all(
+    "erf_",
+    r"""
+erf_() -> Tensor
+
+In-place version of :meth:`~Tensor.erf`
+""",
+)
+
+add_docstr_all(
+    "erfc",
+    r"""
+erfc() -> Tensor
+
+See :func:`torch.erfc`
+""",
+)
+
+add_docstr_all(
+    "erfc_",
+    r"""
+erfc_() -> Tensor
+
+In-place version of :meth:`~Tensor.erfc`
+""",
+)
+
+add_docstr_all(
+    "erfinv",
+    r"""
+erfinv() -> Tensor
+
+See :func:`torch.erfinv`
+""",
+)
+
+add_docstr_all(
+    "erfinv_",
+    r"""
+erfinv_() -> Tensor
+
+In-place version of :meth:`~Tensor.erfinv`
+""",
+)
+
+add_docstr_all(
+    "exp",
+    r"""
+exp() -> Tensor
+
+See :func:`torch.exp`
+""",
+)
+
+add_docstr_all(
+    "exp_",
+    r"""
+exp_() -> Tensor
+
+In-place version of :meth:`~Tensor.exp`
+""",
+)
+
+add_docstr_all(
+    "exp2",
+    r"""
+exp2() -> Tensor
+
+See :func:`torch.exp2`
+""",
+)
+
+add_docstr_all(
+    "exp2_",
+    r"""
+exp2_() -> Tensor
+
+In-place version of :meth:`~Tensor.exp2`
+""",
+)
+
+add_docstr_all(
+    "expm1",
+    r"""
+expm1() -> Tensor
+
+See :func:`torch.expm1`
+""",
+)
+
+add_docstr_all(
+    "expm1_",
+    r"""
+expm1_() -> Tensor
+
+In-place version of :meth:`~Tensor.expm1`
+""",
+)
+
+add_docstr_all(
+    "exponential_",
+    r"""
+exponential_(lambd=1, *, generator=None) -> Tensor
+
+Fills :attr:`self` tensor with elements drawn from the PDF (probability density function):
+
+.. math::
+
+    f(x) = \lambda e^{-\lambda x}, x > 0
+
+.. note::
+  In probability theory, the exponential distribution is supported on the interval [0, :math:`\infty`) (i.e., :math:`x \geq 0`),
+  implying that zero can be sampled from the exponential distribution.
+  However, :func:`torch.Tensor.exponential_` does not sample zero,
+  which means that its actual support is the interval (0, :math:`\infty`).
+
+  Note that :func:`torch.distributions.exponential.Exponential` is supported on the interval [0, :math:`\infty`) and can sample zero.
+""",
+)
+
+add_docstr_all(
+    "fill_",
+    r"""
+fill_(value) -> Tensor
+
+Fills :attr:`self` tensor with the specified value.
+""",
+)
+
+add_docstr_all(
+    "floor",
+    r"""
+floor() -> Tensor
+
+See :func:`torch.floor`
+""",
+)
+
+add_docstr_all(
+    "flip",
+    r"""
+flip(dims) -> Tensor
+
+See :func:`torch.flip`
+""",
+)
+
+add_docstr_all(
+    "fliplr",
+    r"""
+fliplr() -> Tensor
+
+See :func:`torch.fliplr`
+""",
+)
+
+add_docstr_all(
+    "flipud",
+    r"""
+flipud() -> Tensor
+
+See :func:`torch.flipud`
+""",
+)
+
+add_docstr_all(
+    "roll",
+    r"""
+roll(shifts, dims) -> Tensor
+
+See :func:`torch.roll`
+""",
+)
+
+add_docstr_all(
+    "floor_",
+    r"""
+floor_() -> Tensor
+
+In-place version of :meth:`~Tensor.floor`
+""",
+)
+
+add_docstr_all(
+    "fmod",
+    r"""
+fmod(divisor) -> Tensor
+
+See :func:`torch.fmod`
+""",
+)
+
+add_docstr_all(
+    "fmod_",
+    r"""
+fmod_(divisor) -> Tensor
+
+In-place version of :meth:`~Tensor.fmod`
+""",
+)
+
+add_docstr_all(
+    "frac",
+    r"""
+frac() -> Tensor
+
+See :func:`torch.frac`
+""",
+)
+
+add_docstr_all(
+    "frac_",
+    r"""
+frac_() -> Tensor
+
+In-place version of :meth:`~Tensor.frac`
+""",
+)
+
+add_docstr_all(
+    "frexp",
+    r"""
+frexp() -> (Tensor mantissa, Tensor exponent)
+
+See :func:`torch.frexp`
+""",
+)
+
+add_docstr_all(
+    "flatten",
+    r"""
+flatten(start_dim=0, end_dim=-1) -> Tensor
+
+See :func:`torch.flatten`
+""",
+)
+
+add_docstr_all(
+    "gather",
+    r"""
+gather(dim, index) -> Tensor
+
+See :func:`torch.gather`
+""",
+)
+
+add_docstr_all(
+    "gcd",
+    r"""
+gcd(other) -> Tensor
+
+See :func:`torch.gcd`
+""",
+)
+
+add_docstr_all(
+    "gcd_",
+    r"""
+gcd_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.gcd`
+""",
+)
+
+add_docstr_all(
+    "ge",
+    r"""
+ge(other) -> Tensor
+
+See :func:`torch.ge`.
+""",
+)
+
+add_docstr_all(
+    "ge_",
+    r"""
+ge_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.ge`.
+""",
+)
+
+add_docstr_all(
+    "greater_equal",
+    r"""
+greater_equal(other) -> Tensor
+
+See :func:`torch.greater_equal`.
+""",
+)
+
+add_docstr_all(
+    "greater_equal_",
+    r"""
+greater_equal_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.greater_equal`.
+""",
+)
+
+add_docstr_all(
+    "geometric_",
+    r"""
+geometric_(p, *, generator=None) -> Tensor
+
+Fills :attr:`self` tensor with elements drawn from the geometric distribution:
+
+.. math::
+
+    P(X=k) = (1 - p)^{k - 1} p, k = 1, 2, ...
+
+.. note::
+  For :func:`torch.Tensor.geometric_`, the `k`-th trial is the first success, hence it draws samples in :math:`\{1, 2, \ldots\}`, whereas
+  for :func:`torch.distributions.geometric.Geometric`, the :math:`(k+1)`-th trial is the first success,
+  hence it draws samples in :math:`\{0, 1, \ldots\}`.
+""",
+)
+
+add_docstr_all(
+    "geqrf",
+    r"""
+geqrf() -> (Tensor, Tensor)
+
+See :func:`torch.geqrf`
+""",
+)
+
+add_docstr_all(
+    "ger",
+    r"""
+ger(vec2) -> Tensor
+
+See :func:`torch.ger`
+""",
+)
+
+add_docstr_all(
+    "inner",
+    r"""
+inner(other) -> Tensor
+
+See :func:`torch.inner`.
+""",
+)
+
+add_docstr_all(
+    "outer",
+    r"""
+outer(vec2) -> Tensor
+
+See :func:`torch.outer`.
+""",
+)
+
+add_docstr_all(
+    "hypot",
+    r"""
+hypot(other) -> Tensor
+
+See :func:`torch.hypot`
+""",
+)
+
+add_docstr_all(
+    "hypot_",
+    r"""
+hypot_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.hypot`
+""",
+)
+
+add_docstr_all(
+    "i0",
+    r"""
+i0() -> Tensor
+
+See :func:`torch.i0`
+""",
+)
+
+add_docstr_all(
+    "i0_",
+    r"""
+i0_() -> Tensor
+
+In-place version of :meth:`~Tensor.i0`
+""",
+)
+
+add_docstr_all(
+    "igamma",
+    r"""
+igamma(other) -> Tensor
+
+See :func:`torch.igamma`
+""",
+)
+
+add_docstr_all(
+    "igamma_",
+    r"""
+igamma_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.igamma`
+""",
+)
+
+add_docstr_all(
+    "igammac",
+    r"""
+igammac(other) -> Tensor
+See :func:`torch.igammac`
+""",
+)
+
+add_docstr_all(
+    "igammac_",
+    r"""
+igammac_(other) -> Tensor
+In-place version of :meth:`~Tensor.igammac`
+""",
+)
+
+add_docstr_all(
+    "indices",
+    r"""
+indices() -> Tensor
+
+Return the indices tensor of a :ref:`sparse COO tensor <sparse-coo-docs>`.
+
+.. warning::
+  Throws an error if :attr:`self` is not a sparse COO tensor.
+
+See also :meth:`Tensor.values`.
+
+.. note::
+  This method can only be called on a coalesced sparse tensor. See
+  :meth:`Tensor.coalesce` for details.
+""",
+)
+
+add_docstr_all(
+    "get_device",
+    r"""
+get_device() -> Device ordinal (Integer)
+
+For CUDA tensors, this function returns the device ordinal of the GPU on which the tensor resides.
+For CPU tensors, this function returns `-1`.
+
+Example::
+
+    >>> x = torch.randn(3, 4, 5, device='cuda:0')
+    >>> x.get_device()
+    0
+    >>> x.cpu().get_device()
+    -1
+""",
+)
+
+add_docstr_all(
+    "values",
+    r"""
+values() -> Tensor
+
+Return the values tensor of a :ref:`sparse COO tensor <sparse-coo-docs>`.
+
+.. warning::
+  Throws an error if :attr:`self` is not a sparse COO tensor.
+
+See also :meth:`Tensor.indices`.
+
+.. note::
+  This method can only be called on a coalesced sparse tensor. See
+  :meth:`Tensor.coalesce` for details.
+""",
+)
+
+add_docstr_all(
+    "gt",
+    r"""
+gt(other) -> Tensor
+
+See :func:`torch.gt`.
+""",
+)
+
+add_docstr_all(
+    "gt_",
+    r"""
+gt_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.gt`.
+""",
+)
+
+add_docstr_all(
+    "greater",
+    r"""
+greater(other) -> Tensor
+
+See :func:`torch.greater`.
+""",
+)
+
+add_docstr_all(
+    "greater_",
+    r"""
+greater_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.greater`.
+""",
+)
+
+add_docstr_all(
+    "has_names",
+    r"""
+Is ``True`` if any of this tensor's dimensions are named. Otherwise, is ``False``.
+""",
+)
+
+add_docstr_all(
+    "hardshrink",
+    r"""
+hardshrink(lambd=0.5) -> Tensor
+
+See :func:`torch.nn.functional.hardshrink`
+""",
+)
+
+add_docstr_all(
+    "heaviside",
+    r"""
+heaviside(values) -> Tensor
+
+See :func:`torch.heaviside`
+""",
+)
+
+add_docstr_all(
+    "heaviside_",
+    r"""
+heaviside_(values) -> Tensor
+
+In-place version of :meth:`~Tensor.heaviside`
+""",
+)
+
+add_docstr_all(
+    "histc",
+    r"""
+histc(bins=100, min=0, max=0) -> Tensor
+
+See :func:`torch.histc`
+""",
+)
+
+add_docstr_all(
+    "histogram",
+    r"""
+histogram(bins, *, range=None, weight=None, density=False) -> (Tensor, Tensor)
+
+See :func:`torch.histogram`
+""",
+)
+
+add_docstr_all(
+    "index_add_",
+    r"""
+index_add_(dim, index, source, *, alpha=1) -> Tensor
+
+Accumulate the elements of :attr:`alpha` times ``source`` into the :attr:`self`
+tensor by adding to the indices in the order given in :attr:`index`. For example,
+if ``dim == 0``, ``index[i] == j``, and ``alpha=-1``, then the ``i``\ th row of
+``source`` is subtracted from the ``j``\ th row of :attr:`self`.
+
+The :attr:`dim`\ th dimension of ``source`` must have the same size as the
+length of :attr:`index` (which must be a vector), and all other dimensions must
+match :attr:`self`, or an error will be raised.
+
+For a 3-D tensor the output is given as::
+
+    self[index[i], :, :] += alpha * src[i, :, :]  # if dim == 0
+    self[:, index[i], :] += alpha * src[:, i, :]  # if dim == 1
+    self[:, :, index[i]] += alpha * src[:, :, i]  # if dim == 2
+
+Note:
+    {forward_reproducibility_note}
+
+Args:
+    dim (int): dimension along which to index
+    index (Tensor): indices of ``source`` to select from,
+            should have dtype either `torch.int64` or `torch.int32`
+    source (Tensor): the tensor containing values to add
+
+Keyword args:
+    alpha (Number): the scalar multiplier for ``source``
+
+Example::
+
+    >>> x = torch.ones(5, 3)
+    >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float)
+    >>> index = torch.tensor([0, 4, 2])
+    >>> x.index_add_(0, index, t)
+    tensor([[  2.,   3.,   4.],
+            [  1.,   1.,   1.],
+            [  8.,   9.,  10.],
+            [  1.,   1.,   1.],
+            [  5.,   6.,   7.]])
+    >>> x.index_add_(0, index, t, alpha=-1)
+    tensor([[  1.,   1.,   1.],
+            [  1.,   1.,   1.],
+            [  1.,   1.,   1.],
+            [  1.,   1.,   1.],
+            [  1.,   1.,   1.]])
+""".format(
+        **reproducibility_notes
+    ),
+)
+
+add_docstr_all(
+    "index_copy_",
+    r"""
+index_copy_(dim, index, tensor) -> Tensor
+
+Copies the elements of :attr:`tensor` into the :attr:`self` tensor by selecting
+the indices in the order given in :attr:`index`. For example, if ``dim == 0``
+and ``index[i] == j``, then the ``i``\ th row of :attr:`tensor` is copied to the
+``j``\ th row of :attr:`self`.
+
+The :attr:`dim`\ th dimension of :attr:`tensor` must have the same size as the
+length of :attr:`index` (which must be a vector), and all other dimensions must
+match :attr:`self`, or an error will be raised.
+
+.. note::
+    If :attr:`index` contains duplicate entries, multiple elements from
+    :attr:`tensor` will be copied to the same index of :attr:`self`. The result
+    is nondeterministic since it depends on which copy occurs last.
+
+Args:
+    dim (int): dimension along which to index
+    index (LongTensor): indices of :attr:`tensor` to select from
+    tensor (Tensor): the tensor containing values to copy
+
+Example::
+
+    >>> x = torch.zeros(5, 3)
+    >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float)
+    >>> index = torch.tensor([0, 4, 2])
+    >>> x.index_copy_(0, index, t)
+    tensor([[ 1.,  2.,  3.],
+            [ 0.,  0.,  0.],
+            [ 7.,  8.,  9.],
+            [ 0.,  0.,  0.],
+            [ 4.,  5.,  6.]])
+""",
+)
+
+add_docstr_all(
+    "index_fill_",
+    r"""
+index_fill_(dim, index, value) -> Tensor
+
+Fills the elements of the :attr:`self` tensor with value :attr:`value` by
+selecting the indices in the order given in :attr:`index`.
+
+Args:
+    dim (int): dimension along which to index
+    index (LongTensor): indices of :attr:`self` tensor to fill in
+    value (float): the value to fill with
+
+Example::
+    >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float)
+    >>> index = torch.tensor([0, 2])
+    >>> x.index_fill_(1, index, -1)
+    tensor([[-1.,  2., -1.],
+            [-1.,  5., -1.],
+            [-1.,  8., -1.]])
+""",
+)
+
+add_docstr_all(
+    "index_put_",
+    r"""
+index_put_(indices, values, accumulate=False) -> Tensor
+
+Puts values from the tensor :attr:`values` into the tensor :attr:`self` using
+the indices specified in :attr:`indices` (which is a tuple of Tensors). The
+expression ``tensor.index_put_(indices, values)`` is equivalent to
+``tensor[indices] = values``. Returns :attr:`self`.
+
+If :attr:`accumulate` is ``True``, the elements in :attr:`values` are added to
+:attr:`self`. If accumulate is ``False``, the behavior is undefined if indices
+contain duplicate elements.
+
+Args:
+    indices (tuple of LongTensor): tensors used to index into `self`.
+    values (Tensor): tensor of same dtype as `self`.
+    accumulate (bool): whether to accumulate into self
+""",
+)
+
+add_docstr_all(
+    "index_put",
+    r"""
+index_put(indices, values, accumulate=False) -> Tensor
+
+Out-of-place version of :meth:`~Tensor.index_put_`.
+""",
+)
+
+add_docstr_all(
+    "index_reduce_",
+    r"""
+index_reduce_(dim, index, source, reduce, *, include_self=True) -> Tensor
+
+Accumulate the elements of ``source`` into the :attr:`self`
+tensor by accumulating to the indices in the order given in :attr:`index`
+using the reduction given by the ``reduce`` argument. For example, if ``dim == 0``,
+``index[i] == j``, ``reduce == prod`` and ``include_self == True`` then the ``i``\ th
+row of ``source`` is multiplied by the ``j``\ th row of :attr:`self`. If
+:obj:`include_self=True`, the values in the :attr:`self` tensor are included
+in the reduction, otherwise, rows in the :attr:`self` tensor that are accumulated
+to are treated as if they were filled with the reduction identities.
+
+The :attr:`dim`\ th dimension of ``source`` must have the same size as the
+length of :attr:`index` (which must be a vector), and all other dimensions must
+match :attr:`self`, or an error will be raised.
+
+For a 3-D tensor with :obj:`reduce="prod"` and :obj:`include_self=True` the
+output is given as::
+
+    self[index[i], :, :] *= src[i, :, :]  # if dim == 0
+    self[:, index[i], :] *= src[:, i, :]  # if dim == 1
+    self[:, :, index[i]] *= src[:, :, i]  # if dim == 2
+
+Note:
+    {forward_reproducibility_note}
+
+.. note::
+
+    This function only supports floating point tensors.
+
+.. warning::
+
+    This function is in beta and may change in the near future.
+
+Args:
+    dim (int): dimension along which to index
+    index (Tensor): indices of ``source`` to select from,
+        should have dtype either `torch.int64` or `torch.int32`
+    source (FloatTensor): the tensor containing values to accumulate
+    reduce (str): the reduction operation to apply
+        (:obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`)
+
+Keyword args:
+    include_self (bool): whether the elements from the ``self`` tensor are
+        included in the reduction
+
+Example::
+
+    >>> x = torch.empty(5, 3).fill_(2)
+    >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=torch.float)
+    >>> index = torch.tensor([0, 4, 2, 0])
+    >>> x.index_reduce_(0, index, t, 'prod')
+    tensor([[20., 44., 72.],
+            [ 2.,  2.,  2.],
+            [14., 16., 18.],
+            [ 2.,  2.,  2.],
+            [ 8., 10., 12.]])
+    >>> x = torch.empty(5, 3).fill_(2)
+    >>> x.index_reduce_(0, index, t, 'prod', include_self=False)
+    tensor([[10., 22., 36.],
+            [ 2.,  2.,  2.],
+            [ 7.,  8.,  9.],
+            [ 2.,  2.,  2.],
+            [ 4.,  5.,  6.]])
+""".format(
+        **reproducibility_notes
+    ),
+)
+
+add_docstr_all(
+    "index_select",
+    r"""
+index_select(dim, index) -> Tensor
+
+See :func:`torch.index_select`
+""",
+)
+
+add_docstr_all(
+    "sparse_mask",
+    r"""
+sparse_mask(mask) -> Tensor
+
+Returns a new :ref:`sparse tensor <sparse-docs>` with values from a
+strided tensor :attr:`self` filtered by the indices of the sparse
+tensor :attr:`mask`. The values of :attr:`mask` sparse tensor are
+ignored. :attr:`self` and :attr:`mask` tensors must have the same
+shape.
+
+.. note::
+
+  The returned sparse tensor might contain duplicate values if :attr:`mask`
+  is not coalesced. It is therefore advisable to pass ``mask.coalesce()``
+  if such behavior is not desired.
+
+.. note::
+
+  The returned sparse tensor has the same indices as the sparse tensor
+  :attr:`mask`, even when the corresponding values in :attr:`self` are
+  zeros.
+
+Args:
+    mask (Tensor): a sparse tensor whose indices are used as a filter
+
+Example::
+
+    >>> nse = 5
+    >>> dims = (5, 5, 2, 2)
+    >>> I = torch.cat([torch.randint(0, dims[0], size=(nse,)),
+    ...                torch.randint(0, dims[1], size=(nse,))], 0).reshape(2, nse)
+    >>> V = torch.randn(nse, dims[2], dims[3])
+    >>> S = torch.sparse_coo_tensor(I, V, dims).coalesce()
+    >>> D = torch.randn(dims)
+    >>> D.sparse_mask(S)
+    tensor(indices=tensor([[0, 0, 0, 2],
+                           [0, 1, 4, 3]]),
+           values=tensor([[[ 1.6550,  0.2397],
+                           [-0.1611, -0.0779]],
+
+                          [[ 0.2326, -1.0558],
+                           [ 1.4711,  1.9678]],
+
+                          [[-0.5138, -0.0411],
+                           [ 1.9417,  0.5158]],
+
+                          [[ 0.0793,  0.0036],
+                           [-0.2569, -0.1055]]]),
+           size=(5, 5, 2, 2), nnz=4, layout=torch.sparse_coo)
+""",
+)
+
+add_docstr_all(
+    "inverse",
+    r"""
+inverse() -> Tensor
+
+See :func:`torch.inverse`
+""",
+)
+
+add_docstr_all(
+    "isnan",
+    r"""
+isnan() -> Tensor
+
+See :func:`torch.isnan`
+""",
+)
+
+add_docstr_all(
+    "isinf",
+    r"""
+isinf() -> Tensor
+
+See :func:`torch.isinf`
+""",
+)
+
+add_docstr_all(
+    "isposinf",
+    r"""
+isposinf() -> Tensor
+
+See :func:`torch.isposinf`
+""",
+)
+
+add_docstr_all(
+    "isneginf",
+    r"""
+isneginf() -> Tensor
+
+See :func:`torch.isneginf`
+""",
+)
+
+add_docstr_all(
+    "isfinite",
+    r"""
+isfinite() -> Tensor
+
+See :func:`torch.isfinite`
+""",
+)
+
+add_docstr_all(
+    "isclose",
+    r"""
+isclose(other, rtol=1e-05, atol=1e-08, equal_nan=False) -> Tensor
+
+See :func:`torch.isclose`
+""",
+)
+
+add_docstr_all(
+    "isreal",
+    r"""
+isreal() -> Tensor
+
+See :func:`torch.isreal`
+""",
+)
+
+add_docstr_all(
+    "is_coalesced",
+    r"""
+is_coalesced() -> bool
+
+Returns ``True`` if :attr:`self` is a :ref:`sparse COO tensor
+<sparse-coo-docs>` that is coalesced, ``False`` otherwise.
+
+.. warning::
+  Throws an error if :attr:`self` is not a sparse COO tensor.
+
+See :meth:`coalesce` and :ref:`uncoalesced tensors <sparse-uncoalesced-coo-docs>`.
+""",
+)
+
+add_docstr_all(
+    "is_contiguous",
+    r"""
+is_contiguous(memory_format=torch.contiguous_format) -> bool
+
+Returns True if :attr:`self` tensor is contiguous in memory in the order specified
+by memory format.
+
+Args:
+    memory_format (:class:`torch.memory_format`, optional): Specifies memory allocation
+        order. Default: ``torch.contiguous_format``.
+""",
+)
+
+add_docstr_all(
+    "is_pinned",
+    r"""
+Returns true if this tensor resides in pinned memory.
+""",
+)
+
+add_docstr_all(
+    "is_floating_point",
+    r"""
+is_floating_point() -> bool
+
+Returns True if the data type of :attr:`self` is a floating point data type.
+""",
+)
+
+add_docstr_all(
+    "is_complex",
+    r"""
+is_complex() -> bool
+
+Returns True if the data type of :attr:`self` is a complex data type.
+""",
+)
+
+add_docstr_all(
+    "is_inference",
+    r"""
+is_inference() -> bool
+
+See :func:`torch.is_inference`
+""",
+)
+
+add_docstr_all(
+    "is_conj",
+    r"""
+is_conj() -> bool
+
+Returns True if the conjugate bit of :attr:`self` is set to true.
+""",
+)
+
+add_docstr_all(
+    "is_neg",
+    r"""
+is_neg() -> bool
+
+Returns True if the negative bit of :attr:`self` is set to true.
+""",
+)
+
+add_docstr_all(
+    "is_signed",
+    r"""
+is_signed() -> bool
+
+Returns True if the data type of :attr:`self` is a signed data type.
+""",
+)
+
+add_docstr_all(
+    "is_set_to",
+    r"""
+is_set_to(tensor) -> bool
+
+Returns True if both tensors are pointing to the exact same memory (same
+storage, offset, size and stride).
+""",
+)
+
+add_docstr_all(
+    "item",
+    r"""
+item() -> number
+
+Returns the value of this tensor as a standard Python number. This only works
+for tensors with one element. For other cases, see :meth:`~Tensor.tolist`.
+
+This operation is not differentiable.
+
+Example::
+
+    >>> x = torch.tensor([1.0])
+    >>> x.item()
+    1.0
+
+""",
+)
+
+add_docstr_all(
+    "kron",
+    r"""
+kron(other) -> Tensor
+
+See :func:`torch.kron`
+""",
+)
+
+add_docstr_all(
+    "kthvalue",
+    r"""
+kthvalue(k, dim=None, keepdim=False) -> (Tensor, LongTensor)
+
+See :func:`torch.kthvalue`
+""",
+)
+
+add_docstr_all(
+    "ldexp",
+    r"""
+ldexp(other) -> Tensor
+
+See :func:`torch.ldexp`
+""",
+)
+
+add_docstr_all(
+    "ldexp_",
+    r"""
+ldexp_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.ldexp`
+""",
+)
+
+add_docstr_all(
+    "lcm",
+    r"""
+lcm(other) -> Tensor
+
+See :func:`torch.lcm`
+""",
+)
+
+add_docstr_all(
+    "lcm_",
+    r"""
+lcm_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.lcm`
+""",
+)
+
+add_docstr_all(
+    "le",
+    r"""
+le(other) -> Tensor
+
+See :func:`torch.le`.
+""",
+)
+
+add_docstr_all(
+    "le_",
+    r"""
+le_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.le`.
+""",
+)
+
+add_docstr_all(
+    "less_equal",
+    r"""
+less_equal(other) -> Tensor
+
+See :func:`torch.less_equal`.
+""",
+)
+
+add_docstr_all(
+    "less_equal_",
+    r"""
+less_equal_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.less_equal`.
+""",
+)
+
+add_docstr_all(
+    "lerp",
+    r"""
+lerp(end, weight) -> Tensor
+
+See :func:`torch.lerp`
+""",
+)
+
+add_docstr_all(
+    "lerp_",
+    r"""
+lerp_(end, weight) -> Tensor
+
+In-place version of :meth:`~Tensor.lerp`
+""",
+)
+
+add_docstr_all(
+    "lgamma",
+    r"""
+lgamma() -> Tensor
+
+See :func:`torch.lgamma`
+""",
+)
+
+add_docstr_all(
+    "lgamma_",
+    r"""
+lgamma_() -> Tensor
+
+In-place version of :meth:`~Tensor.lgamma`
+""",
+)
+
+add_docstr_all(
+    "log",
+    r"""
+log() -> Tensor
+
+See :func:`torch.log`
+""",
+)
+
+add_docstr_all(
+    "log_",
+    r"""
+log_() -> Tensor
+
+In-place version of :meth:`~Tensor.log`
+""",
+)
+
+add_docstr_all(
+    "log10",
+    r"""
+log10() -> Tensor
+
+See :func:`torch.log10`
+""",
+)
+
+add_docstr_all(
+    "log10_",
+    r"""
+log10_() -> Tensor
+
+In-place version of :meth:`~Tensor.log10`
+""",
+)
+
+add_docstr_all(
+    "log1p",
+    r"""
+log1p() -> Tensor
+
+See :func:`torch.log1p`
+""",
+)
+
+add_docstr_all(
+    "log1p_",
+    r"""
+log1p_() -> Tensor
+
+In-place version of :meth:`~Tensor.log1p`
+""",
+)
+
+add_docstr_all(
+    "log2",
+    r"""
+log2() -> Tensor
+
+See :func:`torch.log2`
+""",
+)
+
+add_docstr_all(
+    "log2_",
+    r"""
+log2_() -> Tensor
+
+In-place version of :meth:`~Tensor.log2`
+""",
+)
+
+add_docstr_all(
+    "logaddexp",
+    r"""
+logaddexp(other) -> Tensor
+
+See :func:`torch.logaddexp`
+""",
+)
+
+add_docstr_all(
+    "logaddexp2",
+    r"""
+logaddexp2(other) -> Tensor
+
+See :func:`torch.logaddexp2`
+""",
+)
+
+add_docstr_all(
+    "log_normal_",
+    r"""
+log_normal_(mean=1, std=2, *, generator=None)
+
+Fills :attr:`self` tensor with numbers sampled from the log-normal distribution
+parameterized by the given mean :math:`\mu` and standard deviation
+:math:`\sigma`. Note that :attr:`mean` and :attr:`std` are the mean and
+standard deviation of the underlying normal distribution, and not of the
+returned distribution:
+
+.. math::
+
+    f(x) = \dfrac{1}{x \sigma \sqrt{2\pi}}\ e^{-\frac{(\ln x - \mu)^2}{2\sigma^2}}
+""",
+)
+
+add_docstr_all(
+    "logsumexp",
+    r"""
+logsumexp(dim, keepdim=False) -> Tensor
+
+See :func:`torch.logsumexp`
+""",
+)
+
+add_docstr_all(
+    "lt",
+    r"""
+lt(other) -> Tensor
+
+See :func:`torch.lt`.
+""",
+)
+
+add_docstr_all(
+    "lt_",
+    r"""
+lt_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.lt`.
+""",
+)
+
+add_docstr_all(
+    "less",
+    r"""
+less(other) -> Tensor
+
+See :func:`torch.less`.
+""",
+)
+
+add_docstr_all(
+    "less_",
+    r"""
+less_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.less`.
+""",
+)
+
+add_docstr_all(
+    "lu_solve",
+    r"""
+lu_solve(LU_data, LU_pivots) -> Tensor
+
+See :func:`torch.lu_solve`
+""",
+)
+
+add_docstr_all(
+    "map_",
+    r"""
+map_(tensor, callable)
+
+Applies :attr:`callable` for each element in :attr:`self` tensor and the given
+:attr:`tensor` and stores the results in :attr:`self` tensor. :attr:`self` tensor and
+the given :attr:`tensor` must be :ref:`broadcastable <broadcasting-semantics>`.
+
+The :attr:`callable` should have the signature::
+
+    def callable(a, b) -> number
+""",
+)
+
+add_docstr_all(
+    "masked_scatter_",
+    r"""
+masked_scatter_(mask, source)
+
+Copies elements from :attr:`source` into :attr:`self` tensor at positions where
+the :attr:`mask` is True. Elements from :attr:`source` are copied into :attr:`self`
+starting at position 0 of :attr:`source` and continuing in order one-by-one for each
+occurrence of :attr:`mask` being True.
+The shape of :attr:`mask` must be :ref:`broadcastable <broadcasting-semantics>`
+with the shape of the underlying tensor. The :attr:`source` should have at least
+as many elements as the number of ones in :attr:`mask`.
+
+Args:
+    mask (BoolTensor): the boolean mask
+    source (Tensor): the tensor to copy from
+
+.. note::
+
+    The :attr:`mask` operates on the :attr:`self` tensor, not on the given
+    :attr:`source` tensor.
+
+Example::
+
+    >>> self = torch.tensor([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])
+    >>> mask = torch.tensor([[0, 0, 0, 1, 1], [1, 1, 0, 1, 1]])
+    >>> source = torch.tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])
+    >>> self.masked_scatter_(mask, source)
+    tensor([[0, 0, 0, 0, 1],
+            [2, 3, 0, 4, 5]])
+
+""",
+)
+
+add_docstr_all(
+    "masked_fill_",
+    r"""
+masked_fill_(mask, value)
+
+Fills elements of :attr:`self` tensor with :attr:`value` where :attr:`mask` is
+True. The shape of :attr:`mask` must be
+:ref:`broadcastable <broadcasting-semantics>` with the shape of the underlying
+tensor.
+
+Args:
+    mask (BoolTensor): the boolean mask
+    value (float): the value to fill in with
+""",
+)
+
+add_docstr_all(
+    "masked_select",
+    r"""
+masked_select(mask) -> Tensor
+
+See :func:`torch.masked_select`
+""",
+)
+
+add_docstr_all(
+    "matrix_power",
+    r"""
+matrix_power(n) -> Tensor
+
+.. note:: :meth:`~Tensor.matrix_power` is deprecated, use :func:`torch.linalg.matrix_power` instead.
+
+Alias for :func:`torch.linalg.matrix_power`
+""",
+)
+
+add_docstr_all(
+    "matrix_exp",
+    r"""
+matrix_exp() -> Tensor
+
+See :func:`torch.matrix_exp`
+""",
+)
+
+add_docstr_all(
+    "max",
+    r"""
+max(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor)
+
+See :func:`torch.max`
+""",
+)
+
+add_docstr_all(
+    "amax",
+    r"""
+amax(dim=None, keepdim=False) -> Tensor
+
+See :func:`torch.amax`
+""",
+)
+
+add_docstr_all(
+    "maximum",
+    r"""
+maximum(other) -> Tensor
+
+See :func:`torch.maximum`
+""",
+)
+
+add_docstr_all(
+    "fmax",
+    r"""
+fmax(other) -> Tensor
+
+See :func:`torch.fmax`
+""",
+)
+
+add_docstr_all(
+    "argmax",
+    r"""
+argmax(dim=None, keepdim=False) -> LongTensor
+
+See :func:`torch.argmax`
+""",
+)
+
+add_docstr_all(
+    "argwhere",
+    r"""
+argwhere() -> Tensor
+
+See :func:`torch.argwhere`
+""",
+)
+
+add_docstr_all(
+    "mean",
+    r"""
+mean(dim=None, keepdim=False, *, dtype=None) -> Tensor
+
+See :func:`torch.mean`
+""",
+)
+
+add_docstr_all(
+    "nanmean",
+    r"""
+nanmean(dim=None, keepdim=False, *, dtype=None) -> Tensor
+
+See :func:`torch.nanmean`
+""",
+)
+
+add_docstr_all(
+    "median",
+    r"""
+median(dim=None, keepdim=False) -> (Tensor, LongTensor)
+
+See :func:`torch.median`
+""",
+)
+
+add_docstr_all(
+    "nanmedian",
+    r"""
+nanmedian(dim=None, keepdim=False) -> (Tensor, LongTensor)
+
+See :func:`torch.nanmedian`
+""",
+)
+
+add_docstr_all(
+    "min",
+    r"""
+min(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor)
+
+See :func:`torch.min`
+""",
+)
+
+add_docstr_all(
+    "amin",
+    r"""
+amin(dim=None, keepdim=False) -> Tensor
+
+See :func:`torch.amin`
+""",
+)
+
+add_docstr_all(
+    "minimum",
+    r"""
+minimum(other) -> Tensor
+
+See :func:`torch.minimum`
+""",
+)
+
+add_docstr_all(
+    "aminmax",
+    r"""
+aminmax(*, dim=None, keepdim=False) -> (Tensor min, Tensor max)
+
+See :func:`torch.aminmax`
+""",
+)
+
+add_docstr_all(
+    "fmin",
+    r"""
+fmin(other) -> Tensor
+
+See :func:`torch.fmin`
+""",
+)
+
+add_docstr_all(
+    "argmin",
+    r"""
+argmin(dim=None, keepdim=False) -> LongTensor
+
+See :func:`torch.argmin`
+""",
+)
+
+add_docstr_all(
+    "mm",
+    r"""
+mm(mat2) -> Tensor
+
+See :func:`torch.mm`
+""",
+)
+
+add_docstr_all(
+    "mode",
+    r"""
+mode(dim=None, keepdim=False) -> (Tensor, LongTensor)
+
+See :func:`torch.mode`
+""",
+)
+
+add_docstr_all(
+    "movedim",
+    r"""
+movedim(source, destination) -> Tensor
+
+See :func:`torch.movedim`
+""",
+)
+
+add_docstr_all(
+    "moveaxis",
+    r"""
+moveaxis(source, destination) -> Tensor
+
+See :func:`torch.moveaxis`
+""",
+)
+
+add_docstr_all(
+    "mul",
+    r"""
+mul(value) -> Tensor
+
+See :func:`torch.mul`.
+""",
+)
+
+add_docstr_all(
+    "mul_",
+    r"""
+mul_(value) -> Tensor
+
+In-place version of :meth:`~Tensor.mul`.
+""",
+)
+
+add_docstr_all(
+    "multiply",
+    r"""
+multiply(value) -> Tensor
+
+See :func:`torch.multiply`.
+""",
+)
+
+add_docstr_all(
+    "multiply_",
+    r"""
+multiply_(value) -> Tensor
+
+In-place version of :meth:`~Tensor.multiply`.
+""",
+)
+
+add_docstr_all(
+    "multinomial",
+    r"""
+multinomial(num_samples, replacement=False, *, generator=None) -> Tensor
+
+See :func:`torch.multinomial`
+""",
+)
+
+add_docstr_all(
+    "mv",
+    r"""
+mv(vec) -> Tensor
+
+See :func:`torch.mv`
+""",
+)
+
+add_docstr_all(
+    "mvlgamma",
+    r"""
+mvlgamma(p) -> Tensor
+
+See :func:`torch.mvlgamma`
+""",
+)
+
+add_docstr_all(
+    "mvlgamma_",
+    r"""
+mvlgamma_(p) -> Tensor
+
+In-place version of :meth:`~Tensor.mvlgamma`
+""",
+)
+
+add_docstr_all(
+    "narrow",
+    r"""
+narrow(dimension, start, length) -> Tensor
+
+See :func:`torch.narrow`.
+""",
+)
+
+add_docstr_all(
+    "narrow_copy",
+    r"""
+narrow_copy(dimension, start, length) -> Tensor
+
+See :func:`torch.narrow_copy`.
+""",
+)
+
+add_docstr_all(
+    "ndimension",
+    r"""
+ndimension() -> int
+
+Alias for :meth:`~Tensor.dim`
+""",
+)
+
+add_docstr_all(
+    "nan_to_num",
+    r"""
+nan_to_num(nan=0.0, posinf=None, neginf=None) -> Tensor
+
+See :func:`torch.nan_to_num`.
+""",
+)
+
+add_docstr_all(
+    "nan_to_num_",
+    r"""
+nan_to_num_(nan=0.0, posinf=None, neginf=None) -> Tensor
+
+In-place version of :meth:`~Tensor.nan_to_num`.
+""",
+)
+
+add_docstr_all(
+    "ne",
+    r"""
+ne(other) -> Tensor
+
+See :func:`torch.ne`.
+""",
+)
+
+add_docstr_all(
+    "ne_",
+    r"""
+ne_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.ne`.
+""",
+)
+
+add_docstr_all(
+    "not_equal",
+    r"""
+not_equal(other) -> Tensor
+
+See :func:`torch.not_equal`.
+""",
+)
+
+add_docstr_all(
+    "not_equal_",
+    r"""
+not_equal_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.not_equal`.
+""",
+)
+
+add_docstr_all(
+    "neg",
+    r"""
+neg() -> Tensor
+
+See :func:`torch.neg`
+""",
+)
+
+add_docstr_all(
+    "negative",
+    r"""
+negative() -> Tensor
+
+See :func:`torch.negative`
+""",
+)
+
+add_docstr_all(
+    "neg_",
+    r"""
+neg_() -> Tensor
+
+In-place version of :meth:`~Tensor.neg`
+""",
+)
+
+add_docstr_all(
+    "negative_",
+    r"""
+negative_() -> Tensor
+
+In-place version of :meth:`~Tensor.negative`
+""",
+)
+
+add_docstr_all(
+    "nelement",
+    r"""
+nelement() -> int
+
+Alias for :meth:`~Tensor.numel`
+""",
+)
+
+add_docstr_all(
+    "nextafter",
+    r"""
+nextafter(other) -> Tensor
+See :func:`torch.nextafter`
+""",
+)
+
+add_docstr_all(
+    "nextafter_",
+    r"""
+nextafter_(other) -> Tensor
+In-place version of :meth:`~Tensor.nextafter`
+""",
+)
+
+add_docstr_all(
+    "nonzero",
+    r"""
+nonzero() -> LongTensor
+
+See :func:`torch.nonzero`
+""",
+)
+
+add_docstr_all(
+    "nonzero_static",
+    r"""
+nonzero_static(input, *, size, fill_value=-1) -> Tensor
+
+Returns a 2-D tensor where each row is the index for a non-zero value.
+The returned Tensor has the same `torch.dtype` as `torch.nonzero()`.
+
+Args:
+    input (Tensor): the input tensor to count non-zero elements.
+
+Keyword args:
+    size (int): the size of non-zero elements expected to be included in the out
+        tensor. Pad the out tensor with `fill_value` if the `size` is larger
+        than total number of non-zero elements, truncate out tensor if `size`
+        is smaller. The size must be a non-negative integer.
+    fill_value (int): the value to fill the output tensor with when `size` is larger
+        than the total number of non-zero elements. Default is `-1` to represent
+        invalid index.
+
+Example::
+
+    # Example 1: Padding
+    >>> input_tensor = torch.tensor([[1, 0], [3, 2]])
+    >>> static_size = 4
+    >>> t = torch.nonzero_static(input_tensor, size = static_size)
+    tensor([[  0,   0],
+            [  1,   0],
+            [  1,   1],
+            [  -1, -1]], dtype=torch.int64)
+
+    # Example 2: Truncating
+    >>> input_tensor = torch.tensor([[1, 0], [3, 2]])
+    >>> static_size = 2
+    >>> t = torch.nonzero_static(input_tensor, size = static_size)
+    tensor([[  0,   0],
+            [  1,   0]], dtype=torch.int64)
+
+    # Example 3: 0 size
+    >>> input_tensor = torch.tensor([10])
+    >>> static_size = 0
+    >>> t = torch.nonzero_static(input_tensor, size = static_size)
+    tensor([], size=(0, 1), dtype=torch.int64)
+
+    # Example 4: 0 rank input
+    >>> input_tensor = torch.tensor(10)
+    >>> static_size = 2
+    >>> t = torch.nonzero_static(input_tensor, size = static_size)
+    tensor([], size=(2, 0), dtype=torch.int64)
+""",
+)
+
+add_docstr_all(
+    "norm",
+    r"""
+norm(p=2, dim=None, keepdim=False) -> Tensor
+
+See :func:`torch.norm`
+""",
+)
+
+add_docstr_all(
+    "normal_",
+    r"""
+normal_(mean=0, std=1, *, generator=None) -> Tensor
+
+Fills :attr:`self` tensor with elements sampled from the normal distribution
+parameterized by :attr:`mean` and :attr:`std`.
+""",
+)
+
+add_docstr_all(
+    "numel",
+    r"""
+numel() -> int
+
+See :func:`torch.numel`
+""",
+)
+
+add_docstr_all(
+    "numpy",
+    r"""
+numpy(*, force=False) -> numpy.ndarray
+
+Returns the tensor as a NumPy :class:`ndarray`.
+
+If :attr:`force` is ``False`` (the default), the conversion
+is performed only if the tensor is on the CPU, does not require grad,
+does not have its conjugate bit set, and is a dtype and layout that
+NumPy supports. The returned ndarray and the tensor will share their
+storage, so changes to the tensor will be reflected in the ndarray
+and vice versa.
+
+If :attr:`force` is ``True`` this is equivalent to
+calling ``t.detach().cpu().resolve_conj().resolve_neg().numpy()``.
+If the tensor isn't on the CPU or the conjugate or negative bit is set,
+the tensor won't share its storage with the returned ndarray.
+Setting :attr:`force` to ``True`` can be a useful shorthand.
+
+Args:
+    force (bool): if ``True``, the ndarray may be a copy of the tensor
+               instead of always sharing memory, defaults to ``False``.
+""",
+)
+
+add_docstr_all(
+    "orgqr",
+    r"""
+orgqr(input2) -> Tensor
+
+See :func:`torch.orgqr`
+""",
+)
+
+add_docstr_all(
+    "ormqr",
+    r"""
+ormqr(input2, input3, left=True, transpose=False) -> Tensor
+
+See :func:`torch.ormqr`
+""",
+)
+
+add_docstr_all(
+    "permute",
+    r"""
+permute(*dims) -> Tensor
+
+See :func:`torch.permute`
+""",
+)
+
+add_docstr_all(
+    "polygamma",
+    r"""
+polygamma(n) -> Tensor
+
+See :func:`torch.polygamma`
+""",
+)
+
+add_docstr_all(
+    "polygamma_",
+    r"""
+polygamma_(n) -> Tensor
+
+In-place version of :meth:`~Tensor.polygamma`
+""",
+)
+
+add_docstr_all(
+    "positive",
+    r"""
+positive() -> Tensor
+
+See :func:`torch.positive`
+""",
+)
+
+add_docstr_all(
+    "pow",
+    r"""
+pow(exponent) -> Tensor
+
+See :func:`torch.pow`
+""",
+)
+
+add_docstr_all(
+    "pow_",
+    r"""
+pow_(exponent) -> Tensor
+
+In-place version of :meth:`~Tensor.pow`
+""",
+)
+
+add_docstr_all(
+    "float_power",
+    r"""
+float_power(exponent) -> Tensor
+
+See :func:`torch.float_power`
+""",
+)
+
+add_docstr_all(
+    "float_power_",
+    r"""
+float_power_(exponent) -> Tensor
+
+In-place version of :meth:`~Tensor.float_power`
+""",
+)
+
+add_docstr_all(
+    "prod",
+    r"""
+prod(dim=None, keepdim=False, dtype=None) -> Tensor
+
+See :func:`torch.prod`
+""",
+)
+
+add_docstr_all(
+    "put_",
+    r"""
+put_(index, source, accumulate=False) -> Tensor
+
+Copies the elements from :attr:`source` into the positions specified by
+:attr:`index`. For the purpose of indexing, the :attr:`self` tensor is treated as if
+it were a 1-D tensor.
+
+:attr:`index` and :attr:`source` need to have the same number of elements, but not necessarily
+the same shape.
+
+If :attr:`accumulate` is ``True``, the elements in :attr:`source` are added to
+:attr:`self`. If accumulate is ``False``, the behavior is undefined if :attr:`index`
+contains duplicate elements.
+
+Args:
+    index (LongTensor): the indices into self
+    source (Tensor): the tensor containing values to copy from
+    accumulate (bool): whether to accumulate into self
+
+Example::
+
+    >>> src = torch.tensor([[4, 3, 5],
+    ...                     [6, 7, 8]])
+    >>> src.put_(torch.tensor([1, 3]), torch.tensor([9, 10]))
+    tensor([[  4,   9,   5],
+            [ 10,   7,   8]])
+""",
+)
+
+add_docstr_all(
+    "put",
+    r"""
+put(input, index, source, accumulate=False) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.put_`.
+`input` corresponds to `self` in :meth:`torch.Tensor.put_`.
+""",
+)
+
+add_docstr_all(
+    "qr",
+    r"""
+qr(some=True) -> (Tensor, Tensor)
+
+See :func:`torch.qr`
+""",
+)
+
+add_docstr_all(
+    "qscheme",
+    r"""
+qscheme() -> torch.qscheme
+
+Returns the quantization scheme of a given QTensor.
+""",
+)
+
+add_docstr_all(
+    "quantile",
+    r"""
+quantile(q, dim=None, keepdim=False, *, interpolation='linear') -> Tensor
+
+See :func:`torch.quantile`
+""",
+)
+
+add_docstr_all(
+    "nanquantile",
+    r"""
+nanquantile(q, dim=None, keepdim=False, *, interpolation='linear') -> Tensor
+
+See :func:`torch.nanquantile`
+""",
+)
+
+add_docstr_all(
+    "q_scale",
+    r"""
+q_scale() -> float
+
+Given a Tensor quantized by linear (affine) quantization,
+returns the scale of the underlying quantizer().
+""",
+)
+
+add_docstr_all(
+    "q_zero_point",
+    r"""
+q_zero_point() -> int
+
+Given a Tensor quantized by linear (affine) quantization,
+returns the zero_point of the underlying quantizer().
+""",
+)
+
+add_docstr_all(
+    "q_per_channel_scales",
+    r"""
+q_per_channel_scales() -> Tensor
+
+Given a Tensor quantized by linear (affine) per-channel quantization,
+returns a Tensor of scales of the underlying quantizer. It has the number of
+elements that matches the corresponding dimensions (from q_per_channel_axis) of
+the tensor.
+""",
+)
+
+add_docstr_all(
+    "q_per_channel_zero_points",
+    r"""
+q_per_channel_zero_points() -> Tensor
+
+Given a Tensor quantized by linear (affine) per-channel quantization,
+returns a tensor of zero_points of the underlying quantizer. It has the number of
+elements that matches the corresponding dimensions (from q_per_channel_axis) of
+the tensor.
+""",
+)
+
+add_docstr_all(
+    "q_per_channel_axis",
+    r"""
+q_per_channel_axis() -> int
+
+Given a Tensor quantized by linear (affine) per-channel quantization,
+returns the index of dimension on which per-channel quantization is applied.
+""",
+)
+
+add_docstr_all(
+    "random_",
+    r"""
+random_(from=0, to=None, *, generator=None) -> Tensor
+
+Fills :attr:`self` tensor with numbers sampled from the discrete uniform
+distribution over ``[from, to - 1]``. If not specified, the values are usually
+only bounded by :attr:`self` tensor's data type. However, for floating point
+types, if unspecified, range will be ``[0, 2^mantissa]`` to ensure that every
+value is representable. For example, `torch.tensor(1, dtype=torch.double).random_()`
+will be uniform in ``[0, 2^53]``.
+""",
+)
+
+add_docstr_all(
+    "rad2deg",
+    r"""
+rad2deg() -> Tensor
+
+See :func:`torch.rad2deg`
+""",
+)
+
+add_docstr_all(
+    "rad2deg_",
+    r"""
+rad2deg_() -> Tensor
+
+In-place version of :meth:`~Tensor.rad2deg`
+""",
+)
+
+add_docstr_all(
+    "deg2rad",
+    r"""
+deg2rad() -> Tensor
+
+See :func:`torch.deg2rad`
+""",
+)
+
+add_docstr_all(
+    "deg2rad_",
+    r"""
+deg2rad_() -> Tensor
+
+In-place version of :meth:`~Tensor.deg2rad`
+""",
+)
+
+add_docstr_all(
+    "ravel",
+    r"""
+ravel() -> Tensor
+
+See :func:`torch.ravel`
+""",
+)
+
+add_docstr_all(
+    "reciprocal",
+    r"""
+reciprocal() -> Tensor
+
+See :func:`torch.reciprocal`
+""",
+)
+
+add_docstr_all(
+    "reciprocal_",
+    r"""
+reciprocal_() -> Tensor
+
+In-place version of :meth:`~Tensor.reciprocal`
+""",
+)
+
+add_docstr_all(
+    "record_stream",
+    r"""
+record_stream(stream)
+
+Marks the tensor as having been used by this stream.  When the tensor
+is deallocated, ensure the tensor memory is not reused for another tensor
+until all work queued on :attr:`stream` at the time of deallocation is
+complete.
+
+.. note::
+
+    The caching allocator is aware of only the stream where a tensor was
+    allocated. Due to the awareness, it already correctly manages the life
+    cycle of tensors on only one stream. But if a tensor is used on a stream
+    different from the stream of origin, the allocator might reuse the memory
+    unexpectedly. Calling this method lets the allocator know which streams
+    have used the tensor.
+
+.. warning::
+
+    This method is most suitable for use cases where you are providing a
+    function that created a tensor on a side stream, and want users to be able
+    to make use of the tensor without having to think carefully about stream
+    safety when making use of them.  These safety guarantees come at some
+    performance and predictability cost (analogous to the tradeoff between GC
+    and manual memory management), so if you are in a situation where
+    you manage the full lifetime of your tensors, you may consider instead
+    manually managing CUDA events so that calling this method is not necessary.
+    In particular, when you call this method, on later allocations the
+    allocator will poll the recorded stream to see if all operations have
+    completed yet; you can potentially race with side stream computation and
+    non-deterministically reuse or fail to reuse memory for an allocation.
+
+    You can safely use tensors allocated on side streams without
+    :meth:`~Tensor.record_stream`; you must manually ensure that
+    any non-creation stream uses of a tensor are synced back to the creation
+    stream before you deallocate the tensor.  As the CUDA caching allocator
+    guarantees that the memory will only be reused with the same creation stream,
+    this is sufficient to ensure that writes to future reallocations of the
+    memory will be delayed until non-creation stream uses are done.
+    (Counterintuitively, you may observe that on the CPU side we have already
+    reallocated the tensor, even though CUDA kernels on the old tensor are
+    still in progress.  This is fine, because CUDA operations on the new
+    tensor will appropriately wait for the old operations to complete, as they
+    are all on the same stream.)
+
+    Concretely, this looks like this::
+
+        with torch.cuda.stream(s0):
+            x = torch.zeros(N)
+
+        s1.wait_stream(s0)
+        with torch.cuda.stream(s1):
+            y = some_comm_op(x)
+
+        ... some compute on s0 ...
+
+        # synchronize creation stream s0 to side stream s1
+        # before deallocating x
+        s0.wait_stream(s1)
+        del x
+
+    Note that some discretion is required when deciding when to perform
+    ``s0.wait_stream(s1)``.  In particular, if we were to wait immediately
+    after ``some_comm_op``, there wouldn't be any point in having the side
+    stream; it would be equivalent to have run ``some_comm_op`` on ``s0``.
+    Instead, the synchronization must be placed at some appropriate, later
+    point in time where you expect the side stream ``s1`` to have finished
+    work.  This location is typically identified via profiling, e.g., using
+    Chrome traces produced
+    :meth:`torch.autograd.profiler.profile.export_chrome_trace`.  If you
+    place the wait too early, work on s0 will block until ``s1`` has finished,
+    preventing further overlapping of communication and computation.  If you
+    place the wait too late, you will use more memory than is strictly
+    necessary (as you are keeping ``x`` live for longer.)  For a concrete
+    example of how this guidance can be applied in practice, see this post:
+    `FSDP and CUDACachingAllocator
+    <https://dev-discuss.pytorch.org/t/fsdp-cudacachingallocator-an-outsider-newb-perspective/1486>`_.
+""",
+)
+
+add_docstr_all(
+    "remainder",
+    r"""
+remainder(divisor) -> Tensor
+
+See :func:`torch.remainder`
+""",
+)
+
+add_docstr_all(
+    "remainder_",
+    r"""
+remainder_(divisor) -> Tensor
+
+In-place version of :meth:`~Tensor.remainder`
+""",
+)
+
+add_docstr_all(
+    "renorm",
+    r"""
+renorm(p, dim, maxnorm) -> Tensor
+
+See :func:`torch.renorm`
+""",
+)
+
+add_docstr_all(
+    "renorm_",
+    r"""
+renorm_(p, dim, maxnorm) -> Tensor
+
+In-place version of :meth:`~Tensor.renorm`
+""",
+)
+
+add_docstr_all(
+    "repeat",
+    r"""
+repeat(*sizes) -> Tensor
+
+Repeats this tensor along the specified dimensions.
+
+Unlike :meth:`~Tensor.expand`, this function copies the tensor's data.
+
+.. warning::
+
+    :meth:`~Tensor.repeat` behaves differently from
+    `numpy.repeat <https://docs.scipy.org/doc/numpy/reference/generated/numpy.repeat.html>`_,
+    but is more similar to
+    `numpy.tile <https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html>`_.
+    For the operator similar to `numpy.repeat`, see :func:`torch.repeat_interleave`.
+
+Args:
+    sizes (torch.Size or int...): The number of times to repeat this tensor along each
+        dimension
+
+Example::
+
+    >>> x = torch.tensor([1, 2, 3])
+    >>> x.repeat(4, 2)
+    tensor([[ 1,  2,  3,  1,  2,  3],
+            [ 1,  2,  3,  1,  2,  3],
+            [ 1,  2,  3,  1,  2,  3],
+            [ 1,  2,  3,  1,  2,  3]])
+    >>> x.repeat(4, 2, 1).size()
+    torch.Size([4, 2, 3])
+""",
+)
+
+add_docstr_all(
+    "repeat_interleave",
+    r"""
+repeat_interleave(repeats, dim=None, *, output_size=None) -> Tensor
+
+See :func:`torch.repeat_interleave`.
+""",
+)
+
+add_docstr_all(
+    "requires_grad_",
+    r"""
+requires_grad_(requires_grad=True) -> Tensor
+
+Change if autograd should record operations on this tensor: sets this tensor's
+:attr:`requires_grad` attribute in-place. Returns this tensor.
+
+:func:`requires_grad_`'s main use case is to tell autograd to begin recording
+operations on a Tensor ``tensor``. If ``tensor`` has ``requires_grad=False``
+(because it was obtained through a DataLoader, or required preprocessing or
+initialization), ``tensor.requires_grad_()`` makes it so that autograd will
+begin to record operations on ``tensor``.
+
+Args:
+    requires_grad (bool): If autograd should record operations on this tensor.
+        Default: ``True``.
+
+Example::
+
+    >>> # Let's say we want to preprocess some saved weights and use
+    >>> # the result as new weights.
+    >>> saved_weights = [0.1, 0.2, 0.3, 0.25]
+    >>> loaded_weights = torch.tensor(saved_weights)
+    >>> weights = preprocess(loaded_weights)  # some function
+    >>> weights
+    tensor([-0.5503,  0.4926, -2.1158, -0.8303])
+
+    >>> # Now, start to record operations done to weights
+    >>> weights.requires_grad_()
+    >>> out = weights.pow(2).sum()
+    >>> out.backward()
+    >>> weights.grad
+    tensor([-1.1007,  0.9853, -4.2316, -1.6606])
+
+""",
+)
+
+add_docstr_all(
+    "reshape",
+    r"""
+reshape(*shape) -> Tensor
+
+Returns a tensor with the same data and number of elements as :attr:`self`
+but with the specified shape. This method returns a view if :attr:`shape` is
+compatible with the current shape. See :meth:`torch.Tensor.view` on when it is
+possible to return a view.
+
+See :func:`torch.reshape`
+
+Args:
+    shape (tuple of ints or int...): the desired shape
+
+""",
+)
+
+add_docstr_all(
+    "reshape_as",
+    r"""
+reshape_as(other) -> Tensor
+
+Returns this tensor as the same shape as :attr:`other`.
+``self.reshape_as(other)`` is equivalent to ``self.reshape(other.sizes())``.
+This method returns a view if ``other.sizes()`` is compatible with the current
+shape. See :meth:`torch.Tensor.view` on when it is possible to return a view.
+
+Please see :meth:`reshape` for more information about ``reshape``.
+
+Args:
+    other (:class:`torch.Tensor`): The result tensor has the same shape
+        as :attr:`other`.
+""",
+)
+
+add_docstr_all(
+    "resize_",
+    r"""
+resize_(*sizes, memory_format=torch.contiguous_format) -> Tensor
+
+Resizes :attr:`self` tensor to the specified size. If the number of elements is
+larger than the current storage size, then the underlying storage is resized
+to fit the new number of elements. If the number of elements is smaller, the
+underlying storage is not changed. Existing elements are preserved but any new
+memory is uninitialized.
+
+.. warning::
+
+    This is a low-level method. The storage is reinterpreted as C-contiguous,
+    ignoring the current strides (unless the target size equals the current
+    size, in which case the tensor is left unchanged). For most purposes, you
+    will instead want to use :meth:`~Tensor.view()`, which checks for
+    contiguity, or :meth:`~Tensor.reshape()`, which copies data if needed. To
+    change the size in-place with custom strides, see :meth:`~Tensor.set_()`.
+
+.. note::
+
+    If :func:`torch.use_deterministic_algorithms()` and
+    :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to
+    ``True``, new elements are initialized to prevent nondeterministic behavior
+    from using the result as an input to an operation. Floating point and
+    complex values are set to NaN, and integer values are set to the maximum
+    value.
+
+Args:
+    sizes (torch.Size or int...): the desired size
+    memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+        Tensor. Default: ``torch.contiguous_format``. Note that memory format of
+        :attr:`self` is going to be unaffected if ``self.size()`` matches ``sizes``.
+
+Example::
+
+    >>> x = torch.tensor([[1, 2], [3, 4], [5, 6]])
+    >>> x.resize_(2, 2)
+    tensor([[ 1,  2],
+            [ 3,  4]])
+""",
+)
+
+add_docstr_all(
+    "resize_as_",
+    r"""
+resize_as_(tensor, memory_format=torch.contiguous_format) -> Tensor
+
+Resizes the :attr:`self` tensor to be the same size as the specified
+:attr:`tensor`. This is equivalent to ``self.resize_(tensor.size())``.
+
+Args:
+    memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+        Tensor. Default: ``torch.contiguous_format``. Note that memory format of
+        :attr:`self` is going to be unaffected if ``self.size()`` matches ``tensor.size()``.
+
+""",
+)
+
+add_docstr_all(
+    "rot90",
+    r"""
+rot90(k, dims) -> Tensor
+
+See :func:`torch.rot90`
+""",
+)
+
+add_docstr_all(
+    "round",
+    r"""
+round(decimals=0) -> Tensor
+
+See :func:`torch.round`
+""",
+)
+
+add_docstr_all(
+    "round_",
+    r"""
+round_(decimals=0) -> Tensor
+
+In-place version of :meth:`~Tensor.round`
+""",
+)
+
+add_docstr_all(
+    "rsqrt",
+    r"""
+rsqrt() -> Tensor
+
+See :func:`torch.rsqrt`
+""",
+)
+
+add_docstr_all(
+    "rsqrt_",
+    r"""
+rsqrt_() -> Tensor
+
+In-place version of :meth:`~Tensor.rsqrt`
+""",
+)
+
+add_docstr_all(
+    "scatter_",
+    r"""
+scatter_(dim, index, src, *, reduce=None) -> Tensor
+
+Writes all values from the tensor :attr:`src` into :attr:`self` at the indices
+specified in the :attr:`index` tensor. For each value in :attr:`src`, its output
+index is specified by its index in :attr:`src` for ``dimension != dim`` and by
+the corresponding value in :attr:`index` for ``dimension = dim``.
+
+For a 3-D tensor, :attr:`self` is updated as::
+
+    self[index[i][j][k]][j][k] = src[i][j][k]  # if dim == 0
+    self[i][index[i][j][k]][k] = src[i][j][k]  # if dim == 1
+    self[i][j][index[i][j][k]] = src[i][j][k]  # if dim == 2
+
+This is the reverse operation of the manner described in :meth:`~Tensor.gather`.
+
+:attr:`self`, :attr:`index` and :attr:`src` (if it is a Tensor) should all have
+the same number of dimensions. It is also required that
+``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that
+``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``.
+Note that ``index`` and ``src`` do not broadcast.
+
+Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be
+between ``0`` and ``self.size(dim) - 1`` inclusive.
+
+.. warning::
+
+    When indices are not unique, the behavior is non-deterministic (one of the
+    values from ``src`` will be picked arbitrarily) and the gradient will be
+    incorrect (it will be propagated to all locations in the source that
+    correspond to the same index)!
+
+.. note::
+
+    The backward pass is implemented only for ``src.shape == index.shape``.
+
+Additionally accepts an optional :attr:`reduce` argument that allows
+specification of an optional reduction operation, which is applied to all
+values in the tensor :attr:`src` into :attr:`self` at the indices
+specified in the :attr:`index`. For each value in :attr:`src`, the reduction
+operation is applied to an index in :attr:`self` which is specified by
+its index in :attr:`src` for ``dimension != dim`` and by the corresponding
+value in :attr:`index` for ``dimension = dim``.
+
+Given a 3-D tensor and reduction using the multiplication operation, :attr:`self`
+is updated as::
+
+    self[index[i][j][k]][j][k] *= src[i][j][k]  # if dim == 0
+    self[i][index[i][j][k]][k] *= src[i][j][k]  # if dim == 1
+    self[i][j][index[i][j][k]] *= src[i][j][k]  # if dim == 2
+
+Reducing with the addition operation is the same as using
+:meth:`~torch.Tensor.scatter_add_`.
+
+.. warning::
+    The reduce argument with Tensor ``src`` is deprecated and will be removed in
+    a future PyTorch release. Please use :meth:`~torch.Tensor.scatter_reduce_`
+    instead for more reduction options.
+
+Args:
+    dim (int): the axis along which to index
+    index (LongTensor): the indices of elements to scatter, can be either empty
+        or of the same dimensionality as ``src``. When empty, the operation
+        returns ``self`` unchanged.
+    src (Tensor): the source element(s) to scatter.
+
+Keyword args:
+    reduce (str, optional): reduction operation to apply, can be either
+        ``'add'`` or ``'multiply'``.
+
+Example::
+
+    >>> src = torch.arange(1, 11).reshape((2, 5))
+    >>> src
+    tensor([[ 1,  2,  3,  4,  5],
+            [ 6,  7,  8,  9, 10]])
+    >>> index = torch.tensor([[0, 1, 2, 0]])
+    >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(0, index, src)
+    tensor([[1, 0, 0, 4, 0],
+            [0, 2, 0, 0, 0],
+            [0, 0, 3, 0, 0]])
+    >>> index = torch.tensor([[0, 1, 2], [0, 1, 4]])
+    >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(1, index, src)
+    tensor([[1, 2, 3, 0, 0],
+            [6, 7, 0, 0, 8],
+            [0, 0, 0, 0, 0]])
+
+    >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]),
+    ...            1.23, reduce='multiply')
+    tensor([[2.0000, 2.0000, 2.4600, 2.0000],
+            [2.0000, 2.0000, 2.0000, 2.4600]])
+    >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]),
+    ...            1.23, reduce='add')
+    tensor([[2.0000, 2.0000, 3.2300, 2.0000],
+            [2.0000, 2.0000, 2.0000, 3.2300]])
+
+.. function:: scatter_(dim, index, value, *, reduce=None) -> Tensor:
+   :noindex:
+
+Writes the value from :attr:`value` into :attr:`self` at the indices
+specified in the :attr:`index` tensor.  This operation is equivalent to the previous version,
+with the :attr:`src` tensor filled entirely with :attr:`value`.
+
+Args:
+    dim (int): the axis along which to index
+    index (LongTensor): the indices of elements to scatter, can be either empty
+        or of the same dimensionality as ``src``. When empty, the operation
+        returns ``self`` unchanged.
+    value (Scalar): the value to scatter.
+
+Keyword args:
+    reduce (str, optional): reduction operation to apply, can be either
+        ``'add'`` or ``'multiply'``.
+
+Example::
+
+    >>> index = torch.tensor([[0, 1]])
+    >>> value = 2
+    >>> torch.zeros(3, 5).scatter_(0, index, value)
+    tensor([[2., 0., 0., 0., 0.],
+            [0., 2., 0., 0., 0.],
+            [0., 0., 0., 0., 0.]])
+""",
+)
+
+add_docstr_all(
+    "scatter_add_",
+    r"""
+scatter_add_(dim, index, src) -> Tensor
+
+Adds all values from the tensor :attr:`src` into :attr:`self` at the indices
+specified in the :attr:`index` tensor in a similar fashion as
+:meth:`~torch.Tensor.scatter_`. For each value in :attr:`src`, it is added to
+an index in :attr:`self` which is specified by its index in :attr:`src`
+for ``dimension != dim`` and by the corresponding value in :attr:`index` for
+``dimension = dim``.
+
+For a 3-D tensor, :attr:`self` is updated as::
+
+    self[index[i][j][k]][j][k] += src[i][j][k]  # if dim == 0
+    self[i][index[i][j][k]][k] += src[i][j][k]  # if dim == 1
+    self[i][j][index[i][j][k]] += src[i][j][k]  # if dim == 2
+
+:attr:`self`, :attr:`index` and :attr:`src` should have same number of
+dimensions. It is also required that ``index.size(d) <= src.size(d)`` for all
+dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all dimensions
+``d != dim``. Note that ``index`` and ``src`` do not broadcast.
+
+Note:
+    {forward_reproducibility_note}
+
+.. note::
+
+    The backward pass is implemented only for ``src.shape == index.shape``.
+
+Args:
+    dim (int): the axis along which to index
+    index (LongTensor): the indices of elements to scatter and add, can be
+        either empty or of the same dimensionality as ``src``. When empty, the
+        operation returns ``self`` unchanged.
+    src (Tensor): the source elements to scatter and add
+
+Example::
+
+    >>> src = torch.ones((2, 5))
+    >>> index = torch.tensor([[0, 1, 2, 0, 0]])
+    >>> torch.zeros(3, 5, dtype=src.dtype).scatter_add_(0, index, src)
+    tensor([[1., 0., 0., 1., 1.],
+            [0., 1., 0., 0., 0.],
+            [0., 0., 1., 0., 0.]])
+    >>> index = torch.tensor([[0, 1, 2, 0, 0], [0, 1, 2, 2, 2]])
+    >>> torch.zeros(3, 5, dtype=src.dtype).scatter_add_(0, index, src)
+    tensor([[2., 0., 0., 1., 1.],
+            [0., 2., 0., 0., 0.],
+            [0., 0., 2., 1., 1.]])
+
+""".format(
+        **reproducibility_notes
+    ),
+)
+
+add_docstr_all(
+    "scatter_reduce_",
+    r"""
+scatter_reduce_(dim, index, src, reduce, *, include_self=True) -> Tensor
+
+Reduces all values from the :attr:`src` tensor to the indices specified in
+the :attr:`index` tensor in the :attr:`self` tensor using the applied reduction
+defined via the :attr:`reduce` argument (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`,
+:obj:`"amax"`, :obj:`"amin"`). For each value in :attr:`src`, it is reduced to an
+index in :attr:`self` which is specified by its index in :attr:`src` for
+``dimension != dim`` and by the corresponding value in :attr:`index` for
+``dimension = dim``. If :obj:`include_self="True"`, the values in the :attr:`self`
+tensor are included in the reduction.
+
+:attr:`self`, :attr:`index` and :attr:`src` should all have
+the same number of dimensions. It is also required that
+``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that
+``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``.
+Note that ``index`` and ``src`` do not broadcast.
+
+For a 3-D tensor with :obj:`reduce="sum"` and :obj:`include_self=True` the
+output is given as::
+
+    self[index[i][j][k]][j][k] += src[i][j][k]  # if dim == 0
+    self[i][index[i][j][k]][k] += src[i][j][k]  # if dim == 1
+    self[i][j][index[i][j][k]] += src[i][j][k]  # if dim == 2
+
+Note:
+    {forward_reproducibility_note}
+
+.. note::
+
+    The backward pass is implemented only for ``src.shape == index.shape``.
+
+.. warning::
+
+    This function is in beta and may change in the near future.
+
+Args:
+    dim (int): the axis along which to index
+    index (LongTensor): the indices of elements to scatter and reduce.
+    src (Tensor): the source elements to scatter and reduce
+    reduce (str): the reduction operation to apply for non-unique indices
+        (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`)
+    include_self (bool): whether elements from the :attr:`self` tensor are
+        included in the reduction
+
+Example::
+
+    >>> src = torch.tensor([1., 2., 3., 4., 5., 6.])
+    >>> index = torch.tensor([0, 1, 0, 1, 2, 1])
+    >>> input = torch.tensor([1., 2., 3., 4.])
+    >>> input.scatter_reduce(0, index, src, reduce="sum")
+    tensor([5., 14., 8., 4.])
+    >>> input.scatter_reduce(0, index, src, reduce="sum", include_self=False)
+    tensor([4., 12., 5., 4.])
+    >>> input2 = torch.tensor([5., 4., 3., 2.])
+    >>> input2.scatter_reduce(0, index, src, reduce="amax")
+    tensor([5., 6., 5., 2.])
+    >>> input2.scatter_reduce(0, index, src, reduce="amax", include_self=False)
+    tensor([3., 6., 5., 2.])
+
+
+""".format(
+        **reproducibility_notes
+    ),
+)
+
+add_docstr_all(
+    "select",
+    r"""
+select(dim, index) -> Tensor
+
+See :func:`torch.select`
+""",
+)
+
+add_docstr_all(
+    "select_scatter",
+    r"""
+select_scatter(src, dim, index) -> Tensor
+
+See :func:`torch.select_scatter`
+""",
+)
+
+add_docstr_all(
+    "slice_scatter",
+    r"""
+slice_scatter(src, dim=0, start=None, end=None, step=1) -> Tensor
+
+See :func:`torch.slice_scatter`
+""",
+)
+
+add_docstr_all(
+    "set_",
+    r"""
+set_(source=None, storage_offset=0, size=None, stride=None) -> Tensor
+
+Sets the underlying storage, size, and strides. If :attr:`source` is a tensor,
+:attr:`self` tensor will share the same storage and have the same size and
+strides as :attr:`source`. Changes to elements in one tensor will be reflected
+in the other.
+
+If :attr:`source` is a :class:`~torch.Storage`, the method sets the underlying
+storage, offset, size, and stride.
+
+Args:
+    source (Tensor or Storage): the tensor or storage to use
+    storage_offset (int, optional): the offset in the storage
+    size (torch.Size, optional): the desired size. Defaults to the size of the source.
+    stride (tuple, optional): the desired stride. Defaults to C-contiguous strides.
+""",
+)
+
+add_docstr_all(
+    "sigmoid",
+    r"""
+sigmoid() -> Tensor
+
+See :func:`torch.sigmoid`
+""",
+)
+
+add_docstr_all(
+    "sigmoid_",
+    r"""
+sigmoid_() -> Tensor
+
+In-place version of :meth:`~Tensor.sigmoid`
+""",
+)
+
+add_docstr_all(
+    "logit",
+    r"""
+logit() -> Tensor
+
+See :func:`torch.logit`
+""",
+)
+
+add_docstr_all(
+    "logit_",
+    r"""
+logit_() -> Tensor
+
+In-place version of :meth:`~Tensor.logit`
+""",
+)
+
+add_docstr_all(
+    "sign",
+    r"""
+sign() -> Tensor
+
+See :func:`torch.sign`
+""",
+)
+
+add_docstr_all(
+    "sign_",
+    r"""
+sign_() -> Tensor
+
+In-place version of :meth:`~Tensor.sign`
+""",
+)
+
+add_docstr_all(
+    "signbit",
+    r"""
+signbit() -> Tensor
+
+See :func:`torch.signbit`
+""",
+)
+
+add_docstr_all(
+    "sgn",
+    r"""
+sgn() -> Tensor
+
+See :func:`torch.sgn`
+""",
+)
+
+add_docstr_all(
+    "sgn_",
+    r"""
+sgn_() -> Tensor
+
+In-place version of :meth:`~Tensor.sgn`
+""",
+)
+
+add_docstr_all(
+    "sin",
+    r"""
+sin() -> Tensor
+
+See :func:`torch.sin`
+""",
+)
+
+add_docstr_all(
+    "sin_",
+    r"""
+sin_() -> Tensor
+
+In-place version of :meth:`~Tensor.sin`
+""",
+)
+
+add_docstr_all(
+    "sinc",
+    r"""
+sinc() -> Tensor
+
+See :func:`torch.sinc`
+""",
+)
+
+add_docstr_all(
+    "sinc_",
+    r"""
+sinc_() -> Tensor
+
+In-place version of :meth:`~Tensor.sinc`
+""",
+)
+
+add_docstr_all(
+    "sinh",
+    r"""
+sinh() -> Tensor
+
+See :func:`torch.sinh`
+""",
+)
+
+add_docstr_all(
+    "sinh_",
+    r"""
+sinh_() -> Tensor
+
+In-place version of :meth:`~Tensor.sinh`
+""",
+)
+
+add_docstr_all(
+    "size",
+    r"""
+size(dim=None) -> torch.Size or int
+
+Returns the size of the :attr:`self` tensor. If ``dim`` is not specified,
+the returned value is a :class:`torch.Size`, a subclass of :class:`tuple`.
+If ``dim`` is specified, returns an int holding the size of that dimension.
+
+Args:
+  dim (int, optional): The dimension for which to retrieve the size.
+
+Example::
+
+    >>> t = torch.empty(3, 4, 5)
+    >>> t.size()
+    torch.Size([3, 4, 5])
+    >>> t.size(dim=1)
+    4
+
+""",
+)
+
+add_docstr_all(
+    "shape",
+    r"""
+shape() -> torch.Size
+
+Returns the size of the :attr:`self` tensor. Alias for :attr:`size`.
+
+See also :meth:`Tensor.size`.
+
+Example::
+
+    >>> t = torch.empty(3, 4, 5)
+    >>> t.size()
+    torch.Size([3, 4, 5])
+    >>> t.shape
+    torch.Size([3, 4, 5])
+
+""",
+)
+
+add_docstr_all(
+    "sort",
+    r"""
+sort(dim=-1, descending=False) -> (Tensor, LongTensor)
+
+See :func:`torch.sort`
+""",
+)
+
+add_docstr_all(
+    "msort",
+    r"""
+msort() -> Tensor
+
+See :func:`torch.msort`
+""",
+)
+
+add_docstr_all(
+    "argsort",
+    r"""
+argsort(dim=-1, descending=False) -> LongTensor
+
+See :func:`torch.argsort`
+""",
+)
+
+add_docstr_all(
+    "sparse_dim",
+    r"""
+sparse_dim() -> int
+
+Return the number of sparse dimensions in a :ref:`sparse tensor <sparse-docs>` :attr:`self`.
+
+.. note::
+  Returns ``0`` if :attr:`self` is not a sparse tensor.
+
+See also :meth:`Tensor.dense_dim` and :ref:`hybrid tensors <sparse-hybrid-coo-docs>`.
+""",
+)
+
+add_docstr_all(
+    "sparse_resize_",
+    r"""
+sparse_resize_(size, sparse_dim, dense_dim) -> Tensor
+
+Resizes :attr:`self` :ref:`sparse tensor <sparse-docs>` to the desired
+size and the number of sparse and dense dimensions.
+
+.. note::
+  If the number of specified elements in :attr:`self` is zero, then
+  :attr:`size`, :attr:`sparse_dim`, and :attr:`dense_dim` can be any
+  size and positive integers such that ``len(size) == sparse_dim +
+  dense_dim``.
+
+  If :attr:`self` specifies one or more elements, however, then each
+  dimension in :attr:`size` must not be smaller than the corresponding
+  dimension of :attr:`self`, :attr:`sparse_dim` must equal the number
+  of sparse dimensions in :attr:`self`, and :attr:`dense_dim` must
+  equal the number of dense dimensions in :attr:`self`.
+
+.. warning::
+  Throws an error if :attr:`self` is not a sparse tensor.
+
+Args:
+    size (torch.Size): the desired size. If :attr:`self` is non-empty
+      sparse tensor, the desired size cannot be smaller than the
+      original size.
+    sparse_dim (int): the number of sparse dimensions
+    dense_dim (int): the number of dense dimensions
+""",
+)
+
+add_docstr_all(
+    "sparse_resize_and_clear_",
+    r"""
+sparse_resize_and_clear_(size, sparse_dim, dense_dim) -> Tensor
+
+Removes all specified elements from a :ref:`sparse tensor
+<sparse-docs>` :attr:`self` and resizes :attr:`self` to the desired
+size and the number of sparse and dense dimensions.
+
+.. warning:
+  Throws an error if :attr:`self` is not a sparse tensor.
+
+Args:
+    size (torch.Size): the desired size.
+    sparse_dim (int): the number of sparse dimensions
+    dense_dim (int): the number of dense dimensions
+""",
+)
+
+add_docstr_all(
+    "sqrt",
+    r"""
+sqrt() -> Tensor
+
+See :func:`torch.sqrt`
+""",
+)
+
+add_docstr_all(
+    "sqrt_",
+    r"""
+sqrt_() -> Tensor
+
+In-place version of :meth:`~Tensor.sqrt`
+""",
+)
+
+add_docstr_all(
+    "square",
+    r"""
+square() -> Tensor
+
+See :func:`torch.square`
+""",
+)
+
+add_docstr_all(
+    "square_",
+    r"""
+square_() -> Tensor
+
+In-place version of :meth:`~Tensor.square`
+""",
+)
+
+add_docstr_all(
+    "squeeze",
+    r"""
+squeeze(dim=None) -> Tensor
+
+See :func:`torch.squeeze`
+""",
+)
+
+add_docstr_all(
+    "squeeze_",
+    r"""
+squeeze_(dim=None) -> Tensor
+
+In-place version of :meth:`~Tensor.squeeze`
+""",
+)
+
+add_docstr_all(
+    "std",
+    r"""
+std(dim=None, *, correction=1, keepdim=False) -> Tensor
+
+See :func:`torch.std`
+""",
+)
+
+add_docstr_all(
+    "storage_offset",
+    r"""
+storage_offset() -> int
+
+Returns :attr:`self` tensor's offset in the underlying storage in terms of
+number of storage elements (not bytes).
+
+Example::
+
+    >>> x = torch.tensor([1, 2, 3, 4, 5])
+    >>> x.storage_offset()
+    0
+    >>> x[3:].storage_offset()
+    3
+
+""",
+)
+
+add_docstr_all(
+    "untyped_storage",
+    r"""
+untyped_storage() -> torch.UntypedStorage
+
+Returns the underlying :class:`UntypedStorage`.
+""",
+)
+
+add_docstr_all(
+    "stride",
+    r"""
+stride(dim) -> tuple or int
+
+Returns the stride of :attr:`self` tensor.
+
+Stride is the jump necessary to go from one element to the next one in the
+specified dimension :attr:`dim`. A tuple of all strides is returned when no
+argument is passed in. Otherwise, an integer value is returned as the stride in
+the particular dimension :attr:`dim`.
+
+Args:
+    dim (int, optional): the desired dimension in which stride is required
+
+Example::
+
+    >>> x = torch.tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
+    >>> x.stride()
+    (5, 1)
+    >>> x.stride(0)
+    5
+    >>> x.stride(-1)
+    1
+
+""",
+)
+
+add_docstr_all(
+    "sub",
+    r"""
+sub(other, *, alpha=1) -> Tensor
+
+See :func:`torch.sub`.
+""",
+)
+
+add_docstr_all(
+    "sub_",
+    r"""
+sub_(other, *, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.sub`
+""",
+)
+
+add_docstr_all(
+    "subtract",
+    r"""
+subtract(other, *, alpha=1) -> Tensor
+
+See :func:`torch.subtract`.
+""",
+)
+
+add_docstr_all(
+    "subtract_",
+    r"""
+subtract_(other, *, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.subtract`.
+""",
+)
+
+add_docstr_all(
+    "sum",
+    r"""
+sum(dim=None, keepdim=False, dtype=None) -> Tensor
+
+See :func:`torch.sum`
+""",
+)
+
+add_docstr_all(
+    "nansum",
+    r"""
+nansum(dim=None, keepdim=False, dtype=None) -> Tensor
+
+See :func:`torch.nansum`
+""",
+)
+
+add_docstr_all(
+    "svd",
+    r"""
+svd(some=True, compute_uv=True) -> (Tensor, Tensor, Tensor)
+
+See :func:`torch.svd`
+""",
+)
+
+add_docstr_all(
+    "swapdims",
+    r"""
+swapdims(dim0, dim1) -> Tensor
+
+See :func:`torch.swapdims`
+""",
+)
+
+add_docstr_all(
+    "swapdims_",
+    r"""
+swapdims_(dim0, dim1) -> Tensor
+
+In-place version of :meth:`~Tensor.swapdims`
+""",
+)
+
+add_docstr_all(
+    "swapaxes",
+    r"""
+swapaxes(axis0, axis1) -> Tensor
+
+See :func:`torch.swapaxes`
+""",
+)
+
+add_docstr_all(
+    "swapaxes_",
+    r"""
+swapaxes_(axis0, axis1) -> Tensor
+
+In-place version of :meth:`~Tensor.swapaxes`
+""",
+)
+
+add_docstr_all(
+    "t",
+    r"""
+t() -> Tensor
+
+See :func:`torch.t`
+""",
+)
+
+add_docstr_all(
+    "t_",
+    r"""
+t_() -> Tensor
+
+In-place version of :meth:`~Tensor.t`
+""",
+)
+
+add_docstr_all(
+    "tile",
+    r"""
+tile(dims) -> Tensor
+
+See :func:`torch.tile`
+""",
+)
+
+add_docstr_all(
+    "to",
+    r"""
+to(*args, **kwargs) -> Tensor
+
+Performs Tensor dtype and/or device conversion. A :class:`torch.dtype` and :class:`torch.device` are
+inferred from the arguments of ``self.to(*args, **kwargs)``.
+
+.. note::
+
+    If the ``self`` Tensor already
+    has the correct :class:`torch.dtype` and :class:`torch.device`, then ``self`` is returned.
+    Otherwise, the returned tensor is a copy of ``self`` with the desired
+    :class:`torch.dtype` and :class:`torch.device`.
+
+Here are the ways to call ``to``:
+
+.. method:: to(dtype, non_blocking=False, copy=False, memory_format=torch.preserve_format) -> Tensor
+   :noindex:
+
+    Returns a Tensor with the specified :attr:`dtype`
+
+    Args:
+        {memory_format}
+
+.. method:: to(device=None, dtype=None, non_blocking=False, copy=False, memory_format=torch.preserve_format) -> Tensor
+   :noindex:
+
+    Returns a Tensor with the specified :attr:`device` and (optional)
+    :attr:`dtype`. If :attr:`dtype` is ``None`` it is inferred to be ``self.dtype``.
+    When :attr:`non_blocking`, tries to convert asynchronously with respect to
+    the host if possible, e.g., converting a CPU Tensor with pinned memory to a
+    CUDA Tensor.
+    When :attr:`copy` is set, a new Tensor is created even when the Tensor
+    already matches the desired conversion.
+
+    Args:
+        {memory_format}
+
+.. method:: to(other, non_blocking=False, copy=False) -> Tensor
+   :noindex:
+
+    Returns a Tensor with same :class:`torch.dtype` and :class:`torch.device` as
+    the Tensor :attr:`other`. When :attr:`non_blocking`, tries to convert
+    asynchronously with respect to the host if possible, e.g., converting a CPU
+    Tensor with pinned memory to a CUDA Tensor.
+    When :attr:`copy` is set, a new Tensor is created even when the Tensor
+    already matches the desired conversion.
+
+Example::
+
+    >>> tensor = torch.randn(2, 2)  # Initially dtype=float32, device=cpu
+    >>> tensor.to(torch.float64)
+    tensor([[-0.5044,  0.0005],
+            [ 0.3310, -0.0584]], dtype=torch.float64)
+
+    >>> cuda0 = torch.device('cuda:0')
+    >>> tensor.to(cuda0)
+    tensor([[-0.5044,  0.0005],
+            [ 0.3310, -0.0584]], device='cuda:0')
+
+    >>> tensor.to(cuda0, dtype=torch.float64)
+    tensor([[-0.5044,  0.0005],
+            [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0')
+
+    >>> other = torch.randn((), dtype=torch.float64, device=cuda0)
+    >>> tensor.to(other, non_blocking=True)
+    tensor([[-0.5044,  0.0005],
+            [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0')
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "byte",
+    r"""
+byte(memory_format=torch.preserve_format) -> Tensor
+
+``self.byte()`` is equivalent to ``self.to(torch.uint8)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "bool",
+    r"""
+bool(memory_format=torch.preserve_format) -> Tensor
+
+``self.bool()`` is equivalent to ``self.to(torch.bool)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "char",
+    r"""
+char(memory_format=torch.preserve_format) -> Tensor
+
+``self.char()`` is equivalent to ``self.to(torch.int8)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "bfloat16",
+    r"""
+bfloat16(memory_format=torch.preserve_format) -> Tensor
+``self.bfloat16()`` is equivalent to ``self.to(torch.bfloat16)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "double",
+    r"""
+double(memory_format=torch.preserve_format) -> Tensor
+
+``self.double()`` is equivalent to ``self.to(torch.float64)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "float",
+    r"""
+float(memory_format=torch.preserve_format) -> Tensor
+
+``self.float()`` is equivalent to ``self.to(torch.float32)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "cdouble",
+    r"""
+cdouble(memory_format=torch.preserve_format) -> Tensor
+
+``self.cdouble()`` is equivalent to ``self.to(torch.complex128)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "cfloat",
+    r"""
+cfloat(memory_format=torch.preserve_format) -> Tensor
+
+``self.cfloat()`` is equivalent to ``self.to(torch.complex64)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "chalf",
+    r"""
+chalf(memory_format=torch.preserve_format) -> Tensor
+
+``self.chalf()`` is equivalent to ``self.to(torch.complex32)``. See :func:`to`.
+
+Args:
+     {memory_format}
+ """.format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "half",
+    r"""
+half(memory_format=torch.preserve_format) -> Tensor
+
+``self.half()`` is equivalent to ``self.to(torch.float16)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "int",
+    r"""
+int(memory_format=torch.preserve_format) -> Tensor
+
+``self.int()`` is equivalent to ``self.to(torch.int32)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "int_repr",
+    r"""
+int_repr() -> Tensor
+
+Given a quantized Tensor,
+``self.int_repr()`` returns a CPU Tensor with uint8_t as data type that stores the
+underlying uint8_t values of the given Tensor.
+""",
+)
+
+
+add_docstr_all(
+    "long",
+    r"""
+long(memory_format=torch.preserve_format) -> Tensor
+
+``self.long()`` is equivalent to ``self.to(torch.int64)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "short",
+    r"""
+short(memory_format=torch.preserve_format) -> Tensor
+
+``self.short()`` is equivalent to ``self.to(torch.int16)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "take",
+    r"""
+take(indices) -> Tensor
+
+See :func:`torch.take`
+""",
+)
+
+add_docstr_all(
+    "take_along_dim",
+    r"""
+take_along_dim(indices, dim) -> Tensor
+
+See :func:`torch.take_along_dim`
+""",
+)
+
+add_docstr_all(
+    "tan",
+    r"""
+tan() -> Tensor
+
+See :func:`torch.tan`
+""",
+)
+
+add_docstr_all(
+    "tan_",
+    r"""
+tan_() -> Tensor
+
+In-place version of :meth:`~Tensor.tan`
+""",
+)
+
+add_docstr_all(
+    "tanh",
+    r"""
+tanh() -> Tensor
+
+See :func:`torch.tanh`
+""",
+)
+
+add_docstr_all(
+    "softmax",
+    r"""
+softmax(dim) -> Tensor
+
+Alias for :func:`torch.nn.functional.softmax`.
+""",
+)
+
+add_docstr_all(
+    "tanh_",
+    r"""
+tanh_() -> Tensor
+
+In-place version of :meth:`~Tensor.tanh`
+""",
+)
+
+add_docstr_all(
+    "tolist",
+    r"""
+tolist() -> list or number
+
+Returns the tensor as a (nested) list. For scalars, a standard
+Python number is returned, just like with :meth:`~Tensor.item`.
+Tensors are automatically moved to the CPU first if necessary.
+
+This operation is not differentiable.
+
+Examples::
+
+    >>> a = torch.randn(2, 2)
+    >>> a.tolist()
+    [[0.012766935862600803, 0.5415473580360413],
+     [-0.08909505605697632, 0.7729271650314331]]
+    >>> a[0,0].tolist()
+    0.012766935862600803
+""",
+)
+
+add_docstr_all(
+    "topk",
+    r"""
+topk(k, dim=None, largest=True, sorted=True) -> (Tensor, LongTensor)
+
+See :func:`torch.topk`
+""",
+)
+
+add_docstr_all(
+    "to_dense",
+    r"""
+to_dense(dtype=None, *, masked_grad=True) -> Tensor
+
+Creates a strided copy of :attr:`self` if :attr:`self` is not a strided tensor, otherwise returns :attr:`self`.
+
+Keyword args:
+    {dtype}
+    masked_grad (bool, optional): If set to ``True`` (default) and
+      :attr:`self` has a sparse layout then the backward of
+      :meth:`to_dense` returns ``grad.sparse_mask(self)``.
+
+Example::
+
+    >>> s = torch.sparse_coo_tensor(
+    ...        torch.tensor([[1, 1],
+    ...                      [0, 2]]),
+    ...        torch.tensor([9, 10]),
+    ...        size=(3, 3))
+    >>> s.to_dense()
+    tensor([[ 0,  0,  0],
+            [ 9,  0, 10],
+            [ 0,  0,  0]])
+""",
+)
+
+add_docstr_all(
+    "to_sparse",
+    r"""
+to_sparse(sparseDims) -> Tensor
+
+Returns a sparse copy of the tensor.  PyTorch supports sparse tensors in
+:ref:`coordinate format <sparse-coo-docs>`.
+
+Args:
+    sparseDims (int, optional): the number of sparse dimensions to include in the new sparse tensor
+
+Example::
+
+    >>> d = torch.tensor([[0, 0, 0], [9, 0, 10], [0, 0, 0]])
+    >>> d
+    tensor([[ 0,  0,  0],
+            [ 9,  0, 10],
+            [ 0,  0,  0]])
+    >>> d.to_sparse()
+    tensor(indices=tensor([[1, 1],
+                           [0, 2]]),
+           values=tensor([ 9, 10]),
+           size=(3, 3), nnz=2, layout=torch.sparse_coo)
+    >>> d.to_sparse(1)
+    tensor(indices=tensor([[1]]),
+           values=tensor([[ 9,  0, 10]]),
+           size=(3, 3), nnz=1, layout=torch.sparse_coo)
+
+.. method:: to_sparse(*, layout=None, blocksize=None, dense_dim=None) -> Tensor
+   :noindex:
+
+Returns a sparse tensor with the specified layout and blocksize.  If
+the :attr:`self` is strided, the number of dense dimensions could be
+specified, and a hybrid sparse tensor will be created, with
+`dense_dim` dense dimensions and `self.dim() - 2 - dense_dim` batch
+dimension.
+
+.. note:: If the :attr:`self` layout and blocksize parameters match
+          with the specified layout and blocksize, return
+          :attr:`self`. Otherwise, return a sparse tensor copy of
+          :attr:`self`.
+
+Args:
+
+    layout (:class:`torch.layout`, optional): The desired sparse
+      layout. One of ``torch.sparse_coo``, ``torch.sparse_csr``,
+      ``torch.sparse_csc``, ``torch.sparse_bsr``, or
+      ``torch.sparse_bsc``. Default: if ``None``,
+      ``torch.sparse_coo``.
+
+    blocksize (list, tuple, :class:`torch.Size`, optional): Block size
+      of the resulting BSR or BSC tensor. For other layouts,
+      specifying the block size that is not ``None`` will result in a
+      RuntimeError exception.  A block size must be a tuple of length
+      two such that its items evenly divide the two sparse dimensions.
+
+    dense_dim (int, optional): Number of dense dimensions of the
+      resulting CSR, CSC, BSR or BSC tensor.  This argument should be
+      used only if :attr:`self` is a strided tensor, and must be a
+      value between 0 and dimension of :attr:`self` tensor minus two.
+
+Example::
+
+    >>> x = torch.tensor([[1, 0], [0, 0], [2, 3]])
+    >>> x.to_sparse(layout=torch.sparse_coo)
+    tensor(indices=tensor([[0, 2, 2],
+                           [0, 0, 1]]),
+           values=tensor([1, 2, 3]),
+           size=(3, 2), nnz=3, layout=torch.sparse_coo)
+    >>> x.to_sparse(layout=torch.sparse_bsr, blocksize=(1, 2))
+    tensor(crow_indices=tensor([0, 1, 1, 2]),
+           col_indices=tensor([0, 0]),
+           values=tensor([[[1, 0]],
+                          [[2, 3]]]), size=(3, 2), nnz=2, layout=torch.sparse_bsr)
+    >>> x.to_sparse(layout=torch.sparse_bsr, blocksize=(2, 1))
+    RuntimeError: Tensor size(-2) 3 needs to be divisible by blocksize[0] 2
+    >>> x.to_sparse(layout=torch.sparse_csr, blocksize=(3, 1))
+    RuntimeError: to_sparse for Strided to SparseCsr conversion does not use specified blocksize
+
+    >>> x = torch.tensor([[[1], [0]], [[0], [0]], [[2], [3]]])
+    >>> x.to_sparse(layout=torch.sparse_csr, dense_dim=1)
+    tensor(crow_indices=tensor([0, 1, 1, 3]),
+           col_indices=tensor([0, 0, 1]),
+           values=tensor([[1],
+                          [2],
+                          [3]]), size=(3, 2, 1), nnz=3, layout=torch.sparse_csr)
+
+""",
+)
+
+add_docstr_all(
+    "to_sparse_csr",
+    r"""
+to_sparse_csr(dense_dim=None) -> Tensor
+
+Convert a tensor to compressed row storage format (CSR).  Except for
+strided tensors, only works with 2D tensors.  If the :attr:`self` is
+strided, then the number of dense dimensions could be specified, and a
+hybrid CSR tensor will be created, with `dense_dim` dense dimensions
+and `self.dim() - 2 - dense_dim` batch dimension.
+
+Args:
+
+    dense_dim (int, optional): Number of dense dimensions of the
+      resulting CSR tensor.  This argument should be used only if
+      :attr:`self` is a strided tensor, and must be a value between 0
+      and dimension of :attr:`self` tensor minus two.
+
+Example::
+
+    >>> dense = torch.randn(5, 5)
+    >>> sparse = dense.to_sparse_csr()
+    >>> sparse._nnz()
+    25
+
+    >>> dense = torch.zeros(3, 3, 1, 1)
+    >>> dense[0, 0] = dense[1, 2] = dense[2, 1] = 1
+    >>> dense.to_sparse_csr(dense_dim=2)
+    tensor(crow_indices=tensor([0, 1, 2, 3]),
+           col_indices=tensor([0, 2, 1]),
+           values=tensor([[[1.]],
+
+                          [[1.]],
+
+                          [[1.]]]), size=(3, 3, 1, 1), nnz=3,
+           layout=torch.sparse_csr)
+
+""",
+)
+
+add_docstr_all(
+    "to_sparse_csc",
+    r"""
+to_sparse_csc() -> Tensor
+
+Convert a tensor to compressed column storage (CSC) format.  Except
+for strided tensors, only works with 2D tensors.  If the :attr:`self`
+is strided, then the number of dense dimensions could be specified,
+and a hybrid CSC tensor will be created, with `dense_dim` dense
+dimensions and `self.dim() - 2 - dense_dim` batch dimension.
+
+Args:
+
+    dense_dim (int, optional): Number of dense dimensions of the
+      resulting CSC tensor.  This argument should be used only if
+      :attr:`self` is a strided tensor, and must be a value between 0
+      and dimension of :attr:`self` tensor minus two.
+
+Example::
+
+    >>> dense = torch.randn(5, 5)
+    >>> sparse = dense.to_sparse_csc()
+    >>> sparse._nnz()
+    25
+
+    >>> dense = torch.zeros(3, 3, 1, 1)
+    >>> dense[0, 0] = dense[1, 2] = dense[2, 1] = 1
+    >>> dense.to_sparse_csc(dense_dim=2)
+    tensor(ccol_indices=tensor([0, 1, 2, 3]),
+           row_indices=tensor([0, 2, 1]),
+           values=tensor([[[1.]],
+
+                          [[1.]],
+
+                          [[1.]]]), size=(3, 3, 1, 1), nnz=3,
+           layout=torch.sparse_csc)
+
+""",
+)
+
+add_docstr_all(
+    "to_sparse_bsr",
+    r"""
+to_sparse_bsr(blocksize, dense_dim) -> Tensor
+
+Convert a tensor to a block sparse row (BSR) storage format of given
+blocksize.  If the :attr:`self` is strided, then the number of dense
+dimensions could be specified, and a hybrid BSR tensor will be
+created, with `dense_dim` dense dimensions and `self.dim() - 2 -
+dense_dim` batch dimension.
+
+Args:
+
+    blocksize (list, tuple, :class:`torch.Size`, optional): Block size
+      of the resulting BSR tensor. A block size must be a tuple of
+      length two such that its items evenly divide the two sparse
+      dimensions.
+
+    dense_dim (int, optional): Number of dense dimensions of the
+      resulting BSR tensor.  This argument should be used only if
+      :attr:`self` is a strided tensor, and must be a value between 0
+      and dimension of :attr:`self` tensor minus two.
+
+Example::
+
+    >>> dense = torch.randn(10, 10)
+    >>> sparse = dense.to_sparse_csr()
+    >>> sparse_bsr = sparse.to_sparse_bsr((5, 5))
+    >>> sparse_bsr.col_indices()
+    tensor([0, 1, 0, 1])
+
+    >>> dense = torch.zeros(4, 3, 1)
+    >>> dense[0:2, 0] = dense[0:2, 2] = dense[2:4, 1] = 1
+    >>> dense.to_sparse_bsr((2, 1), 1)
+    tensor(crow_indices=tensor([0, 2, 3]),
+           col_indices=tensor([0, 2, 1]),
+           values=tensor([[[[1.]],
+
+                           [[1.]]],
+
+
+                          [[[1.]],
+
+                           [[1.]]],
+
+
+                          [[[1.]],
+
+                           [[1.]]]]), size=(4, 3, 1), nnz=3,
+           layout=torch.sparse_bsr)
+
+""",
+)
+
+add_docstr_all(
+    "to_sparse_bsc",
+    r"""
+to_sparse_bsc(blocksize, dense_dim) -> Tensor
+
+Convert a tensor to a block sparse column (BSC) storage format of
+given blocksize.  If the :attr:`self` is strided, then the number of
+dense dimensions could be specified, and a hybrid BSC tensor will be
+created, with `dense_dim` dense dimensions and `self.dim() - 2 -
+dense_dim` batch dimension.
+
+Args:
+
+    blocksize (list, tuple, :class:`torch.Size`, optional): Block size
+      of the resulting BSC tensor. A block size must be a tuple of
+      length two such that its items evenly divide the two sparse
+      dimensions.
+
+    dense_dim (int, optional): Number of dense dimensions of the
+      resulting BSC tensor.  This argument should be used only if
+      :attr:`self` is a strided tensor, and must be a value between 0
+      and dimension of :attr:`self` tensor minus two.
+
+Example::
+
+    >>> dense = torch.randn(10, 10)
+    >>> sparse = dense.to_sparse_csr()
+    >>> sparse_bsc = sparse.to_sparse_bsc((5, 5))
+    >>> sparse_bsc.row_indices()
+    tensor([0, 1, 0, 1])
+
+    >>> dense = torch.zeros(4, 3, 1)
+    >>> dense[0:2, 0] = dense[0:2, 2] = dense[2:4, 1] = 1
+    >>> dense.to_sparse_bsc((2, 1), 1)
+    tensor(ccol_indices=tensor([0, 1, 2, 3]),
+           row_indices=tensor([0, 1, 0]),
+           values=tensor([[[[1.]],
+
+                           [[1.]]],
+
+
+                          [[[1.]],
+
+                           [[1.]]],
+
+
+                          [[[1.]],
+
+                           [[1.]]]]), size=(4, 3, 1), nnz=3,
+           layout=torch.sparse_bsc)
+
+""",
+)
+
+add_docstr_all(
+    "to_mkldnn",
+    r"""
+to_mkldnn() -> Tensor
+Returns a copy of the tensor in ``torch.mkldnn`` layout.
+
+""",
+)
+
+add_docstr_all(
+    "trace",
+    r"""
+trace() -> Tensor
+
+See :func:`torch.trace`
+""",
+)
+
+add_docstr_all(
+    "transpose",
+    r"""
+transpose(dim0, dim1) -> Tensor
+
+See :func:`torch.transpose`
+""",
+)
+
+add_docstr_all(
+    "transpose_",
+    r"""
+transpose_(dim0, dim1) -> Tensor
+
+In-place version of :meth:`~Tensor.transpose`
+""",
+)
+
+add_docstr_all(
+    "triangular_solve",
+    r"""
+triangular_solve(A, upper=True, transpose=False, unitriangular=False) -> (Tensor, Tensor)
+
+See :func:`torch.triangular_solve`
+""",
+)
+
+add_docstr_all(
+    "tril",
+    r"""
+tril(diagonal=0) -> Tensor
+
+See :func:`torch.tril`
+""",
+)
+
+add_docstr_all(
+    "tril_",
+    r"""
+tril_(diagonal=0) -> Tensor
+
+In-place version of :meth:`~Tensor.tril`
+""",
+)
+
+add_docstr_all(
+    "triu",
+    r"""
+triu(diagonal=0) -> Tensor
+
+See :func:`torch.triu`
+""",
+)
+
+add_docstr_all(
+    "triu_",
+    r"""
+triu_(diagonal=0) -> Tensor
+
+In-place version of :meth:`~Tensor.triu`
+""",
+)
+
+add_docstr_all(
+    "true_divide",
+    r"""
+true_divide(value) -> Tensor
+
+See :func:`torch.true_divide`
+""",
+)
+
+add_docstr_all(
+    "true_divide_",
+    r"""
+true_divide_(value) -> Tensor
+
+In-place version of :meth:`~Tensor.true_divide_`
+""",
+)
+
+add_docstr_all(
+    "trunc",
+    r"""
+trunc() -> Tensor
+
+See :func:`torch.trunc`
+""",
+)
+
+add_docstr_all(
+    "fix",
+    r"""
+fix() -> Tensor
+
+See :func:`torch.fix`.
+""",
+)
+
+add_docstr_all(
+    "trunc_",
+    r"""
+trunc_() -> Tensor
+
+In-place version of :meth:`~Tensor.trunc`
+""",
+)
+
+add_docstr_all(
+    "fix_",
+    r"""
+fix_() -> Tensor
+
+In-place version of :meth:`~Tensor.fix`
+""",
+)
+
+add_docstr_all(
+    "type",
+    r"""
+type(dtype=None, non_blocking=False, **kwargs) -> str or Tensor
+Returns the type if `dtype` is not provided, else casts this object to
+the specified type.
+
+If this is already of the correct type, no copy is performed and the
+original object is returned.
+
+Args:
+    dtype (dtype or string): The desired type
+    non_blocking (bool): If ``True``, and the source is in pinned memory
+        and destination is on the GPU or vice versa, the copy is performed
+        asynchronously with respect to the host. Otherwise, the argument
+        has no effect.
+    **kwargs: For compatibility, may contain the key ``async`` in place of
+        the ``non_blocking`` argument. The ``async`` arg is deprecated.
+""",
+)
+
+add_docstr_all(
+    "type_as",
+    r"""
+type_as(tensor) -> Tensor
+
+Returns this tensor cast to the type of the given tensor.
+
+This is a no-op if the tensor is already of the correct type. This is
+equivalent to ``self.type(tensor.type())``
+
+Args:
+    tensor (Tensor): the tensor which has the desired type
+""",
+)
+
+add_docstr_all(
+    "unfold",
+    r"""
+unfold(dimension, size, step) -> Tensor
+
+Returns a view of the original tensor which contains all slices of size :attr:`size` from
+:attr:`self` tensor in the dimension :attr:`dimension`.
+
+Step between two slices is given by :attr:`step`.
+
+If `sizedim` is the size of dimension :attr:`dimension` for :attr:`self`, the size of
+dimension :attr:`dimension` in the returned tensor will be
+`(sizedim - size) / step + 1`.
+
+An additional dimension of size :attr:`size` is appended in the returned tensor.
+
+Args:
+    dimension (int): dimension in which unfolding happens
+    size (int): the size of each slice that is unfolded
+    step (int): the step between each slice
+
+Example::
+
+    >>> x = torch.arange(1., 8)
+    >>> x
+    tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.])
+    >>> x.unfold(0, 2, 1)
+    tensor([[ 1.,  2.],
+            [ 2.,  3.],
+            [ 3.,  4.],
+            [ 4.,  5.],
+            [ 5.,  6.],
+            [ 6.,  7.]])
+    >>> x.unfold(0, 2, 2)
+    tensor([[ 1.,  2.],
+            [ 3.,  4.],
+            [ 5.,  6.]])
+""",
+)
+
+add_docstr_all(
+    "uniform_",
+    r"""
+uniform_(from=0, to=1, *, generator=None) -> Tensor
+
+Fills :attr:`self` tensor with numbers sampled from the continuous uniform
+distribution:
+
+.. math::
+    f(x) = \dfrac{1}{\text{to} - \text{from}}
+""",
+)
+
+add_docstr_all(
+    "unsqueeze",
+    r"""
+unsqueeze(dim) -> Tensor
+
+See :func:`torch.unsqueeze`
+""",
+)
+
+add_docstr_all(
+    "unsqueeze_",
+    r"""
+unsqueeze_(dim) -> Tensor
+
+In-place version of :meth:`~Tensor.unsqueeze`
+""",
+)
+
+add_docstr_all(
+    "var",
+    r"""
+var(dim=None, *, correction=1, keepdim=False) -> Tensor
+
+See :func:`torch.var`
+""",
+)
+
+add_docstr_all(
+    "vdot",
+    r"""
+vdot(other) -> Tensor
+
+See :func:`torch.vdot`
+""",
+)
+
+add_docstr_all(
+    "view",
+    r"""
+view(*shape) -> Tensor
+
+Returns a new tensor with the same data as the :attr:`self` tensor but of a
+different :attr:`shape`.
+
+The returned tensor shares the same data and must have the same number
+of elements, but may have a different size. For a tensor to be viewed, the new
+view size must be compatible with its original size and stride, i.e., each new
+view dimension must either be a subspace of an original dimension, or only span
+across original dimensions :math:`d, d+1, \dots, d+k` that satisfy the following
+contiguity-like condition that :math:`\forall i = d, \dots, d+k-1`,
+
+.. math::
+
+  \text{stride}[i] = \text{stride}[i+1] \times \text{size}[i+1]
+
+Otherwise, it will not be possible to view :attr:`self` tensor as :attr:`shape`
+without copying it (e.g., via :meth:`contiguous`). When it is unclear whether a
+:meth:`view` can be performed, it is advisable to use :meth:`reshape`, which
+returns a view if the shapes are compatible, and copies (equivalent to calling
+:meth:`contiguous`) otherwise.
+
+Args:
+    shape (torch.Size or int...): the desired size
+
+Example::
+
+    >>> x = torch.randn(4, 4)
+    >>> x.size()
+    torch.Size([4, 4])
+    >>> y = x.view(16)
+    >>> y.size()
+    torch.Size([16])
+    >>> z = x.view(-1, 8)  # the size -1 is inferred from other dimensions
+    >>> z.size()
+    torch.Size([2, 8])
+
+    >>> a = torch.randn(1, 2, 3, 4)
+    >>> a.size()
+    torch.Size([1, 2, 3, 4])
+    >>> b = a.transpose(1, 2)  # Swaps 2nd and 3rd dimension
+    >>> b.size()
+    torch.Size([1, 3, 2, 4])
+    >>> c = a.view(1, 3, 2, 4)  # Does not change tensor layout in memory
+    >>> c.size()
+    torch.Size([1, 3, 2, 4])
+    >>> torch.equal(b, c)
+    False
+
+
+.. method:: view(dtype) -> Tensor
+   :noindex:
+
+Returns a new tensor with the same data as the :attr:`self` tensor but of a
+different :attr:`dtype`.
+
+If the element size of :attr:`dtype` is different than that of ``self.dtype``,
+then the size of the last dimension of the output will be scaled
+proportionally.  For instance, if :attr:`dtype` element size is twice that of
+``self.dtype``, then each pair of elements in the last dimension of
+:attr:`self` will be combined, and the size of the last dimension of the output
+will be half that of :attr:`self`. If :attr:`dtype` element size is half that
+of ``self.dtype``, then each element in the last dimension of :attr:`self` will
+be split in two, and the size of the last dimension of the output will be
+double that of :attr:`self`. For this to be possible, the following conditions
+must be true:
+
+    * ``self.dim()`` must be greater than 0.
+    * ``self.stride(-1)`` must be 1.
+
+Additionally, if the element size of :attr:`dtype` is greater than that of
+``self.dtype``, the following conditions must be true as well:
+
+    * ``self.size(-1)`` must be divisible by the ratio between the element
+      sizes of the dtypes.
+    * ``self.storage_offset()`` must be divisible by the ratio between the
+      element sizes of the dtypes.
+    * The strides of all dimensions, except the last dimension, must be
+      divisible by the ratio between the element sizes of the dtypes.
+
+If any of the above conditions are not met, an error is thrown.
+
+.. warning::
+
+    This overload is not supported by TorchScript, and using it in a Torchscript
+    program will cause undefined behavior.
+
+
+Args:
+    dtype (:class:`torch.dtype`): the desired dtype
+
+Example::
+
+    >>> x = torch.randn(4, 4)
+    >>> x
+    tensor([[ 0.9482, -0.0310,  1.4999, -0.5316],
+            [-0.1520,  0.7472,  0.5617, -0.8649],
+            [-2.4724, -0.0334, -0.2976, -0.8499],
+            [-0.2109,  1.9913, -0.9607, -0.6123]])
+    >>> x.dtype
+    torch.float32
+
+    >>> y = x.view(torch.int32)
+    >>> y
+    tensor([[ 1064483442, -1124191867,  1069546515, -1089989247],
+            [-1105482831,  1061112040,  1057999968, -1084397505],
+            [-1071760287, -1123489973, -1097310419, -1084649136],
+            [-1101533110,  1073668768, -1082790149, -1088634448]],
+        dtype=torch.int32)
+    >>> y[0, 0] = 1000000000
+    >>> x
+    tensor([[ 0.0047, -0.0310,  1.4999, -0.5316],
+            [-0.1520,  0.7472,  0.5617, -0.8649],
+            [-2.4724, -0.0334, -0.2976, -0.8499],
+            [-0.2109,  1.9913, -0.9607, -0.6123]])
+
+    >>> x.view(torch.cfloat)
+    tensor([[ 0.0047-0.0310j,  1.4999-0.5316j],
+            [-0.1520+0.7472j,  0.5617-0.8649j],
+            [-2.4724-0.0334j, -0.2976-0.8499j],
+            [-0.2109+1.9913j, -0.9607-0.6123j]])
+    >>> x.view(torch.cfloat).size()
+    torch.Size([4, 2])
+
+    >>> x.view(torch.uint8)
+    tensor([[  0, 202, 154,  59, 182, 243, 253, 188, 185, 252, 191,  63, 240,  22,
+               8, 191],
+            [227, 165,  27, 190, 128,  72,  63,  63, 146, 203,  15,  63,  22, 106,
+              93, 191],
+            [205,  59,  30, 192, 112, 206,   8, 189,   7,  95, 152, 190,  12, 147,
+              89, 191],
+            [ 43, 246,  87, 190, 235, 226, 254,  63, 111, 240, 117, 191, 177, 191,
+              28, 191]], dtype=torch.uint8)
+    >>> x.view(torch.uint8).size()
+    torch.Size([4, 16])
+""",
+)
+
+add_docstr_all(
+    "view_as",
+    r"""
+view_as(other) -> Tensor
+
+View this tensor as the same size as :attr:`other`.
+``self.view_as(other)`` is equivalent to ``self.view(other.size())``.
+
+Please see :meth:`~Tensor.view` for more information about ``view``.
+
+Args:
+    other (:class:`torch.Tensor`): The result tensor has the same size
+        as :attr:`other`.
+""",
+)
+
+add_docstr_all(
+    "expand",
+    r"""
+expand(*sizes) -> Tensor
+
+Returns a new view of the :attr:`self` tensor with singleton dimensions expanded
+to a larger size.
+
+Passing -1 as the size for a dimension means not changing the size of
+that dimension.
+
+Tensor can be also expanded to a larger number of dimensions, and the
+new ones will be appended at the front. For the new dimensions, the
+size cannot be set to -1.
+
+Expanding a tensor does not allocate new memory, but only creates a
+new view on the existing tensor where a dimension of size one is
+expanded to a larger size by setting the ``stride`` to 0. Any dimension
+of size 1 can be expanded to an arbitrary value without allocating new
+memory.
+
+Args:
+    *sizes (torch.Size or int...): the desired expanded size
+
+.. warning::
+
+    More than one element of an expanded tensor may refer to a single
+    memory location. As a result, in-place operations (especially ones that
+    are vectorized) may result in incorrect behavior. If you need to write
+    to the tensors, please clone them first.
+
+Example::
+
+    >>> x = torch.tensor([[1], [2], [3]])
+    >>> x.size()
+    torch.Size([3, 1])
+    >>> x.expand(3, 4)
+    tensor([[ 1,  1,  1,  1],
+            [ 2,  2,  2,  2],
+            [ 3,  3,  3,  3]])
+    >>> x.expand(-1, 4)   # -1 means not changing the size of that dimension
+    tensor([[ 1,  1,  1,  1],
+            [ 2,  2,  2,  2],
+            [ 3,  3,  3,  3]])
+""",
+)
+
+add_docstr_all(
+    "expand_as",
+    r"""
+expand_as(other) -> Tensor
+
+Expand this tensor to the same size as :attr:`other`.
+``self.expand_as(other)`` is equivalent to ``self.expand(other.size())``.
+
+Please see :meth:`~Tensor.expand` for more information about ``expand``.
+
+Args:
+    other (:class:`torch.Tensor`): The result tensor has the same size
+        as :attr:`other`.
+""",
+)
+
+add_docstr_all(
+    "sum_to_size",
+    r"""
+sum_to_size(*size) -> Tensor
+
+Sum ``this`` tensor to :attr:`size`.
+:attr:`size` must be broadcastable to ``this`` tensor size.
+
+Args:
+    size (int...): a sequence of integers defining the shape of the output tensor.
+""",
+)
+
+
+add_docstr_all(
+    "zero_",
+    r"""
+zero_() -> Tensor
+
+Fills :attr:`self` tensor with zeros.
+""",
+)
+
+add_docstr_all(
+    "matmul",
+    r"""
+matmul(tensor2) -> Tensor
+
+See :func:`torch.matmul`
+""",
+)
+
+add_docstr_all(
+    "chunk",
+    r"""
+chunk(chunks, dim=0) -> List of Tensors
+
+See :func:`torch.chunk`
+""",
+)
+
+add_docstr_all(
+    "unsafe_chunk",
+    r"""
+unsafe_chunk(chunks, dim=0) -> List of Tensors
+
+See :func:`torch.unsafe_chunk`
+""",
+)
+
+add_docstr_all(
+    "unsafe_split",
+    r"""
+unsafe_split(split_size, dim=0) -> List of Tensors
+
+See :func:`torch.unsafe_split`
+""",
+)
+
+add_docstr_all(
+    "tensor_split",
+    r"""
+tensor_split(indices_or_sections, dim=0) -> List of Tensors
+
+See :func:`torch.tensor_split`
+""",
+)
+
+add_docstr_all(
+    "hsplit",
+    r"""
+hsplit(split_size_or_sections) -> List of Tensors
+
+See :func:`torch.hsplit`
+""",
+)
+
+add_docstr_all(
+    "vsplit",
+    r"""
+vsplit(split_size_or_sections) -> List of Tensors
+
+See :func:`torch.vsplit`
+""",
+)
+
+add_docstr_all(
+    "dsplit",
+    r"""
+dsplit(split_size_or_sections) -> List of Tensors
+
+See :func:`torch.dsplit`
+""",
+)
+
+add_docstr_all(
+    "stft",
+    r"""
+stft(frame_length, hop, fft_size=None, return_onesided=True, window=None, pad_end=0) -> Tensor
+
+See :func:`torch.stft`
+""",
+)
+
+add_docstr_all(
+    "istft",
+    r"""
+istft(n_fft, hop_length=None, win_length=None, window=None,
+ center=True, normalized=False, onesided=True, length=None) -> Tensor
+
+See :func:`torch.istft`
+""",
+)
+
+add_docstr_all(
+    "det",
+    r"""
+det() -> Tensor
+
+See :func:`torch.det`
+""",
+)
+
+add_docstr_all(
+    "where",
+    r"""
+where(condition, y) -> Tensor
+
+``self.where(condition, y)`` is equivalent to ``torch.where(condition, self, y)``.
+See :func:`torch.where`
+""",
+)
+
+add_docstr_all(
+    "logdet",
+    r"""
+logdet() -> Tensor
+
+See :func:`torch.logdet`
+""",
+)
+
+add_docstr_all(
+    "slogdet",
+    r"""
+slogdet() -> (Tensor, Tensor)
+
+See :func:`torch.slogdet`
+""",
+)
+
+add_docstr_all(
+    "unbind",
+    r"""
+unbind(dim=0) -> seq
+
+See :func:`torch.unbind`
+""",
+)
+
+add_docstr_all(
+    "pin_memory",
+    r"""
+pin_memory() -> Tensor
+
+Copies the tensor to pinned memory, if it's not already pinned.
+""",
+)
+
+add_docstr_all(
+    "pinverse",
+    r"""
+pinverse() -> Tensor
+
+See :func:`torch.pinverse`
+""",
+)
+
+add_docstr_all(
+    "index_add",
+    r"""
+index_add(dim, index, source, *, alpha=1) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.index_add_`.
+""",
+)
+
+add_docstr_all(
+    "index_copy",
+    r"""
+index_copy(dim, index, tensor2) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.index_copy_`.
+""",
+)
+
+add_docstr_all(
+    "index_fill",
+    r"""
+index_fill(dim, index, value) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.index_fill_`.
+""",
+)
+
+add_docstr_all(
+    "scatter",
+    r"""
+scatter(dim, index, src) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.scatter_`
+""",
+)
+
+add_docstr_all(
+    "scatter_add",
+    r"""
+scatter_add(dim, index, src) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.scatter_add_`
+""",
+)
+
+add_docstr_all(
+    "scatter_reduce",
+    r"""
+scatter_reduce(dim, index, src, reduce, *, include_self=True) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.scatter_reduce_`
+""",
+)
+
+add_docstr_all(
+    "masked_scatter",
+    r"""
+masked_scatter(mask, tensor) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.masked_scatter_`
+
+.. note::
+
+    The inputs :attr:`self` and :attr:`mask`
+    :ref:`broadcast <broadcasting-semantics>`.
+
+Example:
+
+    >>> self = torch.tensor([0, 0, 0, 0, 0])
+    >>> mask = torch.tensor([[0, 0, 0, 1, 1], [1, 1, 0, 1, 1]])
+    >>> source = torch.tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])
+    >>> self.masked_scatter(mask, source)
+    tensor([[0, 0, 0, 0, 1],
+            [2, 3, 0, 4, 5]])
+
+""",
+)
+
+add_docstr_all(
+    "xlogy",
+    r"""
+xlogy(other) -> Tensor
+
+See :func:`torch.xlogy`
+""",
+)
+
+add_docstr_all(
+    "xlogy_",
+    r"""
+xlogy_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.xlogy`
+""",
+)
+
+add_docstr_all(
+    "masked_fill",
+    r"""
+masked_fill(mask, value) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.masked_fill_`
+""",
+)
+
+add_docstr_all(
+    "grad",
+    r"""
+This attribute is ``None`` by default and becomes a Tensor the first time a call to
+:func:`backward` computes gradients for ``self``.
+The attribute will then contain the gradients computed and future calls to
+:func:`backward` will accumulate (add) gradients into it.
+""",
+)
+
+add_docstr_all(
+    "retain_grad",
+    r"""
+retain_grad() -> None
+
+Enables this Tensor to have its :attr:`grad` populated during
+:func:`backward`. This is a no-op for leaf tensors.
+""",
+)
+
+add_docstr_all(
+    "retains_grad",
+    r"""
+Is ``True`` if this Tensor is non-leaf and its :attr:`grad` is enabled to be
+populated during :func:`backward`, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "requires_grad",
+    r"""
+Is ``True`` if gradients need to be computed for this Tensor, ``False`` otherwise.
+
+.. note::
+
+    The fact that gradients need to be computed for a Tensor does not mean that the :attr:`grad`
+    attribute will be populated, see :attr:`is_leaf` for more details.
+
+""",
+)
+
+add_docstr_all(
+    "is_leaf",
+    r"""
+All Tensors that have :attr:`requires_grad` which is ``False`` will be leaf Tensors by convention.
+
+For Tensors that have :attr:`requires_grad` which is ``True``, they will be leaf Tensors if they were
+created by the user. This means that they are not the result of an operation and so
+:attr:`grad_fn` is None.
+
+Only leaf Tensors will have their :attr:`grad` populated during a call to :func:`backward`.
+To get :attr:`grad` populated for non-leaf Tensors, you can use :func:`retain_grad`.
+
+Example::
+
+    >>> a = torch.rand(10, requires_grad=True)
+    >>> a.is_leaf
+    True
+    >>> b = torch.rand(10, requires_grad=True).cuda()
+    >>> b.is_leaf
+    False
+    # b was created by the operation that cast a cpu Tensor into a cuda Tensor
+    >>> c = torch.rand(10, requires_grad=True) + 2
+    >>> c.is_leaf
+    False
+    # c was created by the addition operation
+    >>> d = torch.rand(10).cuda()
+    >>> d.is_leaf
+    True
+    # d does not require gradients and so has no operation creating it (that is tracked by the autograd engine)
+    >>> e = torch.rand(10).cuda().requires_grad_()
+    >>> e.is_leaf
+    True
+    # e requires gradients and has no operations creating it
+    >>> f = torch.rand(10, requires_grad=True, device="cuda")
+    >>> f.is_leaf
+    True
+    # f requires grad, has no operation creating it
+
+
+""",
+)
+
+add_docstr_all(
+    "names",
+    r"""
+Stores names for each of this tensor's dimensions.
+
+``names[idx]`` corresponds to the name of tensor dimension ``idx``.
+Names are either a string if the dimension is named or ``None`` if the
+dimension is unnamed.
+
+Dimension names may contain characters or underscore. Furthermore, a dimension
+name must be a valid Python variable name (i.e., does not start with underscore).
+
+Tensors may not have two named dimensions with the same name.
+
+.. warning::
+    The named tensor API is experimental and subject to change.
+
+""",
+)
+
+add_docstr_all(
+    "is_cuda",
+    r"""
+Is ``True`` if the Tensor is stored on the GPU, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_cpu",
+    r"""
+Is ``True`` if the Tensor is stored on the CPU, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_xla",
+    r"""
+Is ``True`` if the Tensor is stored on an XLA device, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_ipu",
+    r"""
+Is ``True`` if the Tensor is stored on the IPU, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_xpu",
+    r"""
+Is ``True`` if the Tensor is stored on the XPU, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_quantized",
+    r"""
+Is ``True`` if the Tensor is quantized, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_meta",
+    r"""
+Is ``True`` if the Tensor is a meta tensor, ``False`` otherwise.  Meta tensors
+are like normal tensors, but they carry no data.
+""",
+)
+
+add_docstr_all(
+    "is_mps",
+    r"""
+Is ``True`` if the Tensor is stored on the MPS device, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_sparse",
+    r"""
+Is ``True`` if the Tensor uses sparse COO storage layout, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_sparse_csr",
+    r"""
+Is ``True`` if the Tensor uses sparse CSR storage layout, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "device",
+    r"""
+Is the :class:`torch.device` where this Tensor is.
+""",
+)
+
+add_docstr_all(
+    "ndim",
+    r"""
+Alias for :meth:`~Tensor.dim()`
+""",
+)
+
+add_docstr_all(
+    "itemsize",
+    r"""
+Alias for :meth:`~Tensor.element_size()`
+""",
+)
+
+add_docstr_all(
+    "nbytes",
+    r"""
+Returns the number of bytes consumed by the "view" of elements of the Tensor
+if the Tensor does not use sparse storage layout.
+Defined to be :meth:`~Tensor.numel()` * :meth:`~Tensor.element_size()`
+""",
+)
+
+add_docstr_all(
+    "T",
+    r"""
+Returns a view of this tensor with its dimensions reversed.
+
+If ``n`` is the number of dimensions in ``x``,
+``x.T`` is equivalent to ``x.permute(n-1, n-2, ..., 0)``.
+
+.. warning::
+    The use of :func:`Tensor.T` on tensors of dimension other than 2 to reverse their shape
+    is deprecated and it will throw an error in a future release. Consider :attr:`~.Tensor.mT`
+    to transpose batches of matrices or `x.permute(*torch.arange(x.ndim - 1, -1, -1))` to reverse
+    the dimensions of a tensor.
+""",
+)
+
+add_docstr_all(
+    "H",
+    r"""
+Returns a view of a matrix (2-D tensor) conjugated and transposed.
+
+``x.H`` is equivalent to ``x.transpose(0, 1).conj()`` for complex matrices and
+``x.transpose(0, 1)`` for real matrices.
+
+.. seealso::
+
+        :attr:`~.Tensor.mH`: An attribute that also works on batches of matrices.
+""",
+)
+
+add_docstr_all(
+    "mT",
+    r"""
+Returns a view of this tensor with the last two dimensions transposed.
+
+``x.mT`` is equivalent to ``x.transpose(-2, -1)``.
+""",
+)
+
+add_docstr_all(
+    "mH",
+    r"""
+Accessing this property is equivalent to calling :func:`adjoint`.
+""",
+)
+
+add_docstr_all(
+    "adjoint",
+    r"""
+adjoint() -> Tensor
+
+Alias for :func:`torch.adjoint`
+""",
+)
+
+add_docstr_all(
+    "real",
+    r"""
+Returns a new tensor containing real values of the :attr:`self` tensor for a complex-valued input tensor.
+The returned tensor and :attr:`self` share the same underlying storage.
+
+Returns :attr:`self` if :attr:`self` is a real-valued tensor.
+
+Example::
+    >>> x=torch.randn(4, dtype=torch.cfloat)
+    >>> x
+    tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)])
+    >>> x.real
+    tensor([ 0.3100, -0.5445, -1.6492, -0.0638])
+
+""",
+)
+
+add_docstr_all(
+    "imag",
+    r"""
+Returns a new tensor containing imaginary values of the :attr:`self` tensor.
+The returned tensor and :attr:`self` share the same underlying storage.
+
+.. warning::
+    :func:`imag` is only supported for tensors with complex dtypes.
+
+Example::
+    >>> x=torch.randn(4, dtype=torch.cfloat)
+    >>> x
+    tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)])
+    >>> x.imag
+    tensor([ 0.3553, -0.7896, -0.0633, -0.8119])
+
+""",
+)
+
+add_docstr_all(
+    "as_subclass",
+    r"""
+as_subclass(cls) -> Tensor
+
+Makes a ``cls`` instance with the same data pointer as ``self``. Changes
+in the output mirror changes in ``self``, and the output stays attached
+to the autograd graph. ``cls`` must be a subclass of ``Tensor``.
+""",
+)
+
+add_docstr_all(
+    "crow_indices",
+    r"""
+crow_indices() -> IntTensor
+
+Returns the tensor containing the compressed row indices of the :attr:`self`
+tensor when :attr:`self` is a sparse CSR tensor of layout ``sparse_csr``.
+The ``crow_indices`` tensor is strictly of shape (:attr:`self`.size(0) + 1)
+and of type ``int32`` or ``int64``. When using MKL routines such as sparse
+matrix multiplication, it is necessary to use ``int32`` indexing in order
+to avoid downcasting and potentially losing information.
+
+Example::
+    >>> csr = torch.eye(5,5).to_sparse_csr()
+    >>> csr.crow_indices()
+    tensor([0, 1, 2, 3, 4, 5], dtype=torch.int32)
+
+""",
+)
+
+add_docstr_all(
+    "col_indices",
+    r"""
+col_indices() -> IntTensor
+
+Returns the tensor containing the column indices of the :attr:`self`
+tensor when :attr:`self` is a sparse CSR tensor of layout ``sparse_csr``.
+The ``col_indices`` tensor is strictly of shape (:attr:`self`.nnz())
+and of type ``int32`` or ``int64``.  When using MKL routines such as sparse
+matrix multiplication, it is necessary to use ``int32`` indexing in order
+to avoid downcasting and potentially losing information.
+
+Example::
+    >>> csr = torch.eye(5,5).to_sparse_csr()
+    >>> csr.col_indices()
+    tensor([0, 1, 2, 3, 4], dtype=torch.int32)
+
+""",
+)
+
+add_docstr_all(
+    "to_padded_tensor",
+    r"""
+to_padded_tensor(padding, output_size=None) -> Tensor
+See :func:`to_padded_tensor`
+""",
+)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..409dc8e42c59f53e637fb2e0e1cd19ac9e9163ef
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_utils.py
@@ -0,0 +1,937 @@
+import copyreg
+import functools
+import sys
+import traceback
+import warnings
+from collections import defaultdict
+from typing import Any, DefaultDict, List, Optional
+
+import torch
+
+
+def _type(self, dtype=None, non_blocking=False, **kwargs):
+    """Returns the type if `dtype` is not provided, else casts this object to
+    the specified type.
+
+    If this is already of the correct type, no copy is performed and the
+    original object is returned.
+
+    Args:
+        dtype (type or string): The desired type
+        non_blocking (bool): If ``True``, and the source is in pinned memory
+            and destination is on the GPU or vice versa, the copy is performed
+            asynchronously with respect to the host. Otherwise, the argument
+            has no effect.
+        **kwargs: For compatibility, may contain the key ``async`` in place of
+            the ``non_blocking`` argument. The ``async`` arg is deprecated.
+    """
+    non_blocking = _get_async_or_non_blocking("type", non_blocking, kwargs)
+    if dtype is None:
+        return self.__module__ + "." + self.__class__.__name__
+
+    if isinstance(dtype, str):
+        dtype = _import_dotted_name(dtype)
+    if dtype == type(self):
+        return self
+    if self.is_sparse:
+        if not dtype.is_sparse:
+            raise RuntimeError("Cannot cast sparse tensor to dense tensor")
+        new_module_name = dtype.__module__.replace(".sparse", "")
+        new_values_type_name = new_module_name + "." + dtype.__name__
+        new_values = torch.Tensor._values(self).type(new_values_type_name, non_blocking)
+        new_indices_type_name = new_module_name + ".LongTensor"
+        new_indices = torch.Tensor._indices(self).type(
+            new_indices_type_name, non_blocking
+        )
+        return dtype(new_indices, new_values, self.size())
+    if dtype.is_sparse:
+        raise RuntimeError("Cannot cast dense tensor to sparse tensor")
+    return dtype(self.size()).copy_(self, non_blocking)
+
+
+def _hpu(self, device=None, non_blocking=False, **kwargs):
+    """Returns a copy of this object in HPU memory.
+
+    If this object is already in HPU memory and on the correct device, then
+    no copy is performed and the original object is returned.
+
+    Args:
+        device (int): The destination HPU id. Defaults to the current device.
+        non_blocking (bool): If ``True`` and the source is in pinned memory,
+            the copy will be asynchronous with respect to the host. Otherwise,
+            the argument has no effect.
+        **kwargs: For compatibility, may contain the key ``async`` in place of
+            the ``non_blocking`` argument.
+    """
+    non_blocking = _get_async_or_non_blocking("hpu", non_blocking, kwargs)
+    hpu = getattr(torch, "hpu", None)
+    assert hpu is not None, "HPU device module is not loaded"
+    if self.is_hpu:
+        if device is None:
+            device = hpu.current_device()
+        if self.get_device() == device:
+            return self
+    else:
+        if device is None:
+            device = -1
+    with hpu.device(device):
+        assert not self.is_sparse, "sparse storage is not supported for HPU tensors"
+        untyped_storage = torch.UntypedStorage(self.size(), device=torch.device("hpu"))
+        untyped_storage.copy_(self, non_blocking)
+        return untyped_storage
+
+
+def _cuda(self, device=None, non_blocking=False, **kwargs):
+    """Returns a copy of this object in CUDA memory.
+
+    If this object is already in CUDA memory and on the correct device, then
+    no copy is performed and the original object is returned.
+
+    Args:
+        device (int): The destination GPU id. Defaults to the current device.
+        non_blocking (bool): If ``True`` and the source is in pinned memory,
+            the copy will be asynchronous with respect to the host. Otherwise,
+            the argument has no effect.
+        **kwargs: For compatibility, may contain the key ``async`` in place of
+            the ``non_blocking`` argument.
+    """
+    non_blocking = _get_async_or_non_blocking("cuda", non_blocking, kwargs)
+    if self.is_cuda:
+        if device is None:
+            device = torch.cuda.current_device()
+        if self.get_device() == device:
+            return self
+    else:
+        if device is None:
+            device = -1
+    with torch.cuda.device(device):
+        if self.is_sparse:
+            new_type = getattr(torch.cuda.sparse, self.__class__.__name__)
+            indices = torch.Tensor._indices(self).cuda(device, non_blocking)
+            values = torch.Tensor._values(self).cuda(device, non_blocking)
+            return new_type(indices, values, self.size())
+        else:
+            untyped_storage = torch.UntypedStorage(
+                self.size(), device=torch.device("cuda")
+            )
+            untyped_storage.copy_(self, non_blocking)
+            return untyped_storage
+
+
+def _get_async_or_non_blocking(function_name, non_blocking, kwargs):
+    """Return the non-blocking flag given the function name and kwargs.
+
+    Args:
+        function_name (str): the name of the function being used.
+        non_blocking (bool): the default value.
+        **kwargs (dict): the kwargs passed to the function.
+    """
+    if not kwargs:
+        return non_blocking
+    if len(kwargs) != 1 or "async" not in kwargs:
+        message = "{}() got an unexpected keyword argument '{}'"
+        argument = list(kwargs.keys()).pop()
+        raise TypeError(message.format(function_name, argument))
+    warnings.warn("'async' is deprecated; use 'non_blocking'")
+    return kwargs["async"]
+
+
+# Note [Don't serialize hooks]
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Since time immemorial, we have serialized the backward hooks associated with
+# variables.  This kind of half-worked--Python can pickle global functions
+# (but not closures!)--but there were problems.
+#
+#   - It's fragile.  If you serialize a backward hook into a saved
+#     model, and then you rename the function associated with the hook,
+#     now your saved model is broken and you can't load it anymore.
+#
+#   - It's not actually used.  The standard recommendation is to
+#     serialize the *state_dict* of a model, not the model itself
+#     (since this is more stable to code changes affecting the model
+#     serialization), and the state dict saves "data" only, thus
+#     stripping the backward hooks.  In some cases, hooks are
+#     essential to the well-functioning of a model (e.g., DDP),
+#     but DDP already manages readding the hooks!
+#
+#   - We didn't serialize them in many cases.  Prior to #10220, we
+#     were dropping backward hooks in ForkingPickler.  We "fixed" this
+#     to be convenient with other serialization sites, but lack of
+#     serializing backward hooks wasn't actually the root cause of
+#     the bug.
+#
+# With these cases in mind, we have decided that a better strategy
+# is to just NOT serialize hooks at all.
+#
+# Since this is a BC-breaking change, we should warn when we previously
+# serialized a hook, but no longer do so. This will be done by adding a special
+# sentinel property to hooks that will be used to suppress this warning. If a hook
+# has the property _torch_serialize_ignore, we will not emit a warning if we
+# attempt to serialize a Tensor with this hook attached to it.
+#
+# By the way, when _backward_hooks is skipped, we must give an EMPTY
+# OrderedDict(); if you pass None, you'll run afoul of #12219.
+
+
+# TODO: Once we decide to break serialization FC, `storage` no longer needs to
+# be a TypedStorage
+def _rebuild_tensor(storage, storage_offset, size, stride):
+    # first construct a tensor with the correct dtype/device
+    t = torch.empty((0,), dtype=storage.dtype, device=storage._untyped_storage.device)
+    return t.set_(storage._untyped_storage, storage_offset, size, stride)
+
+
+def get_tensor_metadata(tensor):
+    # Tensor's metadata for serializing.
+    # Currently, this only returns a dict[string, bool] specifying whether
+    # the `conj` or `neg` bit is set.
+    assert isinstance(tensor, torch.Tensor)
+    return torch._C._get_tensor_metadata(tensor)  # type: ignore[attr-defined]
+
+
+def set_tensor_metadata(tensor, metadata):
+    # Applies a metadata dict to `tensor`; see `get_tensor_metadata` above.
+    assert isinstance(metadata, dict)
+    assert isinstance(tensor, torch.Tensor)
+    torch._C._set_tensor_metadata(tensor, metadata)  # type: ignore[attr-defined]
+
+
+def _rebuild_tensor_v2(
+    storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None
+):
+    tensor = _rebuild_tensor(storage, storage_offset, size, stride)
+    tensor.requires_grad = requires_grad
+    if metadata:
+        set_tensor_metadata(tensor, metadata)
+
+    # NB: This line exists only for backwards compatibility; the
+    # general expectation is that backward_hooks is an empty
+    # OrderedDict.  See Note [Don't serialize hooks]
+    tensor._backward_hooks = backward_hooks
+    return tensor
+
+
+def _rebuild_tensor_v3(
+    storage,
+    storage_offset,
+    size,
+    stride,
+    requires_grad,
+    backward_hooks,
+    dtype,
+    metadata=None,
+):
+    t = torch.empty(
+        (0,),
+        dtype=dtype,
+        device=storage._untyped_storage.device,
+        requires_grad=requires_grad,
+    )
+    t.set_(storage._untyped_storage, storage_offset, size, stride)
+    if metadata:
+        set_tensor_metadata(t, metadata)
+    t._backward_hooks = backward_hooks
+    return t
+
+
+_sparse_tensors_to_validate: List["torch.Tensor"] = []  # rebuilt sparse tensors awaiting validation
+
+
+# In _legacy_load() in serialization.py we unpickle storages after the sparse
+# tensors have been already unpickled. Those storages contain data necessary for
+# validating sparse tensors: indices and values. That's why sparse tensors are
+# first unpickled without any validation, and then this function is called just
+# before _legacy_load() returns, so that all the sparse tensors can be validated
+# in bulk.
+#
+# The same procedure must be followed by _load() in serialization.py because due
+# to Pickler semantics, we have to use the same (non-validating) function for
+# unpickling sparse tensors, regardless of the caller.
+def _validate_loaded_sparse_tensors():
+    try:
+        for t in _sparse_tensors_to_validate:
+            if t.layout is torch.sparse_coo:
+                torch._validate_sparse_coo_tensor_args(
+                    t._indices(), t._values(), t.size(), t.is_coalesced()
+                )
+            elif t.layout in {
+                torch.sparse_csr,
+                torch.sparse_csc,
+                torch.sparse_bsr,
+                torch.sparse_bsc,
+            }:
+                # TODO: Validation currently involves an expensive traversal
+                # on CPU, which may include a device transfer.
+                if t.layout in {torch.sparse_csr, torch.sparse_bsr}:
+                    compressed_indices, plain_indices = (
+                        t.crow_indices(),
+                        t.col_indices(),
+                    )
+                else:
+                    compressed_indices, plain_indices = (
+                        t.ccol_indices(),
+                        t.row_indices(),
+                    )
+                torch._validate_sparse_compressed_tensor_args(
+                    compressed_indices, plain_indices, t.values(), t.size(), t.layout
+                )
+            else:
+                raise NotImplementedError(
+                    f"_validate_loaded_sparse_tensors for layout `{t.layout}`"
+                )
+
+    finally:
+        _sparse_tensors_to_validate.clear()
+
+
+def _rebuild_sparse_tensor(layout, data):
+    """
+    Rebuilds a sparse tensor from its sparse storage representation.
+
+    Args:
+        layout (str): The sparse storage layout of the tensor.
+        data (tuple): The tensor's sparse storage representation.
+    """
+    if layout == torch.sparse_coo:
+        if len(data) == 3:
+            # For BC:
+            indices, values, size = data
+            is_coalesced = None
+        else:
+            indices, values, size, is_coalesced = data
+        result = torch.sparse_coo_tensor(
+            indices, values, size, check_invariants=False, is_coalesced=is_coalesced
+        )
+        _sparse_tensors_to_validate.append(result)
+        return result
+
+    elif layout in {
+        torch.sparse_csr,
+        torch.sparse_csc,
+        torch.sparse_bsr,
+        torch.sparse_bsc,
+    }:
+        compressed_indices, plain_indices, values, size = data
+        result = torch.sparse_compressed_tensor(
+            compressed_indices,
+            plain_indices,
+            values,
+            size,
+            layout=layout,
+            check_invariants=False,
+        )
+        _sparse_tensors_to_validate.append(result)
+        return result
+
+    raise NotImplementedError(f"rebuilding sparse tensor for layout {layout}")
+
+
+def _rebuild_nested_tensor(buffer, sizes, strides, storage_offsets):
+    return torch._nested_view_from_buffer(buffer, sizes, strides, storage_offsets)  # plain pass-through
+
+
+def _rebuild_device_tensor_from_numpy(data, dtype, device, requires_grad):
+    tensor = torch.from_numpy(data).to(dtype=dtype, device=device)
+    tensor.requires_grad = requires_grad
+    return tensor
+
+
+# Should not be used; kept only so that Tensors serialized with older versions of PyTorch can still be loaded
+_rebuild_xla_tensor = _rebuild_device_tensor_from_numpy
+
+
+def _rebuild_meta_tensor_no_storage(dtype, size, stride, requires_grad):
+    return torch.empty_strided(
+        size, stride, dtype=dtype, device="meta", requires_grad=requires_grad
+    )
+
+
+def _rebuild_wrapper_subclass(
+    cls, dtype, size, stride, storage_offset, layout, device, requires_grad
+):
+    return torch.Tensor._make_wrapper_subclass(  # type: ignore[attr-defined]
+        cls,
+        size,
+        strides=stride,
+        storage_offset=storage_offset,
+        layout=layout,
+        device=device,
+        requires_grad=requires_grad,
+    )
+
+
+# TODO: Once we decide to break serialization FC, `storage` no longer needs to
+# be a TypedStorage
+def _rebuild_qtensor(
+    storage,
+    storage_offset,
+    size,
+    stride,
+    quantizer_params,
+    requires_grad,
+    backward_hooks,
+):
+    qscheme = quantizer_params[0]
+    if qscheme == torch.per_tensor_affine:
+        _, scale, zero_point = quantizer_params
+        tensor = torch._empty_affine_quantized(
+            size,
+            scale=scale,
+            zero_point=zero_point,
+            dtype=storage.dtype,
+            device=storage.device,
+        )
+    elif qscheme in (torch.per_channel_affine, torch.per_channel_affine_float_qparams):
+        _, scales, zero_points, axis = quantizer_params
+        if type(scales) is list and type(zero_points) is list:
+            if qscheme == torch.per_channel_affine:
+                scales = torch.tensor(scales, dtype=torch.double, device=storage.device)
+                zero_points = torch.tensor(
+                    zero_points, dtype=torch.long, device=storage.device
+                )
+            else:
+                scales = torch.tensor(scales, dtype=torch.float, device=storage.device)
+                zero_points = torch.tensor(
+                    zero_points, dtype=torch.float, device=storage.device
+                )
+        tensor = torch._empty_per_channel_affine_quantized(
+            size,
+            scales=scales,
+            zero_points=zero_points,
+            axis=axis,
+            dtype=storage.dtype,
+            device=storage.device,
+        )
+    else:
+        raise RuntimeError(f"Can't deserialize quantized tensor with qscheme {qscheme}")
+    tensor.set_(storage, storage_offset, size, stride)
+    tensor.requires_grad = requires_grad
+    # NB: This line exists only for backwards compatibility; the
+    # general expectation is that backward_hooks is an empty
+    # OrderedDict.  See Note [Don't serialize hooks]
+    tensor._backward_hooks = backward_hooks
+    return tensor
+
+
+def _rebuild_parameter(data, requires_grad, backward_hooks):
+    param = torch.nn.Parameter(data, requires_grad)
+    # NB: This line exists only for backwards compatibility; the
+    # general expectation is that backward_hooks is an empty
+    # OrderedDict.  See Note [Don't serialize hooks]
+    param._backward_hooks = backward_hooks
+
+    return param
+
+
+def _rebuild_parameter_with_state(data, requires_grad, backward_hooks, state):
+    param = torch.nn.Parameter(data, requires_grad)
+    # NB: This line exists only for backwards compatibility; the
+    # general expectation is that backward_hooks is an empty
+    # OrderedDict.  See Note [Don't serialize hooks]
+    param._backward_hooks = backward_hooks
+
+    # Restore state on Parameter like python attr.
+    param = _set_obj_state(param, state)
+    return param
+
+
+def _get_obj_state(obj):
+    # Get the state of the python subclass
+    # This loosely mimicks the function on the object class but since Tensor do not inherit
+    # from it, we cannot call that function directly
+    # https://github.com/python/cpython/blob/c83919bd635f4433f1c6ae8504996a9fe3c215e5/Objects/typeobject.c#L4891
+    # Note that starting with Python 3.11, this `__getstate__` is always defined and thus
+    # the else branch will never be taken.
+    getstate_fn = getattr(obj, "__getstate__", None)
+    if getstate_fn:
+        state = getstate_fn()
+    else:
+        slots_to_save = copyreg._slotnames(obj.__class__)  # type: ignore[attr-defined]
+        if slots_to_save:
+            state = (
+                obj.__dict__,
+                {
+                    name: getattr(obj, name)
+                    for name in slots_to_save
+                    if hasattr(obj, name)
+                },
+            )
+        else:
+            state = obj.__dict__
+
+    return state
+
+
+def _set_obj_state(obj, state):
+    if isinstance(state, tuple):
+        if not len(state) == 2:
+            raise RuntimeError(f"Invalid serialized state: {state}")
+        dict_state = state[0]
+        slots_state = state[1]
+    else:
+        dict_state = state
+        slots_state = None
+
+    # Starting with Python 3.11, the __dict__ attribute is lazily created
+    # and is serialized as None when not needed.
+    if dict_state:
+        for k, v in dict_state.items():
+            setattr(obj, k, v)
+
+    if slots_state:
+        for k, v in slots_state.items():
+            setattr(obj, k, v)
+    return obj
+
+
+def _import_dotted_name(name):
+    components = name.split(".")
+    obj = __import__(components[0])
+    for component in components[1:]:
+        obj = getattr(obj, component)
+    return obj
+
+
+def _flatten_dense_tensors(tensors):
+    """Flatten dense tensors into a contiguous 1D buffer. The tensors are
+    assumed to all be of the same dense type.
+
+    Since the inputs are dense, the result is a single concatenated 1D
+    buffer. An element-wise operation on this buffer is equivalent to
+    applying it to each input tensor individually.
+
+    Args:
+        tensors (Iterable[Tensor]): dense tensors to flatten.
+
+    Returns:
+        A contiguous 1D buffer containing the input tensors.
+    """
+    return torch._C._nn.flatten_dense_tensors(tensors)
+
+
+def _flatten_sparse_tensors(tensors):
+    """Flatten sparse tensors into two contiguous 1D buffers, one of indices and
+    one of values. Assume tensors are of same sparse type.
+
+    Args:
+        tensors (Iterable[Tensor]): sparse tensors to flatten.
+
+    Returns:
+        A tuple of two contiguous 1D buffers, one containing input tensors'
+        indices and the other containing the values.
+    """
+    flat_indices = torch._C._nn.flatten_dense_tensors(
+        [torch.Tensor._indices(t) for t in tensors]
+    )
+    flat_values = torch._C._nn.flatten_dense_tensors(
+        [torch.Tensor._values(t) for t in tensors]
+    )
+    return flat_indices, flat_values
+
+
+def _unflatten_dense_tensors(flat, tensors):
+    """View a flat buffer using the sizes of tensors. Assume that tensors are of
+    the same dense type, and that flat is given by _flatten_dense_tensors.
+
+    Args:
+        flat (Tensor): flattened dense tensors to unflatten.
+        tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
+          unflatten flat.
+
+    Returns:
+        Unflattened dense tensors with the same sizes as tensors and values
+        taken from flat.
+    """
+    return torch._C._nn.unflatten_dense_tensors(flat, tensors)
+
+
+def _unflatten_sparse_tensors(flat, tensors):
+    """View flat buffer (containing indices and values) using the sizes of
+    tensors. Assume that tensors are of same sparse type, and that flat is given
+    by _flatten_sparse_tensors.
+
+    Args:
+        flat (tuple(Tensor, Tensor)): flattened indices and values of sparse
+          tensors to unflatten.
+        tensors (Iterable[Tensor]): sparse tensors whose sizes will be used to
+          unflatten flat.
+
+    Returns:
+        Unflattened sparse tensors with sizes same as tensors and values from
+        flat.
+    """
+    flat_indices, flat_values = flat
+    indices = torch._C._nn.unflatten_dense_tensors(
+        flat_indices, [torch.Tensor._indices(t) for t in tensors]
+    )
+    values = torch._C._nn.unflatten_dense_tensors(
+        flat_values, [torch.Tensor._values(t) for t in tensors]
+    )
+    outputs = []
+    for t, i, v in zip(tensors, indices, values):
+        outputs.append(t.new(i, v, t.size()))
+    return tuple(outputs)
+
+
+def _reorder_tensors_as(tensors, ordered_tensors):
+    """Assume that tensors are of same order as ordered_tensors within their
+    types, e.g., from _take_tensors. Reorder them to be of same order as
+    ordered_tensors.
+
+    Args:
+        tensors (Iterable[Tensor]): tensors to be reordered. They should be of
+          the same order as ordered_tensors within their own types.
+        ordered_tensors (Iterable[Tensor]): tensors whose order will be the
+          reference.
+
+    Returns:
+        Ordered tuple of tensors with contents from tensors and order of
+        ordered_tensors.
+    """
+    type_dict = defaultdict(list)
+    for tensor in tensors:
+        type_dict[tensor.type()].append(tensor)
+    type_dict_ = {t: iter(coll) for t, coll in type_dict.items()}
+    return tuple(next(type_dict_[tensor.type()]) for tensor in ordered_tensors)
+
+
+def _take_tensors(tensors, size_limit):
+    """Group tensors into chunks. This generator yields a chunk at each time,
+    each containing tensors of same type up to certain byte limit in total size.
+
+    Args:
+        tensors (Sequence): A sequence of tensors to be separated into chunks.
+        size_limit (int): The limit of each chunk in bytes.
+
+    Yields:
+        Blocks of tensors of same type and within size_limit. The yielded
+        tensors are only ordered as the original sequence within its types.
+    """
+    buf_dict: DefaultDict[str, List] = defaultdict(lambda: [[], 0])
+    for tensor in tensors:
+        t = tensor.type()
+        if tensor.is_sparse:
+            indices = torch.Tensor._indices(tensor)
+            values = torch.Tensor._values(tensor)
+            size = (
+                indices.numel() * indices.element_size()
+                + values.numel() * values.element_size()
+            )
+        else:
+            size = tensor.numel() * tensor.element_size()
+        buf_and_size = buf_dict[t]
+        if buf_and_size[1] + size > size_limit and buf_and_size[1] > 0:
+            yield buf_and_size[0]
+            buf_and_size = buf_dict[t] = [[], 0]
+        buf_and_size[0].append(tensor)
+        buf_and_size[1] += size
+    for buf, _ in buf_dict.values():
+        if len(buf) > 0:
+            yield buf
+
+
+# annotation decorator to get annotations in a way that is compatible
+# with both Python 2 and 3
+def annotate(ret, **kwargs):
+    def dec(fun):
+        fun.__annotations__ = dict(kwargs)
+        fun.__annotations__["return"] = ret
+        return fun
+
+    return dec
+
+
+def render_call(fn, args, kwargs):
+    str_fn = torch.overrides.resolve_name(fn)
+    if str_fn is None:
+        str_fn = str(fn)
+
+    str_args: List[str] = []
+    with torch._tensor_str.printoptions(threshold=0, edgeitems=0):
+        str_args.extend(repr(a) for a in args)
+        str_args.extend(f"{k}={repr(v)}" for k, v in kwargs.items())
+        r = f"{str_fn}({', '.join(str_args)})"
+    return r
+
+
+# NOTE [ Python Traceback Reference Cycle Problem ]
+#
+# When using sys.exc_info(), it is important to **not** store exc_info[2],
+# which is the traceback, because otherwise you will run into the traceback
+# reference cycle problem, i.e., the traceback holding a reference to the frame,
+# and the frame (which holds references to all the objects in its temporary
+# scope) holding a reference to the traceback.
+
+
+class KeyErrorMessage(str):
+    r"""str subclass whose repr() is the string itself, unquoted and unescaped"""
+
+    def __repr__(self):
+        return self
+
+
+class ExceptionWrapper:
+    r"""Wraps an exception plus traceback to communicate across threads"""
+
+    def __init__(self, exc_info=None, where="in background"):
+        # It is important that we don't store exc_info, see
+        # NOTE [ Python Traceback Reference Cycle Problem ]
+        if exc_info is None:
+            exc_info = sys.exc_info()
+        self.exc_type = exc_info[0]
+        self.exc_msg = "".join(traceback.format_exception(*exc_info))
+        self.where = where
+
+    def reraise(self):
+        r"""Reraises the wrapped exception in the current thread"""
+        # Format a message such as: "Caught ValueError in DataLoader worker
+        # process 2. Original Traceback:", followed by the traceback.
+        msg = f"Caught {self.exc_type.__name__} {self.where}.\nOriginal {self.exc_msg}"
+        if self.exc_type == KeyError:
+            # KeyError calls repr() on its argument (usually a dict key). This
+            # makes stack traces unreadable. It will not be changed in Python
+            # (https://bugs.python.org/issue2651), so we work around it.
+            msg = KeyErrorMessage(msg)
+        elif getattr(self.exc_type, "message", None):
+            # Some exceptions have first argument as non-str but explicitly
+            # have message field
+            raise self.exc_type(message=msg)
+        try:
+            exception = self.exc_type(msg)
+        except TypeError:
+            # If the exception takes multiple arguments, don't try to
+            # instantiate since we don't know how to
+            raise RuntimeError(msg) from None
+        raise exception
+
+
+def _get_available_device_type():
+    if torch.cuda.is_available():
+        return "cuda"
+    if hasattr(torch, "xpu") and torch.xpu.is_available():  # type: ignore[attr-defined]
+        return "xpu"
+    custom_backend_name = torch._C._get_privateuse1_backend_name()
+    custom_device_mod = getattr(torch, custom_backend_name, None)
+    if custom_device_mod and custom_device_mod.is_available():
+        return custom_backend_name
+    # add more available device types here
+    return None
+
+
+def _get_device_attr(get_member):
+    device_type = _get_available_device_type()
+    if device_type and device_type.lower() == "cuda":
+        return get_member(torch.cuda)
+    if device_type and device_type.lower() == "xpu":
+        return get_member(torch.xpu)  # type: ignore[attr-defined]
+    if device_type == torch._C._get_privateuse1_backend_name():
+        return get_member(getattr(torch, device_type))
+    # add more available device types here
+    return None
+
+
+def _get_current_device_index():
+    # current device index
+    return _get_device_attr(lambda m: m.current_device())
+
+
+def _get_all_device_indices():
+    # all device index
+    return _get_device_attr(lambda m: list(range(m.device_count())))
+
+
+def _get_devices_properties(device_ids):
+    # all device properties
+    return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
+
+
+def get_current_device_index() -> int:
+    r"""Checks if there are CUDA devices available and
+    returns the device index of the current default CUDA device.
+    Returns -1 in case there are no CUDA devices available.
+    Arguments: ``None``
+    """
+    if torch.cuda.device_count() > 0:
+        return torch.cuda.current_device()
+    return -1
+
+
+def _get_device_index(
+    device: Any, optional: bool = False, allow_cpu: bool = False
+) -> int:
+    r"""Gets the device index from :attr:`device`, which can be a torch.device
+    object, a Python integer, or ``None``.
+
+    If :attr:`device` is a torch.device object, returns the device index if it
+    has index. Note that for a device without a specified index,
+    i.e., ``torch.device('xxx')``, this will return the current default
+    device of that type if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
+    CPU devices will be accepted and ``-1`` will be returned in this case.
+
+    If :attr:`device` is a Python integer, it is returned as is.
+
+    If :attr:`device` is ``None``, this will return the current default
+    device of the supported runtime platform if :attr:`optional` is ``True``.
+    i.e., the current default CUDA device will be returned if CUDA runtime is supported.
+    """
+    if isinstance(device, str):
+        device = torch.device(device)
+    device_idx: Optional[int] = None
+    if isinstance(device, torch.device):
+        if not allow_cpu and device.type == "cpu":
+            raise ValueError(f"Expected a non cpu device, but got: {device}")
+        device_idx = -1 if device.type == "cpu" else device.index
+    if isinstance(device, int):
+        device_idx = device
+    if device_idx is None:
+        if optional:
+            # The eager API _get_current_device_index uses `lambda` functions which are
+            # not supported in JIT and hence not scriptable. The JIT equivalent API to get
+            # the current device index is `get_current_device_index()` which can
+            # be scripted. We use is_scripting to check the mode we are in and call the
+            # appropriate API.
+            if torch.jit.is_scripting():
+                device_idx = get_current_device_index()
+            else:
+                device_idx = _get_current_device_index()
+        else:
+            raise ValueError(
+                f"Expected a torch.device with a specified index or an integer, but got:{device}"
+            )
+    return device_idx
+
+
+def _handle_complex(tensor):
+    """
+    Returns a real view of a tensor if complex dtype else just the tensor
+    need to check if a UninitializedParameter because otherwise checking is_complex is an error for a LazyModule
+    """
+    return (
+        torch.view_as_real(tensor)
+        if not isinstance(tensor, torch.nn.UninitializedParameter)
+        and tensor.is_complex()
+        else tensor
+    )
+
+
+def _element_size(dtype):
+    """
+    Returns the element size for a dtype, in bytes
+    """
+    if not isinstance(dtype, torch.dtype):
+        raise RuntimeError(f"expected torch.dtype, but got {type(dtype)}")
+
+    if dtype.is_complex:
+        return torch.finfo(dtype).bits >> 2
+    elif dtype.is_floating_point:
+        return torch.finfo(dtype).bits >> 3
+    elif dtype == torch.bool:
+        # NOTE: torch.bool is not supported in torch.iinfo()
+        return 1
+    else:
+        return torch.iinfo(dtype).bits >> 3
+
+
+class _ClassPropertyDescriptor:
+    def __init__(self, fget, fset=None):
+        self.fget = fget
+
+    def __get__(self, instance, owner=None):
+        if owner is None:
+            owner = type(instance)
+        return self.fget.__get__(instance, owner)()
+
+
+def classproperty(func):
+    if not isinstance(func, (classmethod, staticmethod)):
+        func = classmethod(func)
+    return _ClassPropertyDescriptor(func)
+
+
+def is_compiling() -> bool:
+    """
+    Returns the result of ``torch.compiler.is_compiling()``, i.e. whether we are
+    tracing/compiling with torch.compile() or torch.export().
+    TODO(khabinov): we should deprecate this function and use torch.compiler.is_compiling().
+    """
+    return torch.compiler.is_compiling()
+
+
+def _functionalize_sync(t):
+    # This code lives in python instead of C++ since conditioning on a certain python subclass
+    # is much more of a pain in C++.
+    from torch._subclasses.functional_tensor import FunctionalTensor
+
+    if isinstance(t, FunctionalTensor):
+        # If a FunctionalTensorMode is active while syncing, we don't want it to intercept any ops that get called
+        # when we sync our inner tensor.
+        # Why?
+        # (1) If there are input mutations in the graph, then they will be re-applied during
+        #     AOTAutograd when we call _sync() from inside of our functionalization kernels.
+        # (2) _sync() causes us to regenerate our updated tensor from the updated base,
+        #     which dispatches to a bunch of view ops
+        # (3) The input to these view ops is our inner FunctionalTensorWrapper
+        #     (since the sync was called from C++), not the python FunctionalTensor
+        # (4) if a python FunctionalTensorMode is active, it will complain when it intercepts
+        #     the view op, since it will see an input that is a C++ FunctionalTensorWrapper
+        #     (aka a normal torch.Tensor) instead of a python `FunctionalTensor`.
+        maybe_functional_mode = torch._C._unset_dispatch_mode(
+            torch._C._TorchDispatchModeKey.FUNCTIONAL
+        )
+        try:
+            torch._functionalize_sync(t.elem)  # type: ignore[attr-defined]
+        finally:
+            if maybe_functional_mode is not None:
+                torch._C._set_dispatch_mode(maybe_functional_mode)
+    else:
+        torch._functionalize_sync(t)  # type: ignore[attr-defined]
+
+
+@functools.lru_cache(2)
+def _get_device_module(device_type: str):
+    device_module = getattr(torch, device_type, None)
+    if device_module is None:
+        raise RuntimeError(
+            f"Device '{device_type}' does not have a corresponding module registered as 'torch.{device_type}'."
+        )
+    return device_module
+
+
+def _dummy_type(name: str) -> type:
+    def get_err_fn(is_init: bool):
+        def err_fn(obj, *args, **kwargs):
+            if is_init:
+                class_name = obj.__class__.__name__
+            else:
+                class_name = obj.__name__
+            raise RuntimeError(f"Tried to instantiate dummy base class {class_name}")
+
+        return err_fn
+
+    return type(
+        name, (object,), {"__init__": get_err_fn(True), "__new__": get_err_fn(False)}
+    )
+
+
+class _LazySeedTracker:
+    # Since seeding is memory-less, only track the latest seed.
+    # Note: `manual_seed_all` followed by `manual_seed` overwrites
+    # the seed on current device. We track the order of **latest**
+    # calls between these two API.
+    def __init__(self):
+        self.manual_seed_all_cb = None
+        self.manual_seed_cb = None
+        self.call_order = []
+
+    def queue_seed_all(self, cb, traceback):
+        self.manual_seed_all_cb = (cb, traceback)
+        # update seed_all to be latest
+        self.call_order = [self.manual_seed_cb, self.manual_seed_all_cb]
+
+    def queue_seed(self, cb, traceback):
+        self.manual_seed_cb = (cb, traceback)
+        # update seed to be latest
+        self.call_order = [self.manual_seed_all_cb, self.manual_seed_cb]
+
+    def get_calls(self) -> List:
+        return self.call_order
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/torch_version.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/torch_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..f73a0b71c1a815be6b15d1972fed8350004d6721
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/torch_version.py
@@ -0,0 +1,58 @@
+# mypy: ignore-errors
+
+from typing import Any, Iterable
+from .version import __version__ as internal_version
+from ._vendor.packaging.version import Version, InvalidVersion
+
+__all__ = ['TorchVersion']
+
+
+class TorchVersion(str):
+    """A string with magic powers to compare to both Version and iterables!
+    Prior to 1.10.0 torch.__version__ was stored as a str and so many did
+    comparisons against torch.__version__ as if it were a str. In order to not
+    break them we have TorchVersion which masquerades as a str while also
+    having the ability to compare against both packaging.version.Version as
+    well as tuples of values, eg. (1, 2, 1)
+    Examples:
+        Comparing a TorchVersion object to a Version object
+            TorchVersion('1.10.0a') > Version('1.10.0a')
+        Comparing a TorchVersion object to a Tuple object
+            TorchVersion('1.10.0a') > (1, 2)    # 1.2
+            TorchVersion('1.10.0a') > (1, 2, 1) # 1.2.1
+        Comparing a TorchVersion object against a string
+            TorchVersion('1.10.0a') > '1.2'
+            TorchVersion('1.10.0a') > '1.2.1'
+    """
+    # fully qualified type names here to appease mypy
+    def _convert_to_version(self, inp: Any) -> Any:
+        if isinstance(inp, Version):
+            return inp
+        elif isinstance(inp, str):
+            return Version(inp)
+        elif isinstance(inp, Iterable):
+            # Ideally this should work for most cases by attempting to group
+            # the version tuple, assuming the tuple looks (MAJOR, MINOR, ?PATCH)
+            # Examples:
+            #   * (1)         -> Version("1")
+            #   * (1, 20)     -> Version("1.20")
+            #   * (1, 20, 1)  -> Version("1.20.1")
+            return Version('.'.join(str(item) for item in inp))
+        else:
+            raise InvalidVersion(inp)
+
+    def _cmp_wrapper(self, cmp: Any, method: str) -> bool:
+        try:
+            return getattr(Version(self), method)(self._convert_to_version(cmp))
+        except BaseException as e:
+            if not isinstance(e, InvalidVersion):
+                raise
+            # Fall back to regular string comparison if dealing with an invalid
+            # version like 'parrot'
+            return getattr(super(), method)(cmp)
+
+
+for cmp_method in ["__gt__", "__lt__", "__eq__", "__ge__", "__le__"]:
+    setattr(TorchVersion, cmp_method, lambda x, y, method=cmp_method: x._cmp_wrapper(y, method))
+
+__version__ = TorchVersion(internal_version)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/version.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3fa8b448f04f6bf16c695252e12dbc1bab046a1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/version.py
@@ -0,0 +1,8 @@
+from typing import Optional
+
+__all__ = ['__version__', 'debug', 'cuda', 'git_version', 'hip']
+__version__ = '2.3.0+cu118'
+debug = False
+cuda: Optional[str] = '11.8'
+git_version = '97ff6cfd9c86c5c09d7ce775ab64ec5c99230f5d'
+hip: Optional[str] = None