diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f41880f9c71951ed384ef0af6f73351c2ea74afe Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_src/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/magic_trace.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/magic_trace.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f197dc6057a69444de6ec953d91eeda6d5f7043e Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/__pycache__/magic_trace.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/delayed_mul_tensor.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/delayed_mul_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..3984a063885907141b56bdd2c6e8cc730c592cbb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/delayed_mul_tensor.py @@ -0,0 +1,77 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch + +from . 
import _Tensor, Tensor +from .reference import _dims, _enable_layers, llist, ltuple + + +class DelayedMulTensor(_Tensor): + def __init__(self, lhs, rhs): + self._lhs, self._rhs = lhs, rhs + self._data = None + self._levels_data = None + self._has_device = lhs._has_device or rhs._has_device + self._batchtensor_data = None + self._tensor_data = None + + @property + def _levels(self): + if self._levels_data is None: + levels = llist(self._lhs._levels) + for l in self._rhs._levels: + if l not in levels: + levels.append(l) + self._levels_data = ltuple(levels) + return self._levels_data + + @property + def _batchtensor(self): + if self._batchtensor_data is None: + with _enable_layers(self._levels): + print("bt multiply fallback") + self._batchtensor_data = self._lhs._batchtensor * self._rhs._batchtensor + return self._batchtensor_data + + @property + def _tensor(self): + if self._tensor_data is None: + self._tensor_data = Tensor.from_batched( + self._batchtensor, self._has_device + )._tensor + return self._tensor_data + + @property + def ndim(self): + return self._batchtensor.ndim + + @property + def dims(self): + return ltuple(super().dims) + + def sum(self, dim): + dims = _dims(dim, 0, False, False) + n = ord("a") + all_levels = self._levels + + def to_char(d): + return chr(n + all_levels.index(d)) + + plhs, levelslhs = self._lhs._tensor, self._lhs._levels + prhs, levelsrhs = self._rhs._tensor, self._rhs._levels + new_dims = tuple(d for d in self.dims if d not in dims) + new_levels = [l for l in self._levels if l not in dims] + fmt = "".join( + [ + *(to_char(d) for d in levelslhs), + ",", + *(to_char(d) for d in levelsrhs), + "->", + *(to_char(d) for d in new_levels), + ] + ) + result_data = torch.einsum(fmt, (plhs, prhs)) + return Tensor.from_positional(result_data, new_levels, True) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/magic_trace.py 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/magic_trace.py new file mode 100644 index 0000000000000000000000000000000000000000..5c962a898ca79cfe3d8af7432aacc3802d4f4ade --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/magic_trace.py @@ -0,0 +1,42 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import os +import signal +import subprocess +from contextlib import contextmanager + + +@contextmanager +def magic_trace(output="trace.fxt", magic_trace_cache="/tmp/magic-trace"): + pid = os.getpid() + if not os.path.exists(magic_trace_cache): + print(f"Downloading magic_trace to: {magic_trace_cache}") + subprocess.run( + [ + "wget", + "-O", + magic_trace_cache, + "-q", + "https://github.com/janestreet/magic-trace/releases/download/v1.0.2/magic-trace", + ] + ) + subprocess.run(["chmod", "+x", magic_trace_cache]) + args = [magic_trace_cache, "attach", "-pid", str(pid), "-o", output] + p = subprocess.Popen(args, stderr=subprocess.PIPE, encoding="utf-8") + while True: + x = p.stderr.readline() + print(x) + if "Attached" in x: + break + try: + yield + finally: + p.send_signal(signal.SIGINT) + r = p.wait() + print(p.stderr.read()) + p.stderr.close() + if r != 0: + raise ValueError(f"magic_trace exited abnormally: {r}") diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/wrap_type.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/wrap_type.py new file mode 100644 index 0000000000000000000000000000000000000000..e2146c4a21a144dc3942e304d1406ace47df0e57 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/dim/wrap_type.py @@ -0,0 +1,71 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from types import ( + BuiltinMethodType, + FunctionType, + GetSetDescriptorType, + MethodDescriptorType, + WrapperDescriptorType, +) + +from functorch._C import dim as _C + +_wrap_method = _C._wrap_method + +FUNC_TYPES = ( + FunctionType, + MethodDescriptorType, + BuiltinMethodType, + WrapperDescriptorType, +) +PROPERTY_TYPES = (GetSetDescriptorType, property) + + +def _py_wrap_method(orig, __torch_function__): + def impl(*args, **kwargs): + return __torch_function__(orig, None, args, kwargs) + + return impl + + +def wrap_type(use_c, to_patch, pattern, __torch_function__): + if use_c: + wrap_method = _wrap_method + else: + wrap_method = _py_wrap_method + + all = {} + for t in reversed(pattern.mro()[:-1]): # skip object + all.update(t.__dict__) + + def wrap_attr(orig): + return property(wrap_method(orig.__get__, __torch_function__)) + + for name, obj in all.items(): + if name in ( + "__dict__", + "__new__", + "__init__", + "__repr__", + "__weakref__", + "__doc__", + "__module__", + "__dir__", + ): + continue + + # skip things that have been overloaded + # things that come from object like `__eq__` still need to be patched, however. 
+ if hasattr(to_patch, name) and getattr(to_patch, name) is not getattr( + object, name, None + ): + continue + + if isinstance(obj, FUNC_TYPES): + setattr(to_patch, name, wrap_method(obj, __torch_function__)) + elif isinstance(obj, PROPERTY_TYPES): + setattr(to_patch, name, wrap_attr(obj)) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b32751d6e2493ab6a81f5a7f91a572553201f466 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__init__.py @@ -0,0 +1,3 @@ +from .rearrange import rearrange + +__all__ = ["rearrange"] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7a227ef9b0bb7d412d1932e16f8e47a44dc7dec Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/_parsing.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/_parsing.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7efe8a4d2413cea7c15db76c14c384b902eb6966 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/_parsing.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/rearrange.cpython-311.pyc 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/rearrange.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a88a12cfd1141217b8f1aa03cef5d171ea04368 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/__pycache__/rearrange.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/rearrange.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/rearrange.py new file mode 100644 index 0000000000000000000000000000000000000000..0449bb7ed2c72ef68f966f253c99e8570dfbd7ef --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/einops/rearrange.py @@ -0,0 +1,207 @@ +from __future__ import annotations + +import functools +from typing import Callable, Dict, List, Sequence, Tuple, Union + +import torch + +from functorch._C import dim as _C +from ._parsing import ( + _ellipsis, + AnonymousAxis, + comma_separate, + parse_pattern, + validate_rearrange_expressions, +) + +__all__ = ["rearrange"] + +dims = _C.dims + + +@functools.lru_cache(256) +def _create_rearrange_callable( + tensor_ndim: int, pattern: str, **axes_lengths: int +) -> Callable[[torch.Tensor], torch.Tensor]: + r"""Translate an `einops`-style pattern into a callable that performs the rearrange using first-class dimensions. + + Since the an equivalent result is computed for tensors with the same number of dimensions, with the same pattern and + specified axes lengths, this function can be memoized. 
+ + Args: + tensor_ndim (int): the number of dimensions in the tensor to rearrange + pattern (str): the `einops`-style rearrangement pattern + axes_lengths (int): any additional length specifications for dimensions + + Returns: + Callable[[torch.Tensor], torch.Tensor]: a callable that performs the rearrangement + """ + left, right = parse_pattern(pattern, axes_lengths) + validate_rearrange_expressions(left, right, axes_lengths) + + n_anon_dims = sum(not dim for dim in left.composition) + if left.has_ellipsis: + n_ellipsis_dims = tensor_ndim - (len(left.composition) - 1) + n_named_dims = len(left.identifiers) - 1 + + if (pattern_ndim := n_anon_dims + n_named_dims) > tensor_ndim: + raise ValueError( + f"Number of dimensions in pattern ({pattern_ndim}) must be less than or equal to the number of " + f"dimensions in the tensor ({tensor_ndim})" + ) + else: + n_ellipsis_dims = 0 + n_named_dims = len(left.identifiers) + + if (pattern_ndim := len(left.composition)) != tensor_ndim: + raise ValueError( + f"Number of dimensions in pattern ({pattern_ndim}) must be equal to the number of dimensions in " + f"the tensor ({tensor_ndim})" + ) + n_dims = n_named_dims + n_ellipsis_dims + n_anon_dims + + if n_dims == 0: + # an identity rearrangement on a 0-dimension tensor + return lambda tensor: tensor + + first_class_dims: Tuple[str, ...] 
= tuple(f"d{i}" for i in range(n_dims)) + identifier_dim_map: Dict[Union[str, AnonymousAxis], Tuple[str, ...]] = {} + anon_axes: List[AnonymousAxis] = [] + + # map the left-hand side identifiers to strings representing first class dims + dims_i = 0 + for dimension in left.composition: + if isinstance(dimension, list): + for identifier in dimension: + # non-unitary anon axes are not allowed in rearrange & unitary anon axes are represented as empty lists + assert isinstance(identifier, str) + identifier_dim_map[identifier] = (first_class_dims[dims_i],) + dims_i += 1 + if not dimension: + # unitary anonymous axis + anon_axis = AnonymousAxis("1") + identifier_dim_map[anon_axis] = (first_class_dims[dims_i],) + anon_axes.append(anon_axis) + dimension.append(anon_axis) + dims_i += 1 + elif dimension == _ellipsis: + identifier = _ellipsis + identifier_dim_map[identifier] = tuple( + first_class_dims[dims_i + j] for j in range(n_ellipsis_dims) + ) + dims_i += n_ellipsis_dims + else: + raise ValueError(f"Unexpected dimension: {dimension}") + + def composition_to_dims( + composition: Sequence[Union[List[Union[str, AnonymousAxis]], str]] + ) -> List[Union[str, Tuple[str, ...]]]: + """Convert a `ParsedExpression.composition` into a `Tensor.__getitem__` index of strings representing first + class dims.""" + dim_composition: List[Union[str, Tuple[str, ...]]] = [] + for dimension in composition: + if isinstance(dimension, list): + dim_composition.append( + tuple( + dim + for identifier in dimension + for dim in identifier_dim_map[identifier] + ) + ) + elif dimension == _ellipsis: + dim_composition.extend(identifier_dim_map[_ellipsis]) + else: + raise ValueError(f"Unexpected dimension: {dimension}") + return dim_composition + + left_dims = composition_to_dims(left.composition) + right_dims = composition_to_dims(right.composition) + anon_dims = tuple(identifier_dim_map[axis][0] for axis in anon_axes) + specified_lengths = tuple( + (identifier_dim_map[axis][0], length) for axis, 
length in axes_lengths.items() + ) + + custom_rearrange_callable_name = "do_rearrange" + custom_rearrange_callable_code = ( + ( + f"def {custom_rearrange_callable_name}(tensor):\n" + f" {comma_separate(first_class_dims)} = dims({n_dims})\n" + ) + + ( + "".join( + f" {dim}.size = {length}\n" for (dim, length) in specified_lengths + ) + if specified_lengths + else "" + ) + + f" tensor = tensor[{comma_separate(left_dims)}].order({comma_separate(right_dims)})\n" + + ( + f" return tensor.sum({comma_separate([anon_dims])}, keepdim=False)\n" + if anon_dims + else " return tensor\n" + ) + ) + + exec(custom_rearrange_callable_code) + return locals()[custom_rearrange_callable_name] + + +def rearrange( + tensor: Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor, ...]], + pattern: str, + **axes_lengths: int, +) -> torch.Tensor: + r"""A native implementation of `einops.rearrange`, a reader-friendly smart element reordering for multidimensional + tensors. This operation includes functionality of transpose (axes permutation), reshape (view), squeeze, unsqueeze, + stack, concatenate and other operations. 
+ + See: https://einops.rocks/api/rearrange/ + + Args: + tensor (Tensor or sequence of Tensor): the tensor(s) to rearrange + pattern (str): the rearrangement pattern + axes_lengths (int): any additional length specifications for dimensions + + Returns: + Tensor: the rearranged tensor + + Examples: + >>> # suppose we have a set of 32 images in "h w c" format (height-width-channel) + >>> images = torch.randn((32, 30, 40, 3)) + + >>> # stack along first (batch) axis, output is a single array + >>> rearrange(images, 'b h w c -> b h w c').shape + torch.Size([32, 30, 40, 3]) + + >>> # concatenate images along height (vertical axis), 960 = 32 * 30 + >>> rearrange(images, 'b h w c -> (b h) w c').shape + torch.Size([960, 40, 3]) + + >>> # concatenated images along horizontal axis, 1280 = 32 * 40 + >>> rearrange(images, 'b h w c -> h (b w) c').shape + torch.Size([30, 1280, 3]) + + >>> # reordered axes to "b c h w" format for deep learning + >>> rearrange(images, 'b h w c -> b c h w').shape + torch.Size([32, 3, 30, 40]) + + >>> # flattened each image into a vector, 3600 = 30 * 40 * 3 + >>> rearrange(images, 'b h w c -> b (c h w)').shape + torch.Size([32, 3600]) + + >>> # split each image into 4 smaller (top-left, top-right, bottom-left, bottom-right), 128 = 32 * 2 * 2 + >>> rearrange(images, 'b (h1 h) (w1 w) c -> (b h1 w1) h w c', h1=2, w1=2).shape + torch.Size([128, 15, 20, 3]) + + >>> # space-to-depth operation + >>> rearrange(images, 'b (h h1) (w w1) c -> b h w (c h1 w1)', h1=2, w1=2).shape + torch.Size([32, 15, 20, 12]) + """ + if not isinstance(tensor, torch.Tensor): + tensor = torch.stack(tensor) + + rearrange_callable = _create_rearrange_callable( + tensor.ndim, pattern, **axes_lengths + ) + + return rearrange_callable(tensor) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/__pycache__/extrapolation.cpython-311.pyc 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/__pycache__/extrapolation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e5986013fa6cd921cc602202135dcd26b80a075 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/__pycache__/extrapolation.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/__pycache__/quadrature.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/__pycache__/quadrature.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbfb721a5d758f1532e0b2d45dd4497651bfff74 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/__pycache__/quadrature.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/approximation.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/approximation.py new file mode 100644 index 0000000000000000000000000000000000000000..7ca5cc598fb53491cb6ae4a41a40477c58544d53 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/calculus/approximation.py @@ -0,0 +1,246 @@ +from ..libmp.backend import xrange +from .calculus import defun + +#----------------------------------------------------------------------------# +# Approximation methods # +#----------------------------------------------------------------------------# + +# The Chebyshev approximation formula is given at: +# http://mathworld.wolfram.com/ChebyshevApproximationFormula.html + +# The only major changes in the following code is that we return the +# expanded polynomial coefficients instead of Chebyshev coefficients, +# and that we automatically transform [a,b] -> [-1,1] and back +# for convenience. 
+ +# Coefficient in Chebyshev approximation +def chebcoeff(ctx,f,a,b,j,N): + s = ctx.mpf(0) + h = ctx.mpf(0.5) + for k in range(1, N+1): + t = ctx.cospi((k-h)/N) + s += f(t*(b-a)*h + (b+a)*h) * ctx.cospi(j*(k-h)/N) + return 2*s/N + +# Generate Chebyshev polynomials T_n(ax+b) in expanded form +def chebT(ctx, a=1, b=0): + Tb = [1] + yield Tb + Ta = [b, a] + while 1: + yield Ta + # Recurrence: T[n+1](ax+b) = 2*(ax+b)*T[n](ax+b) - T[n-1](ax+b) + Tmp = [0] + [2*a*t for t in Ta] + for i, c in enumerate(Ta): Tmp[i] += 2*b*c + for i, c in enumerate(Tb): Tmp[i] -= c + Ta, Tb = Tmp, Ta + +@defun +def chebyfit(ctx, f, interval, N, error=False): + r""" + Computes a polynomial of degree `N-1` that approximates the + given function `f` on the interval `[a, b]`. With ``error=True``, + :func:`~mpmath.chebyfit` also returns an accurate estimate of the + maximum absolute error; that is, the maximum value of + `|f(x) - P(x)|` for `x \in [a, b]`. + + :func:`~mpmath.chebyfit` uses the Chebyshev approximation formula, + which gives a nearly optimal solution: that is, the maximum + error of the approximating polynomial is very close to + the smallest possible for any polynomial of the same degree. + + Chebyshev approximation is very useful if one needs repeated + evaluation of an expensive function, such as function defined + implicitly by an integral or a differential equation. (For + example, it could be used to turn a slow mpmath function + into a fast machine-precision version of the same.) 
+ + **Examples** + + Here we use :func:`~mpmath.chebyfit` to generate a low-degree approximation + of `f(x) = \cos(x)`, valid on the interval `[1, 2]`:: + + >>> from mpmath import * + >>> mp.dps = 15; mp.pretty = True + >>> poly, err = chebyfit(cos, [1, 2], 5, error=True) + >>> nprint(poly) + [0.00291682, 0.146166, -0.732491, 0.174141, 0.949553] + >>> nprint(err, 12) + 1.61351758081e-5 + + The polynomial can be evaluated using ``polyval``:: + + >>> nprint(polyval(poly, 1.6), 12) + -0.0291858904138 + >>> nprint(cos(1.6), 12) + -0.0291995223013 + + Sampling the true error at 1000 points shows that the error + estimate generated by ``chebyfit`` is remarkably good:: + + >>> error = lambda x: abs(cos(x) - polyval(poly, x)) + >>> nprint(max([error(1+n/1000.) for n in range(1000)]), 12) + 1.61349954245e-5 + + **Choice of degree** + + The degree `N` can be set arbitrarily high, to obtain an + arbitrarily good approximation. As a rule of thumb, an + `N`-term Chebyshev approximation is good to `N/(b-a)` decimal + places on a unit interval (although this depends on how + well-behaved `f` is). The cost grows accordingly: ``chebyfit`` + evaluates the function `(N^2)/2` times to compute the + coefficients and an additional `N` times to estimate the error. + + **Possible issues** + + One should be careful to use a sufficiently high working + precision both when calling ``chebyfit`` and when evaluating + the resulting polynomial, as the polynomial is sometimes + ill-conditioned. It is for example difficult to reach + 15-digit accuracy when evaluating the polynomial using + machine precision floats, no matter the theoretical + accuracy of the polynomial. (The option to return the + coefficients in Chebyshev form should be made available + in the future.) + + It is important to note the Chebyshev approximation works + poorly if `f` is not smooth. 
A function containing singularities, + rapid oscillation, etc can be approximated more effectively by + multiplying it by a weight function that cancels out the + nonsmooth features, or by dividing the interval into several + segments. + """ + a, b = ctx._as_points(interval) + orig = ctx.prec + try: + ctx.prec = orig + int(N**0.5) + 20 + c = [chebcoeff(ctx,f,a,b,k,N) for k in range(N)] + d = [ctx.zero] * N + d[0] = -c[0]/2 + h = ctx.mpf(0.5) + T = chebT(ctx, ctx.mpf(2)/(b-a), ctx.mpf(-1)*(b+a)/(b-a)) + for (k, Tk) in zip(range(N), T): + for i in range(len(Tk)): + d[i] += c[k]*Tk[i] + d = d[::-1] + # Estimate maximum error + err = ctx.zero + for k in range(N): + x = ctx.cos(ctx.pi*k/N) * (b-a)*h + (b+a)*h + err = max(err, abs(f(x) - ctx.polyval(d, x))) + finally: + ctx.prec = orig + if error: + return d, +err + else: + return d + +@defun +def fourier(ctx, f, interval, N): + r""" + Computes the Fourier series of degree `N` of the given function + on the interval `[a, b]`. More precisely, :func:`~mpmath.fourier` returns + two lists `(c, s)` of coefficients (the cosine series and sine + series, respectively), such that + + .. math :: + + f(x) \sim \sum_{k=0}^N + c_k \cos(k m x) + s_k \sin(k m x) + + where `m = 2 \pi / (b-a)`. + + Note that many texts define the first coefficient as `2 c_0` instead + of `c_0`. The easiest way to evaluate the computed series correctly + is to pass it to :func:`~mpmath.fourierval`. + + **Examples** + + The function `f(x) = x` has a simple Fourier series on the standard + interval `[-\pi, \pi]`. 
The cosine coefficients are all zero (because + the function has odd symmetry), and the sine coefficients are + rational numbers:: + + >>> from mpmath import * + >>> mp.dps = 15; mp.pretty = True + >>> c, s = fourier(lambda x: x, [-pi, pi], 5) + >>> nprint(c) + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + >>> nprint(s) + [0.0, 2.0, -1.0, 0.666667, -0.5, 0.4] + + This computes a Fourier series of a nonsymmetric function on + a nonstandard interval:: + + >>> I = [-1, 1.5] + >>> f = lambda x: x**2 - 4*x + 1 + >>> cs = fourier(f, I, 4) + >>> nprint(cs[0]) + [0.583333, 1.12479, -1.27552, 0.904708, -0.441296] + >>> nprint(cs[1]) + [0.0, -2.6255, 0.580905, 0.219974, -0.540057] + + It is instructive to plot a function along with its truncated + Fourier series:: + + >>> plot([f, lambda x: fourierval(cs, I, x)], I) #doctest: +SKIP + + Fourier series generally converge slowly (and may not converge + pointwise). For example, if `f(x) = \cosh(x)`, a 10-term Fourier + series gives an `L^2` error corresponding to 2-digit accuracy:: + + >>> I = [-1, 1] + >>> cs = fourier(cosh, I, 9) + >>> g = lambda x: (cosh(x) - fourierval(cs, I, x))**2 + >>> nprint(sqrt(quad(g, I))) + 0.00467963 + + :func:`~mpmath.fourier` uses numerical quadrature. 
For nonsmooth functions, + the accuracy (and speed) can be improved by including all singular + points in the interval specification:: + + >>> nprint(fourier(abs, [-1, 1], 0), 10) + ([0.5000441648], [0.0]) + >>> nprint(fourier(abs, [-1, 0, 1], 0), 10) + ([0.5], [0.0]) + + """ + interval = ctx._as_points(interval) + a = interval[0] + b = interval[-1] + L = b-a + cos_series = [] + sin_series = [] + cutoff = ctx.eps*10 + for n in xrange(N+1): + m = 2*n*ctx.pi/L + an = 2*ctx.quadgl(lambda t: f(t)*ctx.cos(m*t), interval)/L + bn = 2*ctx.quadgl(lambda t: f(t)*ctx.sin(m*t), interval)/L + if n == 0: + an /= 2 + if abs(an) < cutoff: an = ctx.zero + if abs(bn) < cutoff: bn = ctx.zero + cos_series.append(an) + sin_series.append(bn) + return cos_series, sin_series + +@defun +def fourierval(ctx, series, interval, x): + """ + Evaluates a Fourier series (in the format computed by + by :func:`~mpmath.fourier` for the given interval) at the point `x`. + + The series should be a pair `(c, s)` where `c` is the + cosine series and `s` is the sine series. The two lists + need not have the same length. + """ + cs, ss = series + ab = ctx._as_points(interval) + a = interval[0] + b = interval[-1] + m = 2*ctx.pi/(ab[-1]-ab[0]) + s = ctx.zero + s += ctx.fsum(cs[n]*ctx.cos(m*n*x) for n in xrange(len(cs)) if cs[n]) + s += ctx.fsum(ss[n]*ctx.sin(m*n*x) for n in xrange(len(ss)) if ss[n]) + return s diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..293697b9fcf8bd82d58ac4ff45acd73fadac82f9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/__init__.py @@ -0,0 +1,2 @@ +from . import eigen # to set methods +from . 
import eigen_symmetric # to set methods diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/__pycache__/calculus.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/__pycache__/calculus.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b74f0998c300da19fe3afa7884007e82feb24eed Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/__pycache__/calculus.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/calculus.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/calculus.py new file mode 100644 index 0000000000000000000000000000000000000000..7fae2a7a9a29898241ed41810331b480ff70798f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/calculus.py @@ -0,0 +1,531 @@ +from ..libmp.backend import xrange + +# TODO: should use diagonalization-based algorithms + +class MatrixCalculusMethods(object): + + def _exp_pade(ctx, a): + """ + Exponential of a matrix using Pade approximants. + + See G. H. Golub, C. F. 
van Loan 'Matrix Computations', + third Ed., page 572 + + TODO: + - find a good estimate for q + - reduce the number of matrix multiplications to improve + performance + """ + def eps_pade(p): + return ctx.mpf(2)**(3-2*p) * \ + ctx.factorial(p)**2/(ctx.factorial(2*p)**2 * (2*p + 1)) + q = 4 + extraq = 8 + while 1: + if eps_pade(q) < ctx.eps: + break + q += 1 + q += extraq + j = int(max(1, ctx.mag(ctx.mnorm(a,'inf')))) + extra = q + prec = ctx.prec + ctx.dps += extra + 3 + try: + a = a/2**j + na = a.rows + den = ctx.eye(na) + num = ctx.eye(na) + x = ctx.eye(na) + c = ctx.mpf(1) + for k in range(1, q+1): + c *= ctx.mpf(q - k + 1)/((2*q - k + 1) * k) + x = a*x + cx = c*x + num += cx + den += (-1)**k * cx + f = ctx.lu_solve_mat(den, num) + for k in range(j): + f = f*f + finally: + ctx.prec = prec + return f*1 + + def expm(ctx, A, method='taylor'): + r""" + Computes the matrix exponential of a square matrix `A`, which is defined + by the power series + + .. math :: + + \exp(A) = I + A + \frac{A^2}{2!} + \frac{A^3}{3!} + \ldots + + With method='taylor', the matrix exponential is computed + using the Taylor series. With method='pade', Pade approximants + are used instead. 
+ + **Examples** + + Basic examples:: + + >>> from mpmath import * + >>> mp.dps = 15; mp.pretty = True + >>> expm(zeros(3)) + [1.0 0.0 0.0] + [0.0 1.0 0.0] + [0.0 0.0 1.0] + >>> expm(eye(3)) + [2.71828182845905 0.0 0.0] + [ 0.0 2.71828182845905 0.0] + [ 0.0 0.0 2.71828182845905] + >>> expm([[1,1,0],[1,0,1],[0,1,0]]) + [ 3.86814500615414 2.26812870852145 0.841130841230196] + [ 2.26812870852145 2.44114713886289 1.42699786729125] + [0.841130841230196 1.42699786729125 1.6000162976327] + >>> expm([[1,1,0],[1,0,1],[0,1,0]], method='pade') + [ 3.86814500615414 2.26812870852145 0.841130841230196] + [ 2.26812870852145 2.44114713886289 1.42699786729125] + [0.841130841230196 1.42699786729125 1.6000162976327] + >>> expm([[1+j, 0], [1+j,1]]) + [(1.46869393991589 + 2.28735528717884j) 0.0] + [ (1.03776739863568 + 3.536943175722j) (2.71828182845905 + 0.0j)] + + Matrices with large entries are allowed:: + + >>> expm(matrix([[1,2],[2,3]])**25) + [5.65024064048415e+2050488462815550 9.14228140091932e+2050488462815550] + [9.14228140091932e+2050488462815550 1.47925220414035e+2050488462815551] + + The identity `\exp(A+B) = \exp(A) \exp(B)` does not hold for + noncommuting matrices:: + + >>> A = hilbert(3) + >>> B = A + eye(3) + >>> chop(mnorm(A*B - B*A)) + 0.0 + >>> chop(mnorm(expm(A+B) - expm(A)*expm(B))) + 0.0 + >>> B = A + ones(3) + >>> mnorm(A*B - B*A) + 1.8 + >>> mnorm(expm(A+B) - expm(A)*expm(B)) + 42.0927851137247 + + """ + if method == 'pade': + prec = ctx.prec + try: + A = ctx.matrix(A) + ctx.prec += 2*A.rows + res = ctx._exp_pade(A) + finally: + ctx.prec = prec + return res + A = ctx.matrix(A) + prec = ctx.prec + j = int(max(1, ctx.mag(ctx.mnorm(A,'inf')))) + j += int(0.5*prec**0.5) + try: + ctx.prec += 10 + 2*j + tol = +ctx.eps + A = A/2**j + T = A + Y = A**0 + A + k = 2 + while 1: + T *= A * (1/ctx.mpf(k)) + if ctx.mnorm(T, 'inf') < tol: + break + Y += T + k += 1 + for k in xrange(j): + Y = Y*Y + finally: + ctx.prec = prec + Y *= 1 + return Y + + def cosm(ctx, A): + r""" + 
Gives the cosine of a square matrix `A`, defined in analogy + with the matrix exponential. + + Examples:: + + >>> from mpmath import * + >>> mp.dps = 15; mp.pretty = True + >>> X = eye(3) + >>> cosm(X) + [0.54030230586814 0.0 0.0] + [ 0.0 0.54030230586814 0.0] + [ 0.0 0.0 0.54030230586814] + >>> X = hilbert(3) + >>> cosm(X) + [ 0.424403834569555 -0.316643413047167 -0.221474945949293] + [-0.316643413047167 0.820646708837824 -0.127183694770039] + [-0.221474945949293 -0.127183694770039 0.909236687217541] + >>> X = matrix([[1+j,-2],[0,-j]]) + >>> cosm(X) + [(0.833730025131149 - 0.988897705762865j) (1.07485840848393 - 0.17192140544213j)] + [ 0.0 (1.54308063481524 + 0.0j)] + """ + B = 0.5 * (ctx.expm(A*ctx.j) + ctx.expm(A*(-ctx.j))) + if not sum(A.apply(ctx.im).apply(abs)): + B = B.apply(ctx.re) + return B + + def sinm(ctx, A): + r""" + Gives the sine of a square matrix `A`, defined in analogy + with the matrix exponential. + + Examples:: + + >>> from mpmath import * + >>> mp.dps = 15; mp.pretty = True + >>> X = eye(3) + >>> sinm(X) + [0.841470984807897 0.0 0.0] + [ 0.0 0.841470984807897 0.0] + [ 0.0 0.0 0.841470984807897] + >>> X = hilbert(3) + >>> sinm(X) + [0.711608512150994 0.339783913247439 0.220742837314741] + [0.339783913247439 0.244113865695532 0.187231271174372] + [0.220742837314741 0.187231271174372 0.155816730769635] + >>> X = matrix([[1+j,-2],[0,-j]]) + >>> sinm(X) + [(1.29845758141598 + 0.634963914784736j) (-1.96751511930922 + 0.314700021761367j)] + [ 0.0 (0.0 - 1.1752011936438j)] + """ + B = (-0.5j) * (ctx.expm(A*ctx.j) - ctx.expm(A*(-ctx.j))) + if not sum(A.apply(ctx.im).apply(abs)): + B = B.apply(ctx.re) + return B + + def _sqrtm_rot(ctx, A, _may_rotate): + # If the iteration fails to converge, cheat by performing + # a rotation by a complex number + u = ctx.j**0.3 + return ctx.sqrtm(u*A, _may_rotate) / ctx.sqrt(u) + + def sqrtm(ctx, A, _may_rotate=2): + r""" + Computes a square root of the square matrix `A`, i.e. 
returns + a matrix `B = A^{1/2}` such that `B^2 = A`. The square root + of a matrix, if it exists, is not unique. + + **Examples** + + Square roots of some simple matrices:: + + >>> from mpmath import * + >>> mp.dps = 15; mp.pretty = True + >>> sqrtm([[1,0], [0,1]]) + [1.0 0.0] + [0.0 1.0] + >>> sqrtm([[0,0], [0,0]]) + [0.0 0.0] + [0.0 0.0] + >>> sqrtm([[2,0],[0,1]]) + [1.4142135623731 0.0] + [ 0.0 1.0] + >>> sqrtm([[1,1],[1,0]]) + [ (0.920442065259926 - 0.21728689675164j) (0.568864481005783 + 0.351577584254143j)] + [(0.568864481005783 + 0.351577584254143j) (0.351577584254143 - 0.568864481005783j)] + >>> sqrtm([[1,0],[0,1]]) + [1.0 0.0] + [0.0 1.0] + >>> sqrtm([[-1,0],[0,1]]) + [(0.0 - 1.0j) 0.0] + [ 0.0 (1.0 + 0.0j)] + >>> sqrtm([[j,0],[0,j]]) + [(0.707106781186547 + 0.707106781186547j) 0.0] + [ 0.0 (0.707106781186547 + 0.707106781186547j)] + + A square root of a rotation matrix, giving the corresponding + half-angle rotation matrix:: + + >>> t1 = 0.75 + >>> t2 = t1 * 0.5 + >>> A1 = matrix([[cos(t1), -sin(t1)], [sin(t1), cos(t1)]]) + >>> A2 = matrix([[cos(t2), -sin(t2)], [sin(t2), cos(t2)]]) + >>> sqrtm(A1) + [0.930507621912314 -0.366272529086048] + [0.366272529086048 0.930507621912314] + >>> A2 + [0.930507621912314 -0.366272529086048] + [0.366272529086048 0.930507621912314] + + The identity `(A^2)^{1/2} = A` does not necessarily hold:: + + >>> A = matrix([[4,1,4],[7,8,9],[10,2,11]]) + >>> sqrtm(A**2) + [ 4.0 1.0 4.0] + [ 7.0 8.0 9.0] + [10.0 2.0 11.0] + >>> sqrtm(A)**2 + [ 4.0 1.0 4.0] + [ 7.0 8.0 9.0] + [10.0 2.0 11.0] + >>> A = matrix([[-4,1,4],[7,-8,9],[10,2,11]]) + >>> sqrtm(A**2) + [ 7.43715112194995 -0.324127569985474 1.8481718827526] + [-0.251549715716942 9.32699765900402 2.48221180985147] + [ 4.11609388833616 0.775751877098258 13.017955697342] + >>> chop(sqrtm(A)**2) + [-4.0 1.0 4.0] + [ 7.0 -8.0 9.0] + [10.0 2.0 11.0] + + For some matrices, a square root does not exist:: + + >>> sqrtm([[0,1], [0,0]]) + Traceback (most recent call last): + ... 
+ ZeroDivisionError: matrix is numerically singular + + Two examples from the documentation for Matlab's ``sqrtm``:: + + >>> mp.dps = 15; mp.pretty = True + >>> sqrtm([[7,10],[15,22]]) + [1.56669890360128 1.74077655955698] + [2.61116483933547 4.17786374293675] + >>> + >>> X = matrix(\ + ... [[5,-4,1,0,0], + ... [-4,6,-4,1,0], + ... [1,-4,6,-4,1], + ... [0,1,-4,6,-4], + ... [0,0,1,-4,5]]) + >>> Y = matrix(\ + ... [[2,-1,-0,-0,-0], + ... [-1,2,-1,0,-0], + ... [0,-1,2,-1,0], + ... [-0,0,-1,2,-1], + ... [-0,-0,-0,-1,2]]) + >>> mnorm(sqrtm(X) - Y) + 4.53155328326114e-19 + + """ + A = ctx.matrix(A) + # Trivial + if A*0 == A: + return A + prec = ctx.prec + if _may_rotate: + d = ctx.det(A) + if abs(ctx.im(d)) < 16*ctx.eps and ctx.re(d) < 0: + return ctx._sqrtm_rot(A, _may_rotate-1) + try: + ctx.prec += 10 + tol = ctx.eps * 128 + Y = A + Z = I = A**0 + k = 0 + # Denman-Beavers iteration + while 1: + Yprev = Y + try: + Y, Z = 0.5*(Y+ctx.inverse(Z)), 0.5*(Z+ctx.inverse(Y)) + except ZeroDivisionError: + if _may_rotate: + Y = ctx._sqrtm_rot(A, _may_rotate-1) + break + else: + raise + mag1 = ctx.mnorm(Y-Yprev, 'inf') + mag2 = ctx.mnorm(Y, 'inf') + if mag1 <= mag2*tol: + break + if _may_rotate and k > 6 and not mag1 < mag2 * 0.001: + return ctx._sqrtm_rot(A, _may_rotate-1) + k += 1 + if k > ctx.prec: + raise ctx.NoConvergence + finally: + ctx.prec = prec + Y *= 1 + return Y + + def logm(ctx, A): + r""" + Computes a logarithm of the square matrix `A`, i.e. returns + a matrix `B = \log(A)` such that `\exp(B) = A`. The logarithm + of a matrix, if it exists, is not unique. 
+ + **Examples** + + Logarithms of some simple matrices:: + + >>> from mpmath import * + >>> mp.dps = 15; mp.pretty = True + >>> X = eye(3) + >>> logm(X) + [0.0 0.0 0.0] + [0.0 0.0 0.0] + [0.0 0.0 0.0] + >>> logm(2*X) + [0.693147180559945 0.0 0.0] + [ 0.0 0.693147180559945 0.0] + [ 0.0 0.0 0.693147180559945] + >>> logm(expm(X)) + [1.0 0.0 0.0] + [0.0 1.0 0.0] + [0.0 0.0 1.0] + + A logarithm of a complex matrix:: + + >>> X = matrix([[2+j, 1, 3], [1-j, 1-2*j, 1], [-4, -5, j]]) + >>> B = logm(X) + >>> nprint(B) + [ (0.808757 + 0.107759j) (2.20752 + 0.202762j) (1.07376 - 0.773874j)] + [ (0.905709 - 0.107795j) (0.0287395 - 0.824993j) (0.111619 + 0.514272j)] + [(-0.930151 + 0.399512j) (-2.06266 - 0.674397j) (0.791552 + 0.519839j)] + >>> chop(expm(B)) + [(2.0 + 1.0j) 1.0 3.0] + [(1.0 - 1.0j) (1.0 - 2.0j) 1.0] + [ -4.0 -5.0 (0.0 + 1.0j)] + + A matrix `X` close to the identity matrix, for which + `\log(\exp(X)) = \exp(\log(X)) = X` holds:: + + >>> X = eye(3) + hilbert(3)/4 + >>> X + [ 1.25 0.125 0.0833333333333333] + [ 0.125 1.08333333333333 0.0625] + [0.0833333333333333 0.0625 1.05] + >>> logm(expm(X)) + [ 1.25 0.125 0.0833333333333333] + [ 0.125 1.08333333333333 0.0625] + [0.0833333333333333 0.0625 1.05] + >>> expm(logm(X)) + [ 1.25 0.125 0.0833333333333333] + [ 0.125 1.08333333333333 0.0625] + [0.0833333333333333 0.0625 1.05] + + A logarithm of a rotation matrix, giving back the angle of + the rotation:: + + >>> t = 3.7 + >>> A = matrix([[cos(t),sin(t)],[-sin(t),cos(t)]]) + >>> chop(logm(A)) + [ 0.0 -2.58318530717959] + [2.58318530717959 0.0] + >>> (2*pi-t) + 2.58318530717959 + + For some matrices, a logarithm does not exist:: + + >>> logm([[1,0], [0,0]]) + Traceback (most recent call last): + ... 
+ ZeroDivisionError: matrix is numerically singular + + Logarithm of a matrix with large entries:: + + >>> logm(hilbert(3) * 10**20).apply(re) + [ 45.5597513593433 1.27721006042799 0.317662687717978] + [ 1.27721006042799 42.5222778973542 2.24003708791604] + [0.317662687717978 2.24003708791604 42.395212822267] + + """ + A = ctx.matrix(A) + prec = ctx.prec + try: + ctx.prec += 10 + tol = ctx.eps * 128 + I = A**0 + B = A + n = 0 + # Take n successive square roots until B is within 0.125 of the identity (inf-norm) + while 1: + B = ctx.sqrtm(B) + n += 1 + if ctx.mnorm(B-I, 'inf') < 0.125: + break + # Sum the alternating series X - X^2/2 + X^3/3 - ... with X = B - I + T = X = B-I + L = X*0 + k = 1 + while 1: + if k & 1: + L += T / k + else: + L -= T / k + T *= X + if ctx.mnorm(T, 'inf') < tol: + break + k += 1 + if k > ctx.prec: + raise ctx.NoConvergence + finally: + ctx.prec = prec + # Undo the n square roots: the series gave log(B), and log(A) = 2^n log(B) + L *= 2**n + return L + + def powm(ctx, A, r): + r""" + Computes `A^r = \exp(r \log A)` for a matrix `A` and complex + number `r`. + + **Examples** + + Powers and inverse powers of a matrix:: + + >>> from mpmath import * + >>> mp.dps = 15; mp.pretty = True + >>> A = matrix([[4,1,4],[7,8,9],[10,2,11]]) + >>> powm(A, 2) + [ 63.0 20.0 69.0] + [174.0 89.0 199.0] + [164.0 48.0 179.0] + >>> chop(powm(powm(A, 4), 1/4.)) + [ 4.0 1.0 4.0] + [ 7.0 8.0 9.0] + [10.0 2.0 11.0] + >>> powm(extraprec(20)(powm)(A, -4), -1/4.) 
+ [ 4.0 1.0 4.0] + [ 7.0 8.0 9.0] + [10.0 2.0 11.0] + >>> chop(powm(powm(A, 1+0.5j), 1/(1+0.5j))) + [ 4.0 1.0 4.0] + [ 7.0 8.0 9.0] + [10.0 2.0 11.0] + >>> powm(extraprec(5)(powm)(A, -1.5), -1/(1.5)) + [ 4.0 1.0 4.0] + [ 7.0 8.0 9.0] + [10.0 2.0 11.0] + + A Fibonacci-generating matrix:: + + >>> powm([[1,1],[1,0]], 10) + [89.0 55.0] + [55.0 34.0] + >>> fib(10) + 55.0 + >>> powm([[1,1],[1,0]], 6.5) + [(16.5166626964253 - 0.0121089837381789j) (10.2078589271083 + 0.0195927472575932j)] + [(10.2078589271083 + 0.0195927472575932j) (6.30880376931698 - 0.0317017309957721j)] + >>> (phi**6.5 - (1-phi)**6.5)/sqrt(5) + (10.2078589271083 - 0.0195927472575932j) + >>> powm([[1,1],[1,0]], 6.2) + [ (14.3076953002666 - 0.008222855781077j) (8.81733464837593 + 0.0133048601383712j)] + [(8.81733464837593 + 0.0133048601383712j) (5.49036065189071 - 0.0215277159194482j)] + >>> (phi**6.2 - (1-phi)**6.2)/sqrt(5) + (8.81733464837593 - 0.0133048601383712j) + + """ + A = ctx.matrix(A) + r = ctx.convert(r) + prec = ctx.prec + try: + ctx.prec += 10 + if ctx.isint(r): + v = A ** int(r) + elif ctx.isint(r*2): + y = int(r*2) + v = ctx.sqrtm(A) ** y + else: + v = ctx.expm(r*ctx.logm(A)) + finally: + ctx.prec = prec + v *= 1 + return v diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/eigen.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/eigen.py new file mode 100644 index 0000000000000000000000000000000000000000..885d604203195b695183329acc637de91aeaf5ea --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/eigen.py @@ -0,0 +1,877 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +################################################################################################## +# module for the eigenvalue problem +# Copyright 2013 Timo Hartmann (thartmann15 at gmail.com) +# +# todo: +# - implement balancing +# - agressive early deflation +# 
+################################################################################################## + +""" +The eigenvalue problem +---------------------- + +This file contains routines for the eigenvalue problem. + +high level routines: + + hessenberg : reduction of a real or complex square matrix to upper Hessenberg form + schur : reduction of a real or complex square matrix to upper Schur form + eig : eigenvalues and eigenvectors of a real or complex square matrix + +low level routines: + + hessenberg_reduce_0 : reduction of a real or complex square matrix to upper Hessenberg form + hessenberg_reduce_1 : auxiliary routine to hessenberg_reduce_0 + qr_step : a single implicitly shifted QR step for an upper Hessenberg matrix + hessenberg_qr : Schur decomposition of an upper Hessenberg matrix + eig_tr_r : right eigenvectors of an upper triangular matrix + eig_tr_l : left eigenvectors of an upper triangular matrix +""" + +from ..libmp.backend import xrange + +class Eigen(object): + pass + +def defun(f): + setattr(Eigen, f.__name__, f) + return f + +def hessenberg_reduce_0(ctx, A, T): + """ + This routine computes the (upper) Hessenberg decomposition of a square matrix A. + Given A, an unitary matrix Q is calculated such that + + Q' A Q = H and Q' Q = Q Q' = 1 + + where H is an upper Hessenberg matrix, meaning that it only contains zeros + below the first subdiagonal. Here ' denotes the hermitian transpose (i.e. + transposition and conjugation). + + parameters: + A (input/output) On input, A contains the square matrix A of + dimension (n,n). On output, A contains a compressed representation + of Q and H. + T (output) An array of length n containing the first elements of + the Householder reflectors. + """ + + # internally we work with householder reflections from the right. + # let u be a row vector (i.e. u[i]=A[i,:i]). then + # Q is build up by reflectors of the type (1-v'v) where v is a suitable + # modification of u. 
these reflectors are applyed to A from the right. + # because we work with reflectors from the right we have to start with + # the bottom row of A and work then upwards (this corresponds to + # some kind of RQ decomposition). + # the first part of the vectors v (i.e. A[i,:(i-1)]) are stored as row vectors + # in the lower left part of A (excluding the diagonal and subdiagonal). + # the last entry of v is stored in T. + # the upper right part of A (including diagonal and subdiagonal) becomes H. + + + n = A.rows + if n <= 2: return + + for i in xrange(n-1, 1, -1): + + # scale the vector + + scale = 0 + for k in xrange(0, i): + scale += abs(ctx.re(A[i,k])) + abs(ctx.im(A[i,k])) + + scale_inv = 0 + if scale != 0: + scale_inv = 1 / scale + + if scale == 0 or ctx.isinf(scale_inv): + # sadly there are floating point numbers not equal to zero whose reciprocal is infinity + T[i] = 0 + A[i,i-1] = 0 + continue + + # calculate parameters for housholder transformation + + H = 0 + for k in xrange(0, i): + A[i,k] *= scale_inv + rr = ctx.re(A[i,k]) + ii = ctx.im(A[i,k]) + H += rr * rr + ii * ii + + F = A[i,i-1] + f = abs(F) + G = ctx.sqrt(H) + A[i,i-1] = - G * scale + + if f == 0: + T[i] = G + else: + ff = F / f + T[i] = F + G * ff + A[i,i-1] *= ff + + H += G * f + H = 1 / ctx.sqrt(H) + + T[i] *= H + for k in xrange(0, i - 1): + A[i,k] *= H + + for j in xrange(0, i): + # apply housholder transformation (from right) + + G = ctx.conj(T[i]) * A[j,i-1] + for k in xrange(0, i-1): + G += ctx.conj(A[i,k]) * A[j,k] + + A[j,i-1] -= G * T[i] + for k in xrange(0, i-1): + A[j,k] -= G * A[i,k] + + for j in xrange(0, n): + # apply housholder transformation (from left) + + G = T[i] * A[i-1,j] + for k in xrange(0, i-1): + G += A[i,k] * A[k,j] + + A[i-1,j] -= G * ctx.conj(T[i]) + for k in xrange(0, i-1): + A[k,j] -= G * ctx.conj(A[i,k]) + + + +def hessenberg_reduce_1(ctx, A, T): + """ + This routine forms the unitary matrix Q described in hessenberg_reduce_0. 
+ + parameters: + A (input/output) On input, A is the same matrix as delivered by + hessenberg_reduce_0. On output, A is set to Q. + + T (input) On input, T is the same array as delivered by hessenberg_reduce_0. + """ + + n = A.rows + + if n == 1: + A[0,0] = 1 + return + + A[0,0] = A[1,1] = 1 + A[0,1] = A[1,0] = 0 + + for i in xrange(2, n): + if T[i] != 0: + + for j in xrange(0, i): + G = T[i] * A[i-1,j] + for k in xrange(0, i-1): + G += A[i,k] * A[k,j] + + A[i-1,j] -= G * ctx.conj(T[i]) + for k in xrange(0, i-1): + A[k,j] -= G * ctx.conj(A[i,k]) + + A[i,i] = 1 + for j in xrange(0, i): + A[j,i] = A[i,j] = 0 + + + +@defun +def hessenberg(ctx, A, overwrite_a = False): + """ + This routine computes the Hessenberg decomposition of a square matrix A. + Given A, an unitary matrix Q is determined such that + + Q' A Q = H and Q' Q = Q Q' = 1 + + where H is an upper right Hessenberg matrix. Here ' denotes the hermitian + transpose (i.e. transposition and conjugation). + + input: + A : a real or complex square matrix + overwrite_a : if true, allows modification of A which may improve + performance. if false, A is not modified. 
+ + output: + Q : a unitary matrix + H : an upper right Hessenberg matrix + + example: + >>> from mpmath import mp + >>> A = mp.matrix([[3, -1, 2], [2, 5, -5], [-2, -3, 7]]) + >>> Q, H = mp.hessenberg(A) + >>> mp.nprint(H, 3) # doctest:+SKIP + [ 3.15 2.23 4.44] + [-0.769 4.85 3.05] + [ 0.0 3.61 7.0] + >>> print(mp.chop(A - Q * H * Q.transpose_conj())) + [0.0 0.0 0.0] + [0.0 0.0 0.0] + [0.0 0.0 0.0] + + return value: (Q, H) + """ + + n = A.rows + + if n == 1: + return (ctx.matrix([[1]]), A) + + if not overwrite_a: + A = A.copy() + + T = ctx.matrix(n, 1) + + hessenberg_reduce_0(ctx, A, T) + Q = A.copy() + hessenberg_reduce_1(ctx, Q, T) + + for x in xrange(n): + for y in xrange(x+2, n): + A[y,x] = 0 + + return Q, A + + +########################################################################### + + +def qr_step(ctx, n0, n1, A, Q, shift): + """ + This subroutine executes a single implicitly shifted QR step applied to an + upper Hessenberg matrix A. Given A and shift as input, first a QR + decomposition is calculated: + + Q R = A - shift * 1 . + + The output is then the following matrix: + + R Q + shift * 1 + + parameters: + n0, n1 (input) Two integers which specify the submatrix A[n0:n1,n0:n1] + on which this subroutine operates. The subdiagonal elements + to the left and below this submatrix must be deflated (i.e. zero). + The following restriction is imposed: n1>=n0+2 + A (input/output) On input, A is an upper Hessenberg matrix. + On output, A is replaced by "R Q + shift * 1" + Q (input/output) The parameter Q is multiplied by the unitary matrix + Q arising from the QR decomposition. Q can also be false, in which + case the unitary matrix Q is not computed. + shift (input) a complex number specifying the shift. Ideally close to an + eigenvalue of the bottommost part of the submatrix A[n0:n1,n0:n1]. + + references: + Stoer, Bulirsch - Introduction to Numerical Analysis. 
+ Kresser : Numerical Methods for General and Structured Eigenvalue Problems + """ + + # implicitly shifted and bulge chasing is explained at p.398/399 in "Stoer, Bulirsch - Introduction to Numerical Analysis" + # for bulge chasing see also "Watkins - The Matrix Eigenvalue Problem" sec.4.5,p.173 + + # the Givens rotation we used is determined as follows: let c,s be two complex + # numbers. then we have following relation: + # + # v = sqrt(|c|^2 + |s|^2) + # + # 1/v [ c~ s~] [c] = [v] + # [-s c ] [s] [0] + # + # the matrix on the left is our Givens rotation. + + n = A.rows + + # first step + + # calculate givens rotation + c = A[n0 ,n0] - shift + s = A[n0+1,n0] + + v = ctx.hypot(ctx.hypot(ctx.re(c), ctx.im(c)), ctx.hypot(ctx.re(s), ctx.im(s))) + + if v == 0: + v = 1 + c = 1 + s = 0 + else: + c /= v + s /= v + + cc = ctx.conj(c) + cs = ctx.conj(s) + + for k in xrange(n0, n): + # apply givens rotation from the left + x = A[n0 ,k] + y = A[n0+1,k] + A[n0 ,k] = cc * x + cs * y + A[n0+1,k] = c * y - s * x + + for k in xrange(min(n1, n0+3)): + # apply givens rotation from the right + x = A[k,n0 ] + y = A[k,n0+1] + A[k,n0 ] = c * x + s * y + A[k,n0+1] = cc * y - cs * x + + if not isinstance(Q, bool): + for k in xrange(n): + # eigenvectors + x = Q[k,n0 ] + y = Q[k,n0+1] + Q[k,n0 ] = c * x + s * y + Q[k,n0+1] = cc * y - cs * x + + # chase the bulge + + for j in xrange(n0, n1 - 2): + # calculate givens rotation + + c = A[j+1,j] + s = A[j+2,j] + + v = ctx.hypot(ctx.hypot(ctx.re(c), ctx.im(c)), ctx.hypot(ctx.re(s), ctx.im(s))) + + if v == 0: + A[j+1,j] = 0 + v = 1 + c = 1 + s = 0 + else: + A[j+1,j] = v + c /= v + s /= v + + A[j+2,j] = 0 + + cc = ctx.conj(c) + cs = ctx.conj(s) + + for k in xrange(j+1, n): + # apply givens rotation from the left + x = A[j+1,k] + y = A[j+2,k] + A[j+1,k] = cc * x + cs * y + A[j+2,k] = c * y - s * x + + for k in xrange(0, min(n1, j+4)): + # apply givens rotation from the right + x = A[k,j+1] + y = A[k,j+2] + A[k,j+1] = c * x + s * y + A[k,j+2] = cc * 
y - cs * x + + if not isinstance(Q, bool): + for k in xrange(0, n): + # eigenvectors + x = Q[k,j+1] + y = Q[k,j+2] + Q[k,j+1] = c * x + s * y + Q[k,j+2] = cc * y - cs * x + + + +def hessenberg_qr(ctx, A, Q): + """ + This routine computes the Schur decomposition of an upper Hessenberg matrix A. + Given A, an unitary matrix Q is determined such that + + Q' A Q = R and Q' Q = Q Q' = 1 + + where R is an upper right triangular matrix. Here ' denotes the hermitian + transpose (i.e. transposition and conjugation). + + parameters: + A (input/output) On input, A contains an upper Hessenberg matrix. + On output, A is replace by the upper right triangluar matrix R. + + Q (input/output) The parameter Q is multiplied by the unitary + matrix Q arising from the Schur decomposition. Q can also be + false, in which case the unitary matrix Q is not computated. + """ + + n = A.rows + + norm = 0 + for x in xrange(n): + for y in xrange(min(x+2, n)): + norm += ctx.re(A[y,x]) ** 2 + ctx.im(A[y,x]) ** 2 + norm = ctx.sqrt(norm) / n + + if norm == 0: + return + + n0 = 0 + n1 = n + + eps = ctx.eps / (100 * n) + maxits = ctx.dps * 4 + + its = totalits = 0 + + while 1: + # kressner p.32 algo 3 + # the active submatrix is A[n0:n1,n0:n1] + + k = n0 + + while k + 1 < n1: + s = abs(ctx.re(A[k,k])) + abs(ctx.im(A[k,k])) + abs(ctx.re(A[k+1,k+1])) + abs(ctx.im(A[k+1,k+1])) + if s < eps * norm: + s = norm + if abs(A[k+1,k]) < eps * s: + break + k += 1 + + if k + 1 < n1: + # deflation found at position (k+1, k) + + A[k+1,k] = 0 + n0 = k + 1 + + its = 0 + + if n0 + 1 >= n1: + # block of size at most two has converged + n0 = 0 + n1 = k + 1 + if n1 < 2: + # QR algorithm has converged + return + else: + if (its % 30) == 10: + # exceptional shift + shift = A[n1-1,n1-2] + elif (its % 30) == 20: + # exceptional shift + shift = abs(A[n1-1,n1-2]) + elif (its % 30) == 29: + # exceptional shift + shift = norm + else: + # A = [ a b ] det(x-A)=x*x-x*tr(A)+det(A) + # [ c d ] + # + # eigenvalues bad: 
(tr(A)+sqrt((tr(A))**2-4*det(A)))/2 + # bad because of cancellation if |c| is small and |a-d| is small, too. + # + # eigenvalues good: (a+d+sqrt((a-d)**2+4*b*c))/2 + + t = A[n1-2,n1-2] + A[n1-1,n1-1] + s = (A[n1-1,n1-1] - A[n1-2,n1-2]) ** 2 + 4 * A[n1-1,n1-2] * A[n1-2,n1-1] + if ctx.re(s) > 0: + s = ctx.sqrt(s) + else: + s = ctx.sqrt(-s) * 1j + a = (t + s) / 2 + b = (t - s) / 2 + if abs(A[n1-1,n1-1] - a) > abs(A[n1-1,n1-1] - b): + shift = b + else: + shift = a + + its += 1 + totalits += 1 + + qr_step(ctx, n0, n1, A, Q, shift) + + if its > maxits: + raise RuntimeError("qr: failed to converge after %d steps" % its) + + +@defun +def schur(ctx, A, overwrite_a = False): + """ + This routine computes the Schur decomposition of a square matrix A. + Given A, an unitary matrix Q is determined such that + + Q' A Q = R and Q' Q = Q Q' = 1 + + where R is an upper right triangular matrix. Here ' denotes the + hermitian transpose (i.e. transposition and conjugation). + + input: + A : a real or complex square matrix + overwrite_a : if true, allows modification of A which may improve + performance. if false, A is not modified. + + output: + Q : an unitary matrix + R : an upper right triangular matrix + + return value: (Q, R) + + example: + >>> from mpmath import mp + >>> A = mp.matrix([[3, -1, 2], [2, 5, -5], [-2, -3, 7]]) + >>> Q, R = mp.schur(A) + >>> mp.nprint(R, 3) # doctest:+SKIP + [2.0 0.417 -2.53] + [0.0 4.0 -4.74] + [0.0 0.0 9.0] + >>> print(mp.chop(A - Q * R * Q.transpose_conj())) + [0.0 0.0 0.0] + [0.0 0.0 0.0] + [0.0 0.0 0.0] + + warning: The Schur decomposition is not unique. 
+ """ + + n = A.rows + + if n == 1: + return (ctx.matrix([[1]]), A) + + if not overwrite_a: + A = A.copy() + + T = ctx.matrix(n, 1) + + hessenberg_reduce_0(ctx, A, T) + Q = A.copy() + hessenberg_reduce_1(ctx, Q, T) + + for x in xrange(n): + for y in xrange(x + 2, n): + A[y,x] = 0 + + hessenberg_qr(ctx, A, Q) + + return Q, A + + +def eig_tr_r(ctx, A): + """ + This routine calculates the right eigenvectors of an upper right triangular matrix. + + input: + A an upper right triangular matrix + + output: + ER a matrix whose columns form the right eigenvectors of A + + return value: ER + """ + + # this subroutine is inspired by the lapack routines ctrevc.f,clatrs.f + + n = A.rows + + ER = ctx.eye(n) + + eps = ctx.eps + + unfl = ctx.ldexp(ctx.one, -ctx.prec * 30) + # since mpmath effectively has no limits on the exponent, we simply scale doubles up + # original double has prec*20 + + smlnum = unfl * (n / eps) + simin = 1 / ctx.sqrt(eps) + + rmax = 1 + + for i in xrange(1, n): + s = A[i,i] + + smin = max(eps * abs(s), smlnum) + + for j in xrange(i - 1, -1, -1): + + r = 0 + for k in xrange(j + 1, i + 1): + r += A[j,k] * ER[k,i] + + t = A[j,j] - s + if abs(t) < smin: + t = smin + + r = -r / t + ER[j,i] = r + + rmax = max(rmax, abs(r)) + if rmax > simin: + for k in xrange(j, i+1): + ER[k,i] /= rmax + rmax = 1 + + if rmax != 1: + for k in xrange(0, i + 1): + ER[k,i] /= rmax + + return ER + +def eig_tr_l(ctx, A): + """ + This routine calculates the left eigenvectors of an upper right triangular matrix. 
+ + input: + A an upper right triangular matrix + + output: + EL a matrix whose rows form the left eigenvectors of A + + return value: EL + """ + + n = A.rows + + EL = ctx.eye(n) + + eps = ctx.eps + + unfl = ctx.ldexp(ctx.one, -ctx.prec * 30) + # since mpmath effectively has no limits on the exponent, we simply scale doubles up + # original double has prec*20 + + smlnum = unfl * (n / eps) + simin = 1 / ctx.sqrt(eps) + + rmax = 1 + + for i in xrange(0, n - 1): + s = A[i,i] + + smin = max(eps * abs(s), smlnum) + + for j in xrange(i + 1, n): + + r = 0 + for k in xrange(i, j): + r += EL[i,k] * A[k,j] + + t = A[j,j] - s + if abs(t) < smin: + t = smin + + r = -r / t + EL[i,j] = r + + rmax = max(rmax, abs(r)) + if rmax > simin: + for k in xrange(i, j + 1): + EL[i,k] /= rmax + rmax = 1 + + if rmax != 1: + for k in xrange(i, n): + EL[i,k] /= rmax + + return EL + +@defun +def eig(ctx, A, left = False, right = True, overwrite_a = False): + """ + This routine computes the eigenvalues and optionally the left and right + eigenvectors of a square matrix A. Given A, a vector E and matrices ER + and EL are calculated such that + + A ER[:,i] = E[i] ER[:,i] + EL[i,:] A = EL[i,:] E[i] + + E contains the eigenvalues of A. The columns of ER contain the right eigenvectors + of A whereas the rows of EL contain the left eigenvectors. + + + input: + A : a real or complex square matrix of shape (n, n) + left : if true, the left eigenvectors are calculated. + right : if true, the right eigenvectors are calculated. + overwrite_a : if true, allows modification of A which may improve + performance. if false, A is not modified. + + output: + E : a list of length n containing the eigenvalues of A. + ER : a matrix whose columns contain the right eigenvectors of A. + EL : a matrix whose rows contain the left eigenvectors of A. + + return values: + E if left and right are both false. + (E, ER) if right is true and left is false. + (E, EL) if left is true and right is false. 
+ (E, EL, ER) if left and right are true. + + + examples: + >>> from mpmath import mp + >>> A = mp.matrix([[3, -1, 2], [2, 5, -5], [-2, -3, 7]]) + >>> E, ER = mp.eig(A) + >>> print(mp.chop(A * ER[:,0] - E[0] * ER[:,0])) + [0.0] + [0.0] + [0.0] + + >>> E, EL, ER = mp.eig(A,left = True, right = True) + >>> E, EL, ER = mp.eig_sort(E, EL, ER) + >>> mp.nprint(E) + [2.0, 4.0, 9.0] + >>> print(mp.chop(A * ER[:,0] - E[0] * ER[:,0])) + [0.0] + [0.0] + [0.0] + >>> print(mp.chop( EL[0,:] * A - EL[0,:] * E[0])) + [0.0 0.0 0.0] + + warning: + - If there are multiple eigenvalues, the eigenvectors do not necessarily + span the whole vectorspace, i.e. ER and EL may have not full rank. + Furthermore in that case the eigenvectors are numerical ill-conditioned. + - In the general case the eigenvalues have no natural order. + + see also: + - eigh (or eigsy, eighe) for the symmetric eigenvalue problem. + - eig_sort for sorting of eigenvalues and eigenvectors + """ + + n = A.rows + + if n == 1: + if left and (not right): + return ([A[0]], ctx.matrix([[1]])) + + if right and (not left): + return ([A[0]], ctx.matrix([[1]])) + + return ([A[0]], ctx.matrix([[1]]), ctx.matrix([[1]])) + + if not overwrite_a: + A = A.copy() + + T = ctx.zeros(n, 1) + + hessenberg_reduce_0(ctx, A, T) + + if left or right: + Q = A.copy() + hessenberg_reduce_1(ctx, Q, T) + else: + Q = False + + for x in xrange(n): + for y in xrange(x + 2, n): + A[y,x] = 0 + + hessenberg_qr(ctx, A, Q) + + E = [0 for i in xrange(n)] + for i in xrange(n): + E[i] = A[i,i] + + if not (left or right): + return E + + if left: + EL = eig_tr_l(ctx, A) + EL = EL * Q.transpose_conj() + + if right: + ER = eig_tr_r(ctx, A) + ER = Q * ER + + if left and (not right): + return (E, EL) + + if right and (not left): + return (E, ER) + + return (E, EL, ER) + +@defun +def eig_sort(ctx, E, EL = False, ER = False, f = "real"): + """ + This routine sorts the eigenvalues and eigenvectors delivered by ``eig``. 
+ + parameters: + E : the eigenvalues as delivered by eig + EL : the left eigenvectors as delivered by eig, or false + ER : the right eigenvectors as delivered by eig, or false + f : either a string ("real" sort by increasing real part, "imag" sort by + increasing imag part, "abs" sort by absolute value) or a function + mapping complexs to the reals, i.e. ``f = lambda x: -mp.re(x) `` + would sort the eigenvalues by decreasing real part. + + return values: + E if EL and ER are both false. + (E, ER) if ER is not false and left is false. + (E, EL) if EL is not false and right is false. + (E, EL, ER) if EL and ER are not false. + + example: + >>> from mpmath import mp + >>> A = mp.matrix([[3, -1, 2], [2, 5, -5], [-2, -3, 7]]) + >>> E, EL, ER = mp.eig(A,left = True, right = True) + >>> E, EL, ER = mp.eig_sort(E, EL, ER) + >>> mp.nprint(E) + [2.0, 4.0, 9.0] + >>> E, EL, ER = mp.eig_sort(E, EL, ER,f = lambda x: -mp.re(x)) + >>> mp.nprint(E) + [9.0, 4.0, 2.0] + >>> print(mp.chop(A * ER[:,0] - E[0] * ER[:,0])) + [0.0] + [0.0] + [0.0] + >>> print(mp.chop( EL[0,:] * A - EL[0,:] * E[0])) + [0.0 0.0 0.0] + """ + + if isinstance(f, str): + if f == "real": + f = ctx.re + elif f == "imag": + f = ctx.im + elif f == "abs": + f = abs + else: + raise RuntimeError("unknown function %s" % f) + + n = len(E) + + # Sort eigenvalues (bubble-sort) + + for i in xrange(n): + imax = i + s = f(E[i]) # s is the current maximal element + + for j in xrange(i + 1, n): + c = f(E[j]) + if c < s: + s = c + imax = j + + if imax != i: + # swap eigenvalues + + z = E[i] + E[i] = E[imax] + E[imax] = z + + if not isinstance(EL, bool): + for j in xrange(n): + z = EL[i,j] + EL[i,j] = EL[imax,j] + EL[imax,j] = z + + if not isinstance(ER, bool): + for j in xrange(n): + z = ER[j,i] + ER[j,i] = ER[j,imax] + ER[j,imax] = z + + if isinstance(EL, bool) and isinstance(ER, bool): + return E + + if isinstance(EL, bool) and not(isinstance(ER, bool)): + return (E, ER) + + if isinstance(ER, bool) and not(isinstance(EL, 
bool)): + return (E, EL) + + return (E, EL, ER) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/linalg.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/linalg.py new file mode 100644 index 0000000000000000000000000000000000000000..e2fe643e809822e3d05a52b73c965edb622f9af9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/linalg.py @@ -0,0 +1,790 @@ +""" +Linear algebra +-------------- + +Linear equations +................ + +Basic linear algebra is implemented; you can for example solve the linear +equation system:: + + x + 2*y = -10 + 3*x + 4*y = 10 + +using ``lu_solve``:: + + >>> from mpmath import * + >>> mp.pretty = False + >>> A = matrix([[1, 2], [3, 4]]) + >>> b = matrix([-10, 10]) + >>> x = lu_solve(A, b) + >>> x + matrix( + [['30.0'], + ['-20.0']]) + +If you don't trust the result, use ``residual`` to calculate the residual ||A*x-b||:: + + >>> residual(A, x, b) + matrix( + [['3.46944695195361e-18'], + ['3.46944695195361e-18']]) + >>> str(eps) + '2.22044604925031e-16' + +As you can see, the solution is quite accurate. The error is caused by the +inaccuracy of the internal floating point arithmetic. Though, it's even smaller +than the current machine epsilon, which basically means you can trust the +result. + +If you need more speed, use NumPy, or ``fp.lu_solve`` for a floating-point computation. + + >>> fp.lu_solve(A, b) # doctest: +ELLIPSIS + matrix(...) + +``lu_solve`` accepts overdetermined systems. It is usually not possible to solve +such systems, so the residual is minimized instead. Internally this is done +using Cholesky decomposition to compute a least squares approximation. This means +that that ``lu_solve`` will square the errors. If you can't afford this, use +``qr_solve`` instead. It is twice as slow but more accurate, and it calculates +the residual automatically. + + +Matrix factorization +.................... 
+ +The function ``lu`` computes an explicit LU factorization of a matrix:: + + >>> P, L, U = lu(matrix([[0,2,3],[4,5,6],[7,8,9]])) + >>> print(P) + [0.0 0.0 1.0] + [1.0 0.0 0.0] + [0.0 1.0 0.0] + >>> print(L) + [ 1.0 0.0 0.0] + [ 0.0 1.0 0.0] + [0.571428571428571 0.214285714285714 1.0] + >>> print(U) + [7.0 8.0 9.0] + [0.0 2.0 3.0] + [0.0 0.0 0.214285714285714] + >>> print(P.T*L*U) + [0.0 2.0 3.0] + [4.0 5.0 6.0] + [7.0 8.0 9.0] + +Interval matrices +----------------- + +Matrices may contain interval elements. This allows one to perform +basic linear algebra operations such as matrix multiplication +and equation solving with rigorous error bounds:: + + >>> a = iv.matrix([['0.1','0.3','1.0'], + ... ['7.1','5.5','4.8'], + ... ['3.2','4.4','5.6']]) + >>> + >>> b = iv.matrix(['4','0.6','0.5']) + >>> c = iv.lu_solve(a, b) + >>> print(c) + [ [5.2582327113062568605927528666, 5.25823271130625686059275702219]] + [[-13.1550493962678375411635581388, -13.1550493962678375411635540152]] + [ [7.42069154774972557628979076189, 7.42069154774972557628979190734]] + >>> print(a*c) + [ [3.99999999999999999999999844904, 4.00000000000000000000000155096]] + [[0.599999999999999999999968898009, 0.600000000000000000000031763736]] + [[0.499999999999999999999979320485, 0.500000000000000000000020679515]] +""" + +# TODO: +# *implement high-level qr() +# *test unitvector +# *iterative solving + +from copy import copy + +from ..libmp.backend import xrange + +class LinearAlgebraMethods(object): + + def LU_decomp(ctx, A, overwrite=False, use_cache=True): + """ + LU-factorization of a n*n matrix using the Gauss algorithm. + Returns L and U in one matrix and the pivot indices. + + Use overwrite to specify whether A will be overwritten with L and U. 
+ """ + if not A.rows == A.cols: + raise ValueError('need n*n matrix') + # get from cache if possible + if use_cache and isinstance(A, ctx.matrix) and A._LU: + return A._LU + if not overwrite: + orig = A + A = A.copy() + tol = ctx.absmin(ctx.mnorm(A,1) * ctx.eps) # each pivot element has to be bigger + n = A.rows + p = [None]*(n - 1) + for j in xrange(n - 1): + # pivoting, choose max(abs(reciprocal row sum)*abs(pivot element)) + biggest = 0 + for k in xrange(j, n): + s = ctx.fsum([ctx.absmin(A[k,l]) for l in xrange(j, n)]) + if ctx.absmin(s) <= tol: + raise ZeroDivisionError('matrix is numerically singular') + current = 1/s * ctx.absmin(A[k,j]) + if current > biggest: # TODO: what if equal? + biggest = current + p[j] = k + # swap rows according to p + ctx.swap_row(A, j, p[j]) + if ctx.absmin(A[j,j]) <= tol: + raise ZeroDivisionError('matrix is numerically singular') + # calculate elimination factors and add rows + for i in xrange(j + 1, n): + A[i,j] /= A[j,j] + for k in xrange(j + 1, n): + A[i,k] -= A[i,j]*A[j,k] + if ctx.absmin(A[n - 1,n - 1]) <= tol: + raise ZeroDivisionError('matrix is numerically singular') + # cache decomposition + if not overwrite and isinstance(orig, ctx.matrix): + orig._LU = (A, p) + return A, p + + def L_solve(ctx, L, b, p=None): + """ + Solve the lower part of a LU factorized matrix for y. + """ + if L.rows != L.cols: + raise RuntimeError("need n*n matrix") + n = L.rows + if len(b) != n: + raise ValueError("Value should be equal to n") + b = copy(b) + if p: # swap b according to p + for k in xrange(0, len(p)): + ctx.swap_row(b, k, p[k]) + # solve + for i in xrange(1, n): + for j in xrange(i): + b[i] -= L[i,j] * b[j] + return b + + def U_solve(ctx, U, y): + """ + Solve the upper part of a LU factorized matrix for x. 
+ """ + if U.rows != U.cols: + raise RuntimeError("need n*n matrix") + n = U.rows + if len(y) != n: + raise ValueError("Value should be equal to n") + x = copy(y) + for i in xrange(n - 1, -1, -1): + for j in xrange(i + 1, n): + x[i] -= U[i,j] * x[j] + x[i] /= U[i,i] + return x + + def lu_solve(ctx, A, b, **kwargs): + """ + Ax = b => x + + Solve a determined or overdetermined linear equations system. + Fast LU decomposition is used, which is less accurate than QR decomposition + (especially for overdetermined systems), but it's twice as efficient. + Use qr_solve if you want more precision or have to solve a very ill- + conditioned system. + + If you specify real=True, it does not check for overdeterminded complex + systems. + """ + prec = ctx.prec + try: + ctx.prec += 10 + # do not overwrite A nor b + A, b = ctx.matrix(A, **kwargs).copy(), ctx.matrix(b, **kwargs).copy() + if A.rows < A.cols: + raise ValueError('cannot solve underdetermined system') + if A.rows > A.cols: + # use least-squares method if overdetermined + # (this increases errors) + AH = A.H + A = AH * A + b = AH * b + if (kwargs.get('real', False) or + not sum(type(i) is ctx.mpc for i in A)): + # TODO: necessary to check also b? + x = ctx.cholesky_solve(A, b) + else: + x = ctx.lu_solve(A, b) + else: + # LU factorization + A, p = ctx.LU_decomp(A) + b = ctx.L_solve(A, b, p) + x = ctx.U_solve(A, b) + finally: + ctx.prec = prec + return x + + def improve_solution(ctx, A, x, b, maxsteps=1): + """ + Improve a solution to a linear equation system iteratively. + + This re-uses the LU decomposition and is thus cheap. + Usually 3 up to 4 iterations are giving the maximal improvement. + """ + if A.rows != A.cols: + raise RuntimeError("need n*n matrix") # TODO: really? 
+ for _ in xrange(maxsteps): + r = ctx.residual(A, x, b) + if ctx.norm(r, 2) < 10*ctx.eps: + break + # this uses cached LU decomposition and is thus cheap + dx = ctx.lu_solve(A, -r) + x += dx + return x + + def lu(ctx, A): + """ + A -> P, L, U + + LU factorisation of a square matrix A. L is the lower, U the upper part. + P is the permutation matrix indicating the row swaps. + + P*A = L*U + + If you need efficiency, use the low-level method LU_decomp instead, it's + much more memory efficient. + """ + # get factorization + A, p = ctx.LU_decomp(A) + n = A.rows + L = ctx.matrix(n) + U = ctx.matrix(n) + for i in xrange(n): + for j in xrange(n): + if i > j: + L[i,j] = A[i,j] + elif i == j: + L[i,j] = 1 + U[i,j] = A[i,j] + else: + U[i,j] = A[i,j] + # calculate permutation matrix + P = ctx.eye(n) + for k in xrange(len(p)): + ctx.swap_row(P, k, p[k]) + return P, L, U + + def unitvector(ctx, n, i): + """ + Return the i-th n-dimensional unit vector. + """ + assert 0 < i <= n, 'this unit vector does not exist' + return [ctx.zero]*(i-1) + [ctx.one] + [ctx.zero]*(n-i) + + def inverse(ctx, A, **kwargs): + """ + Calculate the inverse of a matrix. + + If you want to solve an equation system Ax = b, it's recommended to use + solve(A, b) instead, it's about 3 times more efficient. 
+ """ + prec = ctx.prec + try: + ctx.prec += 10 + # do not overwrite A + A = ctx.matrix(A, **kwargs).copy() + n = A.rows + # get LU factorisation + A, p = ctx.LU_decomp(A) + cols = [] + # calculate unit vectors and solve corresponding system to get columns + for i in xrange(1, n + 1): + e = ctx.unitvector(n, i) + y = ctx.L_solve(A, e, p) + cols.append(ctx.U_solve(A, y)) + # convert columns to matrix + inv = [] + for i in xrange(n): + row = [] + for j in xrange(n): + row.append(cols[j][i]) + inv.append(row) + result = ctx.matrix(inv, **kwargs) + finally: + ctx.prec = prec + return result + + def householder(ctx, A): + """ + (A|b) -> H, p, x, res + + (A|b) is the coefficient matrix with left hand side of an optionally + overdetermined linear equation system. + H and p contain all information about the transformation matrices. + x is the solution, res the residual. + """ + if not isinstance(A, ctx.matrix): + raise TypeError("A should be a type of ctx.matrix") + m = A.rows + n = A.cols + if m < n - 1: + raise RuntimeError("Columns should not be less than rows") + # calculate Householder matrix + p = [] + for j in xrange(0, n - 1): + s = ctx.fsum(abs(A[i,j])**2 for i in xrange(j, m)) + if not abs(s) > ctx.eps: + raise ValueError('matrix is numerically singular') + p.append(-ctx.sign(ctx.re(A[j,j])) * ctx.sqrt(s)) + kappa = ctx.one / (s - p[j] * A[j,j]) + A[j,j] -= p[j] + for k in xrange(j+1, n): + y = ctx.fsum(ctx.conj(A[i,j]) * A[i,k] for i in xrange(j, m)) * kappa + for i in xrange(j, m): + A[i,k] -= A[i,j] * y + # solve Rx = c1 + x = [A[i,n - 1] for i in xrange(n - 1)] + for i in xrange(n - 2, -1, -1): + x[i] -= ctx.fsum(A[i,j] * x[j] for j in xrange(i + 1, n - 1)) + x[i] /= p[i] + # calculate residual + if not m == n - 1: + r = [A[m-1-i, n-1] for i in xrange(m - n + 1)] + else: + # determined system, residual should be 0 + r = [0]*m # maybe a bad idea, changing r[i] will change all elements + return A, p, x, r + + #def qr(ctx, A): + # """ + # A -> Q, R + # + # QR 
factorisation of a square matrix A using Householder decomposition. + # Q is orthogonal, this leads to very few numerical errors. + # + # A = Q*R + # """ + # H, p, x, res = householder(A) + # TODO: implement this + + def residual(ctx, A, x, b, **kwargs): + """ + Calculate the residual of a solution to a linear equation system. + + r = A*x - b for A*x = b + """ + oldprec = ctx.prec + try: + ctx.prec *= 2 + A, x, b = ctx.matrix(A, **kwargs), ctx.matrix(x, **kwargs), ctx.matrix(b, **kwargs) + return A*x - b + finally: + ctx.prec = oldprec + + def qr_solve(ctx, A, b, norm=None, **kwargs): + """ + Ax = b => x, ||Ax - b|| + + Solve a determined or overdetermined linear equations system and + calculate the norm of the residual (error). + QR decomposition using Householder factorization is applied, which gives very + accurate results even for ill-conditioned matrices. qr_solve is twice as + efficient. + """ + if norm is None: + norm = ctx.norm + prec = ctx.prec + try: + ctx.prec += 10 + # do not overwrite A nor b + A, b = ctx.matrix(A, **kwargs).copy(), ctx.matrix(b, **kwargs).copy() + if A.rows < A.cols: + raise ValueError('cannot solve underdetermined system') + H, p, x, r = ctx.householder(ctx.extend(A, b)) + res = ctx.norm(r) + # calculate residual "manually" for determined systems + if res == 0: + res = ctx.norm(ctx.residual(A, x, b)) + return ctx.matrix(x, **kwargs), res + finally: + ctx.prec = prec + + def cholesky(ctx, A, tol=None): + r""" + Cholesky decomposition of a symmetric positive-definite matrix `A`. + Returns a lower triangular matrix `L` such that `A = L \times L^T`. + More generally, for a complex Hermitian positive-definite matrix, + a Cholesky decomposition satisfying `A = L \times L^H` is returned. + + The Cholesky decomposition can be used to solve linear equation + systems twice as efficiently as LU decomposition, or to + test whether `A` is positive-definite. 
+ + The optional parameter ``tol`` determines the tolerance for + verifying positive-definiteness. + + **Examples** + + Cholesky decomposition of a positive-definite symmetric matrix:: + + >>> from mpmath import * + >>> mp.dps = 25; mp.pretty = True + >>> A = eye(3) + hilbert(3) + >>> nprint(A) + [ 2.0 0.5 0.333333] + [ 0.5 1.33333 0.25] + [0.333333 0.25 1.2] + >>> L = cholesky(A) + >>> nprint(L) + [ 1.41421 0.0 0.0] + [0.353553 1.09924 0.0] + [0.235702 0.15162 1.05899] + >>> chop(A - L*L.T) + [0.0 0.0 0.0] + [0.0 0.0 0.0] + [0.0 0.0 0.0] + + Cholesky decomposition of a Hermitian matrix:: + + >>> A = eye(3) + matrix([[0,0.25j,-0.5j],[-0.25j,0,0],[0.5j,0,0]]) + >>> L = cholesky(A) + >>> nprint(L) + [ 1.0 0.0 0.0] + [(0.0 - 0.25j) (0.968246 + 0.0j) 0.0] + [ (0.0 + 0.5j) (0.129099 + 0.0j) (0.856349 + 0.0j)] + >>> chop(A - L*L.H) + [0.0 0.0 0.0] + [0.0 0.0 0.0] + [0.0 0.0 0.0] + + Attempted Cholesky decomposition of a matrix that is not positive + definite:: + + >>> A = -eye(3) + hilbert(3) + >>> L = cholesky(A) + Traceback (most recent call last): + ... + ValueError: matrix is not positive-definite + + **References** + + 1. 
[Wikipedia]_ http://en.wikipedia.org/wiki/Cholesky_decomposition + + """ + if not isinstance(A, ctx.matrix): + raise RuntimeError("A should be a type of ctx.matrix") + if not A.rows == A.cols: + raise ValueError('need n*n matrix') + if tol is None: + tol = +ctx.eps + n = A.rows + L = ctx.matrix(n) + for j in xrange(n): + c = ctx.re(A[j,j]) + if abs(c-A[j,j]) > tol: + raise ValueError('matrix is not Hermitian') + s = c - ctx.fsum((L[j,k] for k in xrange(j)), + absolute=True, squared=True) + if s < tol: + raise ValueError('matrix is not positive-definite') + L[j,j] = ctx.sqrt(s) + for i in xrange(j, n): + it1 = (L[i,k] for k in xrange(j)) + it2 = (L[j,k] for k in xrange(j)) + t = ctx.fdot(it1, it2, conjugate=True) + L[i,j] = (A[i,j] - t) / L[j,j] + return L + + def cholesky_solve(ctx, A, b, **kwargs): + """ + Ax = b => x + + Solve a symmetric positive-definite linear equation system. + This is twice as efficient as lu_solve. + + Typical use cases: + * A.T*A + * Hessian matrix + * differential equations + """ + prec = ctx.prec + try: + ctx.prec += 10 + # do not overwrite A nor b + A, b = ctx.matrix(A, **kwargs).copy(), ctx.matrix(b, **kwargs).copy() + if A.rows != A.cols: + raise ValueError('can only solve determined system') + # Cholesky factorization + L = ctx.cholesky(A) + # solve + n = L.rows + if len(b) != n: + raise ValueError("Value should be equal to n") + for i in xrange(n): + b[i] -= ctx.fsum(L[i,j] * b[j] for j in xrange(i)) + b[i] /= L[i,i] + x = ctx.U_solve(L.T, b) + return x + finally: + ctx.prec = prec + + def det(ctx, A): + """ + Calculate the determinant of a matrix. 
+ """ + prec = ctx.prec + try: + # do not overwrite A + A = ctx.matrix(A).copy() + # use LU factorization to calculate determinant + try: + R, p = ctx.LU_decomp(A) + except ZeroDivisionError: + return 0 + z = 1 + for i, e in enumerate(p): + if i != e: + z *= -1 + for i in xrange(A.rows): + z *= R[i,i] + return z + finally: + ctx.prec = prec + + def cond(ctx, A, norm=None): + """ + Calculate the condition number of a matrix using a specified matrix norm. + + The condition number estimates the sensitivity of a matrix to errors. + Example: small input errors for ill-conditioned coefficient matrices + alter the solution of the system dramatically. + + For ill-conditioned matrices it's recommended to use qr_solve() instead + of lu_solve(). This does not help with input errors however, it just avoids + to add additional errors. + + Definition: cond(A) = ||A|| * ||A**-1|| + """ + if norm is None: + norm = lambda x: ctx.mnorm(x,1) + return norm(A) * norm(ctx.inverse(A)) + + def lu_solve_mat(ctx, a, b): + """Solve a * x = b where a and b are matrices.""" + r = ctx.matrix(a.rows, b.cols) + for i in range(b.cols): + c = ctx.lu_solve(a, b.column(i)) + for j in range(len(c)): + r[j, i] = c[j] + return r + + def qr(ctx, A, mode = 'full', edps = 10): + """ + Compute a QR factorization $A = QR$ where + A is an m x n matrix of real or complex numbers where m >= n + + mode has following meanings: + (1) mode = 'raw' returns two matrixes (A, tau) in the + internal format used by LAPACK + (2) mode = 'skinny' returns the leading n columns of Q + and n rows of R + (3) Any other value returns the leading m columns of Q + and m rows of R + + edps is the increase in mp precision used for calculations + + **Examples** + + >>> from mpmath import * + >>> mp.dps = 15 + >>> mp.pretty = True + >>> A = matrix([[1, 2], [3, 4], [1, 1]]) + >>> Q, R = qr(A) + >>> Q + [-0.301511344577764 0.861640436855329 0.408248290463863] + [-0.904534033733291 -0.123091490979333 -0.408248290463863] + 
[-0.301511344577764 -0.492365963917331 0.816496580927726] + >>> R + [-3.3166247903554 -4.52267016866645] + [ 0.0 0.738548945875996] + [ 0.0 0.0] + >>> Q * R + [1.0 2.0] + [3.0 4.0] + [1.0 1.0] + >>> chop(Q.T * Q) + [1.0 0.0 0.0] + [0.0 1.0 0.0] + [0.0 0.0 1.0] + >>> B = matrix([[1+0j, 2-3j], [3+j, 4+5j]]) + >>> Q, R = qr(B) + >>> nprint(Q) + [ (-0.301511 + 0.0j) (0.0695795 - 0.95092j)] + [(-0.904534 - 0.301511j) (-0.115966 + 0.278318j)] + >>> nprint(R) + [(-3.31662 + 0.0j) (-5.72872 - 2.41209j)] + [ 0.0 (3.91965 + 0.0j)] + >>> Q * R + [(1.0 + 0.0j) (2.0 - 3.0j)] + [(3.0 + 1.0j) (4.0 + 5.0j)] + >>> chop(Q.T * Q.conjugate()) + [1.0 0.0] + [0.0 1.0] + + """ + + # check values before continuing + assert isinstance(A, ctx.matrix) + m = A.rows + n = A.cols + assert n >= 0 + assert m >= n + assert edps >= 0 + + # check for complex data type + cmplx = any(type(x) is ctx.mpc for x in A) + + # temporarily increase the precision and initialize + with ctx.extradps(edps): + tau = ctx.matrix(n,1) + A = A.copy() + + # --------------- + # FACTOR MATRIX A + # --------------- + if cmplx: + one = ctx.mpc('1.0', '0.0') + zero = ctx.mpc('0.0', '0.0') + rzero = ctx.mpf('0.0') + + # main loop to factor A (complex) + for j in xrange(0, n): + alpha = A[j,j] + alphr = ctx.re(alpha) + alphi = ctx.im(alpha) + + if (m-j) >= 2: + xnorm = ctx.fsum( A[i,j]*ctx.conj(A[i,j]) for i in xrange(j+1, m) ) + xnorm = ctx.re( ctx.sqrt(xnorm) ) + else: + xnorm = rzero + + if (xnorm == rzero) and (alphi == rzero): + tau[j] = zero + continue + + if alphr < rzero: + beta = ctx.sqrt(alphr**2 + alphi**2 + xnorm**2) + else: + beta = -ctx.sqrt(alphr**2 + alphi**2 + xnorm**2) + + tau[j] = ctx.mpc( (beta - alphr) / beta, -alphi / beta ) + t = -ctx.conj(tau[j]) + za = one / (alpha - beta) + + for i in xrange(j+1, m): + A[i,j] *= za + + A[j,j] = one + for k in xrange(j+1, n): + y = ctx.fsum(A[i,j] * ctx.conj(A[i,k]) for i in xrange(j, m)) + temp = t * ctx.conj(y) + for i in xrange(j, m): + A[i,k] += A[i,j] * temp + + 
A[j,j] = ctx.mpc(beta, '0.0') + else: + one = ctx.mpf('1.0') + zero = ctx.mpf('0.0') + + # main loop to factor A (real) + for j in xrange(0, n): + alpha = A[j,j] + + if (m-j) > 2: + xnorm = ctx.fsum( (A[i,j])**2 for i in xrange(j+1, m) ) + xnorm = ctx.sqrt(xnorm) + elif (m-j) == 2: + xnorm = abs( A[m-1,j] ) + else: + xnorm = zero + + if xnorm == zero: + tau[j] = zero + continue + + if alpha < zero: + beta = ctx.sqrt(alpha**2 + xnorm**2) + else: + beta = -ctx.sqrt(alpha**2 + xnorm**2) + + tau[j] = (beta - alpha) / beta + t = -tau[j] + da = one / (alpha - beta) + + for i in xrange(j+1, m): + A[i,j] *= da + + A[j,j] = one + for k in xrange(j+1, n): + y = ctx.fsum( A[i,j] * A[i,k] for i in xrange(j, m) ) + temp = t * y + for i in xrange(j,m): + A[i,k] += A[i,j] * temp + + A[j,j] = beta + + # return factorization in same internal format as LAPACK + if (mode == 'raw') or (mode == 'RAW'): + return A, tau + + # ---------------------------------- + # FORM Q USING BACKWARD ACCUMULATION + # ---------------------------------- + + # form R before the values are overwritten + R = A.copy() + for j in xrange(0, n): + for i in xrange(j+1, m): + R[i,j] = zero + + # set the value of p (number of columns of Q to return) + p = m + if (mode == 'skinny') or (mode == 'SKINNY'): + p = n + + # add columns to A if needed and initialize + A.cols += (p-n) + for j in xrange(0, p): + A[j,j] = one + for i in xrange(0, j): + A[i,j] = zero + + # main loop to form Q + for j in xrange(n-1, -1, -1): + t = -tau[j] + A[j,j] += t + + for k in xrange(j+1, p): + if cmplx: + y = ctx.fsum(A[i,j] * ctx.conj(A[i,k]) for i in xrange(j+1, m)) + temp = t * ctx.conj(y) + else: + y = ctx.fsum(A[i,j] * A[i,k] for i in xrange(j+1, m)) + temp = t * y + A[j,k] = temp + for i in xrange(j+1, m): + A[i,k] += A[i,j] * temp + + for i in xrange(j+1, m): + A[i, j] *= t + + return A, R[0:p,0:n] + + # ------------------ + # END OF FUNCTION QR + # ------------------ diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/matrices.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/matrices.py new file mode 100644 index 0000000000000000000000000000000000000000..a97d5a9ca7e173195386dc7cb60860a826ab6a97 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/matrices/matrices.py @@ -0,0 +1,1005 @@ +from ..libmp.backend import xrange +import warnings + +# TODO: interpret list as vectors (for multiplication) + +rowsep = '\n' +colsep = ' ' + +class _matrix(object): + """ + Numerical matrix. + + Specify the dimensions or the data as a nested list. + Elements default to zero. + Use a flat list to create a column vector easily. + + The datatype of the context (mpf for mp, mpi for iv, and float for fp) is used to store the data. + + Creating matrices + ----------------- + + Matrices in mpmath are implemented using dictionaries. Only non-zero values + are stored, so it is cheap to represent sparse matrices. + + The most basic way to create one is to use the ``matrix`` class directly. + You can create an empty matrix specifying the dimensions: + + >>> from mpmath import * + >>> mp.dps = 15 + >>> matrix(2) + matrix( + [['0.0', '0.0'], + ['0.0', '0.0']]) + >>> matrix(2, 3) + matrix( + [['0.0', '0.0', '0.0'], + ['0.0', '0.0', '0.0']]) + + Calling ``matrix`` with one dimension will create a square matrix. + + To access the dimensions of a matrix, use the ``rows`` or ``cols`` keyword: + + >>> A = matrix(3, 2) + >>> A + matrix( + [['0.0', '0.0'], + ['0.0', '0.0'], + ['0.0', '0.0']]) + >>> A.rows + 3 + >>> A.cols + 2 + + You can also change the dimension of an existing matrix. This will set the + new elements to 0. If the new dimension is smaller than before, the + concerning elements are discarded: + + >>> A.rows = 2 + >>> A + matrix( + [['0.0', '0.0'], + ['0.0', '0.0']]) + + Internally ``mpmathify`` is used every time an element is set. 
This + is done using the syntax A[row,column], counting from 0: + + >>> A = matrix(2) + >>> A[1,1] = 1 + 1j + >>> A + matrix( + [['0.0', '0.0'], + ['0.0', mpc(real='1.0', imag='1.0')]]) + + A more comfortable way to create a matrix lets you use nested lists: + + >>> matrix([[1, 2], [3, 4]]) + matrix( + [['1.0', '2.0'], + ['3.0', '4.0']]) + + Convenient advanced functions are available for creating various standard + matrices, see ``zeros``, ``ones``, ``diag``, ``eye``, ``randmatrix`` and + ``hilbert``. + + Vectors + ....... + + Vectors may also be represented by the ``matrix`` class (with rows = 1 or cols = 1). + For vectors there are some things which make life easier. A column vector can + be created using a flat list, a row vectors using an almost flat nested list:: + + >>> matrix([1, 2, 3]) + matrix( + [['1.0'], + ['2.0'], + ['3.0']]) + >>> matrix([[1, 2, 3]]) + matrix( + [['1.0', '2.0', '3.0']]) + + Optionally vectors can be accessed like lists, using only a single index:: + + >>> x = matrix([1, 2, 3]) + >>> x[1] + mpf('2.0') + >>> x[1,0] + mpf('2.0') + + Other + ..... 
+ + Like you probably expected, matrices can be printed:: + + >>> print randmatrix(3) # doctest:+SKIP + [ 0.782963853573023 0.802057689719883 0.427895717335467] + [0.0541876859348597 0.708243266653103 0.615134039977379] + [ 0.856151514955773 0.544759264818486 0.686210904770947] + + Use ``nstr`` or ``nprint`` to specify the number of digits to print:: + + >>> nprint(randmatrix(5), 3) # doctest:+SKIP + [2.07e-1 1.66e-1 5.06e-1 1.89e-1 8.29e-1] + [6.62e-1 6.55e-1 4.47e-1 4.82e-1 2.06e-2] + [4.33e-1 7.75e-1 6.93e-2 2.86e-1 5.71e-1] + [1.01e-1 2.53e-1 6.13e-1 3.32e-1 2.59e-1] + [1.56e-1 7.27e-2 6.05e-1 6.67e-2 2.79e-1] + + As matrices are mutable, you will need to copy them sometimes:: + + >>> A = matrix(2) + >>> A + matrix( + [['0.0', '0.0'], + ['0.0', '0.0']]) + >>> B = A.copy() + >>> B[0,0] = 1 + >>> B + matrix( + [['1.0', '0.0'], + ['0.0', '0.0']]) + >>> A + matrix( + [['0.0', '0.0'], + ['0.0', '0.0']]) + + Finally, it is possible to convert a matrix to a nested list. This is very useful, + as most Python libraries involving matrices or arrays (namely NumPy or SymPy) + support this format:: + + >>> B.tolist() + [[mpf('1.0'), mpf('0.0')], [mpf('0.0'), mpf('0.0')]] + + + Matrix operations + ----------------- + + You can add and subtract matrices of compatible dimensions:: + + >>> A = matrix([[1, 2], [3, 4]]) + >>> B = matrix([[-2, 4], [5, 9]]) + >>> A + B + matrix( + [['-1.0', '6.0'], + ['8.0', '13.0']]) + >>> A - B + matrix( + [['3.0', '-2.0'], + ['-2.0', '-5.0']]) + >>> A + ones(3) # doctest:+ELLIPSIS + Traceback (most recent call last): + ... + ValueError: incompatible dimensions for addition + + It is possible to multiply or add matrices and scalars. 
In the latter case the + operation will be done element-wise:: + + >>> A * 2 + matrix( + [['2.0', '4.0'], + ['6.0', '8.0']]) + >>> A / 4 + matrix( + [['0.25', '0.5'], + ['0.75', '1.0']]) + >>> A - 1 + matrix( + [['0.0', '1.0'], + ['2.0', '3.0']]) + + Of course you can perform matrix multiplication, if the dimensions are + compatible, using ``@`` (for Python >= 3.5) or ``*``. For clarity, ``@`` is + recommended (`PEP 465 `), because + the meaning of ``*`` is different in many other Python libraries such as NumPy. + + >>> A @ B # doctest:+SKIP + matrix( + [['8.0', '22.0'], + ['14.0', '48.0']]) + >>> A * B # same as A @ B + matrix( + [['8.0', '22.0'], + ['14.0', '48.0']]) + >>> matrix([[1, 2, 3]]) * matrix([[-6], [7], [-2]]) + matrix( + [['2.0']]) + + .. + COMMENT: TODO: the above "doctest:+SKIP" may be removed as soon as we + have dropped support for Python 3.5 and below. + + You can raise powers of square matrices:: + + >>> A**2 + matrix( + [['7.0', '10.0'], + ['15.0', '22.0']]) + + Negative powers will calculate the inverse:: + + >>> A**-1 + matrix( + [['-2.0', '1.0'], + ['1.5', '-0.5']]) + >>> A * A**-1 + matrix( + [['1.0', '1.0842021724855e-19'], + ['-2.16840434497101e-19', '1.0']]) + + + + Matrix transposition is straightforward:: + + >>> A = ones(2, 3) + >>> A + matrix( + [['1.0', '1.0', '1.0'], + ['1.0', '1.0', '1.0']]) + >>> A.T + matrix( + [['1.0', '1.0'], + ['1.0', '1.0'], + ['1.0', '1.0']]) + + Norms + ..... + + Sometimes you need to know how "large" a matrix or vector is. Due to their + multidimensional nature it's not possible to compare them, but there are + several functions to map a matrix or a vector to a positive real number, the + so called norms. + + For vectors the p-norm is intended, usually the 1-, the 2- and the oo-norm are + used. 
+ + >>> x = matrix([-10, 2, 100]) + >>> norm(x, 1) + mpf('112.0') + >>> norm(x, 2) + mpf('100.5186549850325') + >>> norm(x, inf) + mpf('100.0') + + Please note that the 2-norm is the most used one, though it is more expensive + to calculate than the 1- or oo-norm. + + It is possible to generalize some vector norms to matrix norm:: + + >>> A = matrix([[1, -1000], [100, 50]]) + >>> mnorm(A, 1) + mpf('1050.0') + >>> mnorm(A, inf) + mpf('1001.0') + >>> mnorm(A, 'F') + mpf('1006.2310867787777') + + The last norm (the "Frobenius-norm") is an approximation for the 2-norm, which + is hard to calculate and not available. The Frobenius-norm lacks some + mathematical properties you might expect from a norm. + """ + + def __init__(self, *args, **kwargs): + self.__data = {} + # LU decompostion cache, this is useful when solving the same system + # multiple times, when calculating the inverse and when calculating the + # determinant + self._LU = None + if "force_type" in kwargs: + warnings.warn("The force_type argument was removed, it did not work" + " properly anyway. If you want to force floating-point or" + " interval computations, use the respective methods from `fp`" + " or `mp` instead, e.g., `fp.matrix()` or `iv.matrix()`." + " If you want to truncate values to integer, use .apply(int) instead.") + if isinstance(args[0], (list, tuple)): + if isinstance(args[0][0], (list, tuple)): + # interpret nested list as matrix + A = args[0] + self.__rows = len(A) + self.__cols = len(A[0]) + for i, row in enumerate(A): + for j, a in enumerate(row): + # note: this will call __setitem__ which will call self.ctx.convert() to convert the datatype. 
+ self[i, j] = a + else: + # interpret list as row vector + v = args[0] + self.__rows = len(v) + self.__cols = 1 + for i, e in enumerate(v): + self[i, 0] = e + elif isinstance(args[0], int): + # create empty matrix of given dimensions + if len(args) == 1: + self.__rows = self.__cols = args[0] + else: + if not isinstance(args[1], int): + raise TypeError("expected int") + self.__rows = args[0] + self.__cols = args[1] + elif isinstance(args[0], _matrix): + A = args[0] + self.__rows = A._matrix__rows + self.__cols = A._matrix__cols + for i in xrange(A.__rows): + for j in xrange(A.__cols): + self[i, j] = A[i, j] + elif hasattr(args[0], 'tolist'): + A = self.ctx.matrix(args[0].tolist()) + self.__data = A._matrix__data + self.__rows = A._matrix__rows + self.__cols = A._matrix__cols + else: + raise TypeError('could not interpret given arguments') + + def apply(self, f): + """ + Return a copy of self with the function `f` applied elementwise. + """ + new = self.ctx.matrix(self.__rows, self.__cols) + for i in xrange(self.__rows): + for j in xrange(self.__cols): + new[i,j] = f(self[i,j]) + return new + + def __nstr__(self, n=None, **kwargs): + # Build table of string representations of the elements + res = [] + # Track per-column max lengths for pretty alignment + maxlen = [0] * self.cols + for i in range(self.rows): + res.append([]) + for j in range(self.cols): + if n: + string = self.ctx.nstr(self[i,j], n, **kwargs) + else: + string = str(self[i,j]) + res[-1].append(string) + maxlen[j] = max(len(string), maxlen[j]) + # Patch strings together + for i, row in enumerate(res): + for j, elem in enumerate(row): + # Pad each element up to maxlen so the columns line up + row[j] = elem.rjust(maxlen[j]) + res[i] = "[" + colsep.join(row) + "]" + return rowsep.join(res) + + def __str__(self): + return self.__nstr__() + + def _toliststr(self, avoid_type=False): + """ + Create a list string from a matrix. + + If avoid_type: avoid multiple 'mpf's. 
+ """ + # XXX: should be something like self.ctx._types + typ = self.ctx.mpf + s = '[' + for i in xrange(self.__rows): + s += '[' + for j in xrange(self.__cols): + if not avoid_type or not isinstance(self[i,j], typ): + a = repr(self[i,j]) + else: + a = "'" + str(self[i,j]) + "'" + s += a + ', ' + s = s[:-2] + s += '],\n ' + s = s[:-3] + s += ']' + return s + + def tolist(self): + """ + Convert the matrix to a nested list. + """ + return [[self[i,j] for j in range(self.__cols)] for i in range(self.__rows)] + + def __repr__(self): + if self.ctx.pretty: + return self.__str__() + s = 'matrix(\n' + s += self._toliststr(avoid_type=True) + ')' + return s + + def __get_element(self, key): + ''' + Fast extraction of the i,j element from the matrix + This function is for private use only because is unsafe: + 1. Does not check on the value of key it expects key to be a integer tuple (i,j) + 2. Does not check bounds + ''' + if key in self.__data: + return self.__data[key] + else: + return self.ctx.zero + + def __set_element(self, key, value): + ''' + Fast assignment of the i,j element in the matrix + This function is unsafe: + 1. Does not check on the value of key it expects key to be a integer tuple (i,j) + 2. Does not check bounds + 3. Does not check the value type + 4. 
Does not reset the LU cache + ''' + if value: # only store non-zeros + self.__data[key] = value + elif key in self.__data: + del self.__data[key] + + + def __getitem__(self, key): + ''' + Getitem function for mp matrix class with slice index enabled + it allows the following assingments + scalar to a slice of the matrix + B = A[:,2:6] + ''' + # Convert vector to matrix indexing + if isinstance(key, int) or isinstance(key,slice): + # only sufficent for vectors + if self.__rows == 1: + key = (0, key) + elif self.__cols == 1: + key = (key, 0) + else: + raise IndexError('insufficient indices for matrix') + + if isinstance(key[0],slice) or isinstance(key[1],slice): + + #Rows + if isinstance(key[0],slice): + #Check bounds + if (key[0].start is None or key[0].start >= 0) and \ + (key[0].stop is None or key[0].stop <= self.__rows+1): + # Generate indices + rows = xrange(*key[0].indices(self.__rows)) + else: + raise IndexError('Row index out of bounds') + else: + # Single row + rows = [key[0]] + + # Columns + if isinstance(key[1],slice): + # Check bounds + if (key[1].start is None or key[1].start >= 0) and \ + (key[1].stop is None or key[1].stop <= self.__cols+1): + # Generate indices + columns = xrange(*key[1].indices(self.__cols)) + else: + raise IndexError('Column index out of bounds') + + else: + # Single column + columns = [key[1]] + + # Create matrix slice + m = self.ctx.matrix(len(rows),len(columns)) + + # Assign elements to the output matrix + for i,x in enumerate(rows): + for j,y in enumerate(columns): + m.__set_element((i,j),self.__get_element((x,y))) + + return m + + else: + # single element extraction + if key[0] >= self.__rows or key[1] >= self.__cols: + raise IndexError('matrix index out of range') + if key in self.__data: + return self.__data[key] + else: + return self.ctx.zero + + def __setitem__(self, key, value): + # setitem function for mp matrix class with slice index enabled + # it allows the following assingments + # scalar to a slice of the matrix + # 
A[:,2:6] = 2.5 + # submatrix to matrix (the value matrix should be the same size as the slice size) + # A[3,:] = B where A is n x m and B is n x 1 + # Convert vector to matrix indexing + if isinstance(key, int) or isinstance(key,slice): + # only sufficent for vectors + if self.__rows == 1: + key = (0, key) + elif self.__cols == 1: + key = (key, 0) + else: + raise IndexError('insufficient indices for matrix') + # Slice indexing + if isinstance(key[0],slice) or isinstance(key[1],slice): + # Rows + if isinstance(key[0],slice): + # Check bounds + if (key[0].start is None or key[0].start >= 0) and \ + (key[0].stop is None or key[0].stop <= self.__rows+1): + # generate row indices + rows = xrange(*key[0].indices(self.__rows)) + else: + raise IndexError('Row index out of bounds') + else: + # Single row + rows = [key[0]] + # Columns + if isinstance(key[1],slice): + # Check bounds + if (key[1].start is None or key[1].start >= 0) and \ + (key[1].stop is None or key[1].stop <= self.__cols+1): + # Generate column indices + columns = xrange(*key[1].indices(self.__cols)) + else: + raise IndexError('Column index out of bounds') + else: + # Single column + columns = [key[1]] + # Assign slice with a scalar + if isinstance(value,self.ctx.matrix): + # Assign elements to matrix if input and output dimensions match + if len(rows) == value.rows and len(columns) == value.cols: + for i,x in enumerate(rows): + for j,y in enumerate(columns): + self.__set_element((x,y), value.__get_element((i,j))) + else: + raise ValueError('Dimensions do not match') + else: + # Assign slice with scalars + value = self.ctx.convert(value) + for i in rows: + for j in columns: + self.__set_element((i,j), value) + else: + # Single element assingment + # Check bounds + if key[0] >= self.__rows or key[1] >= self.__cols: + raise IndexError('matrix index out of range') + # Convert and store value + value = self.ctx.convert(value) + if value: # only store non-zeros + self.__data[key] = value + elif key in 
self.__data: + del self.__data[key] + + if self._LU: + self._LU = None + return + + def __iter__(self): + for i in xrange(self.__rows): + for j in xrange(self.__cols): + yield self[i,j] + + def __mul__(self, other): + if isinstance(other, self.ctx.matrix): + # dot multiplication + if self.__cols != other.__rows: + raise ValueError('dimensions not compatible for multiplication') + new = self.ctx.matrix(self.__rows, other.__cols) + self_zero = self.ctx.zero + self_get = self.__data.get + other_zero = other.ctx.zero + other_get = other.__data.get + for i in xrange(self.__rows): + for j in xrange(other.__cols): + new[i, j] = self.ctx.fdot((self_get((i,k), self_zero), other_get((k,j), other_zero)) + for k in xrange(other.__rows)) + return new + else: + # try scalar multiplication + new = self.ctx.matrix(self.__rows, self.__cols) + for i in xrange(self.__rows): + for j in xrange(self.__cols): + new[i, j] = other * self[i, j] + return new + + def __matmul__(self, other): + return self.__mul__(other) + + def __rmul__(self, other): + # assume other is scalar and thus commutative + if isinstance(other, self.ctx.matrix): + raise TypeError("other should not be type of ctx.matrix") + return self.__mul__(other) + + def __pow__(self, other): + # avoid cyclic import problems + #from linalg import inverse + if not isinstance(other, int): + raise ValueError('only integer exponents are supported') + if not self.__rows == self.__cols: + raise ValueError('only powers of square matrices are defined') + n = other + if n == 0: + return self.ctx.eye(self.__rows) + if n < 0: + n = -n + neg = True + else: + neg = False + i = n + y = 1 + z = self.copy() + while i != 0: + if i % 2 == 1: + y = y * z + z = z*z + i = i // 2 + if neg: + y = self.ctx.inverse(y) + return y + + def __div__(self, other): + # assume other is scalar and do element-wise divison + assert not isinstance(other, self.ctx.matrix) + new = self.ctx.matrix(self.__rows, self.__cols) + for i in xrange(self.__rows): + for j in 
xrange(self.__cols): + new[i,j] = self[i,j] / other + return new + + __truediv__ = __div__ + + def __add__(self, other): + if isinstance(other, self.ctx.matrix): + if not (self.__rows == other.__rows and self.__cols == other.__cols): + raise ValueError('incompatible dimensions for addition') + new = self.ctx.matrix(self.__rows, self.__cols) + for i in xrange(self.__rows): + for j in xrange(self.__cols): + new[i,j] = self[i,j] + other[i,j] + return new + else: + # assume other is scalar and add element-wise + new = self.ctx.matrix(self.__rows, self.__cols) + for i in xrange(self.__rows): + for j in xrange(self.__cols): + new[i,j] += self[i,j] + other + return new + + def __radd__(self, other): + return self.__add__(other) + + def __sub__(self, other): + if isinstance(other, self.ctx.matrix) and not (self.__rows == other.__rows + and self.__cols == other.__cols): + raise ValueError('incompatible dimensions for subtraction') + return self.__add__(other * (-1)) + + def __pos__(self): + """ + +M returns a copy of M, rounded to current working precision. 
+ """ + return (+1) * self + + def __neg__(self): + return (-1) * self + + def __rsub__(self, other): + return -self + other + + def __eq__(self, other): + return self.__rows == other.__rows and self.__cols == other.__cols \ + and self.__data == other.__data + + def __len__(self): + if self.rows == 1: + return self.cols + elif self.cols == 1: + return self.rows + else: + return self.rows # do it like numpy + + def __getrows(self): + return self.__rows + + def __setrows(self, value): + for key in self.__data.copy(): + if key[0] >= value: + del self.__data[key] + self.__rows = value + + rows = property(__getrows, __setrows, doc='number of rows') + + def __getcols(self): + return self.__cols + + def __setcols(self, value): + for key in self.__data.copy(): + if key[1] >= value: + del self.__data[key] + self.__cols = value + + cols = property(__getcols, __setcols, doc='number of columns') + + def transpose(self): + new = self.ctx.matrix(self.__cols, self.__rows) + for i in xrange(self.__rows): + for j in xrange(self.__cols): + new[j,i] = self[i,j] + return new + + T = property(transpose) + + def conjugate(self): + return self.apply(self.ctx.conj) + + def transpose_conj(self): + return self.conjugate().transpose() + + H = property(transpose_conj) + + def copy(self): + new = self.ctx.matrix(self.__rows, self.__cols) + new.__data = self.__data.copy() + return new + + __copy__ = copy + + def column(self, n): + m = self.ctx.matrix(self.rows, 1) + for i in range(self.rows): + m[i] = self[i,n] + return m + +class MatrixMethods(object): + + def __init__(ctx): + # XXX: subclass + ctx.matrix = type('matrix', (_matrix,), {}) + ctx.matrix.ctx = ctx + ctx.matrix.convert = ctx.convert + + def eye(ctx, n, **kwargs): + """ + Create square identity matrix n x n. + """ + A = ctx.matrix(n, **kwargs) + for i in xrange(n): + A[i,i] = 1 + return A + + def diag(ctx, diagonal, **kwargs): + """ + Create square diagonal matrix using given list. 
+ + Example: + >>> from mpmath import diag, mp + >>> mp.pretty = False + >>> diag([1, 2, 3]) + matrix( + [['1.0', '0.0', '0.0'], + ['0.0', '2.0', '0.0'], + ['0.0', '0.0', '3.0']]) + """ + A = ctx.matrix(len(diagonal), **kwargs) + for i in xrange(len(diagonal)): + A[i,i] = diagonal[i] + return A + + def zeros(ctx, *args, **kwargs): + """ + Create matrix m x n filled with zeros. + One given dimension will create square matrix n x n. + + Example: + >>> from mpmath import zeros, mp + >>> mp.pretty = False + >>> zeros(2) + matrix( + [['0.0', '0.0'], + ['0.0', '0.0']]) + """ + if len(args) == 1: + m = n = args[0] + elif len(args) == 2: + m = args[0] + n = args[1] + else: + raise TypeError('zeros expected at most 2 arguments, got %i' % len(args)) + A = ctx.matrix(m, n, **kwargs) + for i in xrange(m): + for j in xrange(n): + A[i,j] = 0 + return A + + def ones(ctx, *args, **kwargs): + """ + Create matrix m x n filled with ones. + One given dimension will create square matrix n x n. + + Example: + >>> from mpmath import ones, mp + >>> mp.pretty = False + >>> ones(2) + matrix( + [['1.0', '1.0'], + ['1.0', '1.0']]) + """ + if len(args) == 1: + m = n = args[0] + elif len(args) == 2: + m = args[0] + n = args[1] + else: + raise TypeError('ones expected at most 2 arguments, got %i' % len(args)) + A = ctx.matrix(m, n, **kwargs) + for i in xrange(m): + for j in xrange(n): + A[i,j] = 1 + return A + + def hilbert(ctx, m, n=None): + """ + Create (pseudo) hilbert matrix m x n. + One given dimension will create hilbert matrix n x n. + + The matrix is very ill-conditioned and symmetric, positive definite if + square. + """ + if n is None: + n = m + A = ctx.matrix(m, n) + for i in xrange(m): + for j in xrange(n): + A[i,j] = ctx.one / (i + j + 1) + return A + + def randmatrix(ctx, m, n=None, min=0, max=1, **kwargs): + """ + Create a random m x n matrix. 
+ + All values are >= min and >> from mpmath import randmatrix + >>> randmatrix(2) # doctest:+SKIP + matrix( + [['0.53491598236191806', '0.57195669543302752'], + ['0.85589992269513615', '0.82444367501382143']]) + """ + if not n: + n = m + A = ctx.matrix(m, n, **kwargs) + for i in xrange(m): + for j in xrange(n): + A[i,j] = ctx.rand() * (max - min) + min + return A + + def swap_row(ctx, A, i, j): + """ + Swap row i with row j. + """ + if i == j: + return + if isinstance(A, ctx.matrix): + for k in xrange(A.cols): + A[i,k], A[j,k] = A[j,k], A[i,k] + elif isinstance(A, list): + A[i], A[j] = A[j], A[i] + else: + raise TypeError('could not interpret type') + + def extend(ctx, A, b): + """ + Extend matrix A with column b and return result. + """ + if not isinstance(A, ctx.matrix): + raise TypeError("A should be a type of ctx.matrix") + if A.rows != len(b): + raise ValueError("Value should be equal to len(b)") + A = A.copy() + A.cols += 1 + for i in xrange(A.rows): + A[i, A.cols-1] = b[i] + return A + + def norm(ctx, x, p=2): + r""" + Gives the entrywise `p`-norm of an iterable *x*, i.e. the vector norm + `\left(\sum_k |x_k|^p\right)^{1/p}`, for any given `1 \le p \le \infty`. + + Special cases: + + If *x* is not iterable, this just returns ``absmax(x)``. + + ``p=1`` gives the sum of absolute values. + + ``p=2`` is the standard Euclidean vector norm. + + ``p=inf`` gives the magnitude of the largest element. + + For *x* a matrix, ``p=2`` is the Frobenius norm. + For operator matrix norms, use :func:`~mpmath.mnorm` instead. + + You can use the string 'inf' as well as float('inf') or mpf('inf') + to specify the infinity norm. 
+ + **Examples** + + >>> from mpmath import * + >>> mp.dps = 15; mp.pretty = False + >>> x = matrix([-10, 2, 100]) + >>> norm(x, 1) + mpf('112.0') + >>> norm(x, 2) + mpf('100.5186549850325') + >>> norm(x, inf) + mpf('100.0') + + """ + try: + iter(x) + except TypeError: + return ctx.absmax(x) + if type(p) is not int: + p = ctx.convert(p) + if p == ctx.inf: + return max(ctx.absmax(i) for i in x) + elif p == 1: + return ctx.fsum(x, absolute=1) + elif p == 2: + return ctx.sqrt(ctx.fsum(x, absolute=1, squared=1)) + elif p > 1: + return ctx.nthroot(ctx.fsum(abs(i)**p for i in x), p) + else: + raise ValueError('p has to be >= 1') + + def mnorm(ctx, A, p=1): + r""" + Gives the matrix (operator) `p`-norm of A. Currently ``p=1`` and ``p=inf`` + are supported: + + ``p=1`` gives the 1-norm (maximal column sum) + + ``p=inf`` gives the `\infty`-norm (maximal row sum). + You can use the string 'inf' as well as float('inf') or mpf('inf') + + ``p=2`` (not implemented) for a square matrix is the usual spectral + matrix norm, i.e. the largest singular value. + + ``p='f'`` (or 'F', 'fro', 'Frobenius, 'frobenius') gives the + Frobenius norm, which is the elementwise 2-norm. The Frobenius norm is an + approximation of the spectral norm and satisfies + + .. math :: + + \frac{1}{\sqrt{\mathrm{rank}(A)}} \|A\|_F \le \|A\|_2 \le \|A\|_F + + The Frobenius norm lacks some mathematical properties that might + be expected of a norm. + + For general elementwise `p`-norms, use :func:`~mpmath.norm` instead. 
+ + **Examples** + + >>> from mpmath import * + >>> mp.dps = 15; mp.pretty = False + >>> A = matrix([[1, -1000], [100, 50]]) + >>> mnorm(A, 1) + mpf('1050.0') + >>> mnorm(A, inf) + mpf('1001.0') + >>> mnorm(A, 'F') + mpf('1006.2310867787777') + + """ + A = ctx.matrix(A) + if type(p) is not int: + if type(p) is str and 'frobenius'.startswith(p.lower()): + return ctx.norm(A, 2) + p = ctx.convert(p) + m, n = A.rows, A.cols + if p == 1: + return max(ctx.fsum((A[i,j] for i in xrange(m)), absolute=1) for j in xrange(n)) + elif p == ctx.inf: + return max(ctx.fsum((A[i,j] for j in xrange(n)), absolute=1) for i in xrange(m)) + else: + raise NotImplementedError("matrix p-norm for arbitrary p") + +if __name__ == '__main__': + import doctest + doctest.testmod() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..526ca48e4bc936b00dc167e834fc25cccf3538b3 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8ff30982e43b5f3841a27f842465ae57888dbde Binary files /dev/null and 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasLt.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasLt.h new file mode 100644 index 0000000000000000000000000000000000000000..36b1210242495c6d4e0fa26e62583832a65004f9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasLt.h @@ -0,0 +1,1853 @@ +/* + * Copyright 1993-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ +#pragma once + +#ifndef CUBLASAPI +#ifdef __CUDACC__ +#define CUBLASAPI __host__ __device__ +#else +#define CUBLASAPI +#endif +#endif + +#include + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +/** Opaque structure holding CUBLASLT context + */ +typedef struct cublasLtContext* cublasLtHandle_t; + +cublasStatus_t CUBLASWINAPI cublasLtCreate(cublasLtHandle_t* lightHandle); + +cublasStatus_t CUBLASWINAPI cublasLtDestroy(cublasLtHandle_t lightHandle); + +const char* CUBLASWINAPI cublasLtGetStatusName(cublasStatus_t status); + +const char* CUBLASWINAPI cublasLtGetStatusString(cublasStatus_t status); + +size_t CUBLASWINAPI cublasLtGetVersion(void); + +size_t CUBLASWINAPI cublasLtGetCudartVersion(void); + +cublasStatus_t CUBLASWINAPI cublasLtGetProperty(libraryPropertyType type, int* value); + +cublasStatus_t CUBLASWINAPI cublasLtHeuristicsCacheGetCapacity(size_t* capacity); +cublasStatus_t CUBLASWINAPI cublasLtHeuristicsCacheSetCapacity(size_t capacity); + +/** Semi-opaque descriptor for matrix memory layout + */ +typedef struct { + uint64_t data[8]; +} cublasLtMatrixLayoutOpaque_t; + +/** Opaque descriptor for matrix memory layout + */ +typedef cublasLtMatrixLayoutOpaque_t* cublasLtMatrixLayout_t; + +/** Semi-opaque algorithm descriptor (to avoid complicated alloc/free schemes) + * + * This structure can be trivially serialized and later restored for use with the same version of cuBLAS library to save + * on selecting the right configuration again. 
+ */ +typedef struct { + uint64_t data[8]; +} cublasLtMatmulAlgo_t; + +/** Semi-opaque descriptor for cublasLtMatmul() operation details + */ +typedef struct { + uint64_t data[23]; +} cublasLtMatmulDescOpaque_t; + +/** Opaque descriptor for cublasLtMatmul() operation details + */ +typedef cublasLtMatmulDescOpaque_t* cublasLtMatmulDesc_t; + +/** Semi-opaque descriptor for cublasLtMatrixTransform() operation details + */ +typedef struct { + uint64_t data[8]; +} cublasLtMatrixTransformDescOpaque_t; + +/** Opaque descriptor for cublasLtMatrixTransform() operation details + */ +typedef cublasLtMatrixTransformDescOpaque_t* cublasLtMatrixTransformDesc_t; + +/** Semi-opaque descriptor for cublasLtMatmulPreference() operation details + */ +typedef struct { + uint64_t data[10]; +} cublasLtMatmulPreferenceOpaque_t; + +/** Opaque descriptor for cublasLtMatmulAlgoGetHeuristic() configuration + */ +typedef cublasLtMatmulPreferenceOpaque_t* cublasLtMatmulPreference_t; + +/** Tile size (in C/D matrix Rows x Cols) + * + * General order of tile IDs is sorted by size first and by first dimension second. 
+ */ +typedef enum { + CUBLASLT_MATMUL_TILE_UNDEFINED = 0, + CUBLASLT_MATMUL_TILE_8x8 = 1, + CUBLASLT_MATMUL_TILE_8x16 = 2, + CUBLASLT_MATMUL_TILE_16x8 = 3, + CUBLASLT_MATMUL_TILE_8x32 = 4, + CUBLASLT_MATMUL_TILE_16x16 = 5, + CUBLASLT_MATMUL_TILE_32x8 = 6, + CUBLASLT_MATMUL_TILE_8x64 = 7, + CUBLASLT_MATMUL_TILE_16x32 = 8, + CUBLASLT_MATMUL_TILE_32x16 = 9, + CUBLASLT_MATMUL_TILE_64x8 = 10, + CUBLASLT_MATMUL_TILE_32x32 = 11, + CUBLASLT_MATMUL_TILE_32x64 = 12, + CUBLASLT_MATMUL_TILE_64x32 = 13, + CUBLASLT_MATMUL_TILE_32x128 = 14, + CUBLASLT_MATMUL_TILE_64x64 = 15, + CUBLASLT_MATMUL_TILE_128x32 = 16, + CUBLASLT_MATMUL_TILE_64x128 = 17, + CUBLASLT_MATMUL_TILE_128x64 = 18, + CUBLASLT_MATMUL_TILE_64x256 = 19, + CUBLASLT_MATMUL_TILE_128x128 = 20, + CUBLASLT_MATMUL_TILE_256x64 = 21, + CUBLASLT_MATMUL_TILE_64x512 = 22, + CUBLASLT_MATMUL_TILE_128x256 = 23, + CUBLASLT_MATMUL_TILE_256x128 = 24, + CUBLASLT_MATMUL_TILE_512x64 = 25, + CUBLASLT_MATMUL_TILE_64x96 = 26, + CUBLASLT_MATMUL_TILE_96x64 = 27, + CUBLASLT_MATMUL_TILE_96x128 = 28, + CUBLASLT_MATMUL_TILE_128x160 = 29, + CUBLASLT_MATMUL_TILE_160x128 = 30, + CUBLASLT_MATMUL_TILE_192x128 = 31, + CUBLASLT_MATMUL_TILE_128x192 = 32, + CUBLASLT_MATMUL_TILE_128x96 = 33, + CUBLASLT_MATMUL_TILE_END +} cublasLtMatmulTile_t; + +/** Size and number of stages in which elements are read into shared memory + * + * General order of stages IDs is sorted by stage size first and by number of stages second. 
+ */ +typedef enum { + CUBLASLT_MATMUL_STAGES_UNDEFINED = 0, + CUBLASLT_MATMUL_STAGES_16x1 = 1, + CUBLASLT_MATMUL_STAGES_16x2 = 2, + CUBLASLT_MATMUL_STAGES_16x3 = 3, + CUBLASLT_MATMUL_STAGES_16x4 = 4, + CUBLASLT_MATMUL_STAGES_16x5 = 5, + CUBLASLT_MATMUL_STAGES_16x6 = 6, + CUBLASLT_MATMUL_STAGES_32x1 = 7, + CUBLASLT_MATMUL_STAGES_32x2 = 8, + CUBLASLT_MATMUL_STAGES_32x3 = 9, + CUBLASLT_MATMUL_STAGES_32x4 = 10, + CUBLASLT_MATMUL_STAGES_32x5 = 11, + CUBLASLT_MATMUL_STAGES_32x6 = 12, + CUBLASLT_MATMUL_STAGES_64x1 = 13, + CUBLASLT_MATMUL_STAGES_64x2 = 14, + CUBLASLT_MATMUL_STAGES_64x3 = 15, + CUBLASLT_MATMUL_STAGES_64x4 = 16, + CUBLASLT_MATMUL_STAGES_64x5 = 17, + CUBLASLT_MATMUL_STAGES_64x6 = 18, + CUBLASLT_MATMUL_STAGES_128x1 = 19, + CUBLASLT_MATMUL_STAGES_128x2 = 20, + CUBLASLT_MATMUL_STAGES_128x3 = 21, + CUBLASLT_MATMUL_STAGES_128x4 = 22, + CUBLASLT_MATMUL_STAGES_128x5 = 23, + CUBLASLT_MATMUL_STAGES_128x6 = 24, + CUBLASLT_MATMUL_STAGES_32x10 = 25, + CUBLASLT_MATMUL_STAGES_8x4 = 26, + CUBLASLT_MATMUL_STAGES_16x10 = 27, + CUBLASLT_MATMUL_STAGES_8x5 = 28, + CUBLASLT_MATMUL_STAGES_16x80 = 29, + CUBLASLT_MATMUL_STAGES_64x80 = 30, + CUBLASLT_MATMUL_STAGES_8x3 = 31, + CUBLASLT_MATMUL_STAGES_8xAUTO = 32, + CUBLASLT_MATMUL_STAGES_16xAUTO = 33, + CUBLASLT_MATMUL_STAGES_32xAUTO = 34, + CUBLASLT_MATMUL_STAGES_64xAUTO = 35, + CUBLASLT_MATMUL_STAGES_128xAUTO = 36, + CUBLASLT_MATMUL_STAGES_END +} cublasLtMatmulStages_t; + +/** Thread Block Cluster size + * + * Typically dimensioned similar to cublasLtMatmulTile_t, with the third coordinate unused at this time. 
+ */ +typedef enum { + /** Let library pick cluster shape automatically */ + CUBLASLT_CLUSTER_SHAPE_AUTO = 0, + CUBLASLT_CLUSTER_SHAPE_1x1x1 = 2, + CUBLASLT_CLUSTER_SHAPE_2x1x1 = 3, + CUBLASLT_CLUSTER_SHAPE_4x1x1 = 4, + CUBLASLT_CLUSTER_SHAPE_1x2x1 = 5, + CUBLASLT_CLUSTER_SHAPE_2x2x1 = 6, + CUBLASLT_CLUSTER_SHAPE_4x2x1 = 7, + CUBLASLT_CLUSTER_SHAPE_1x4x1 = 8, + CUBLASLT_CLUSTER_SHAPE_2x4x1 = 9, + CUBLASLT_CLUSTER_SHAPE_4x4x1 = 10, + CUBLASLT_CLUSTER_SHAPE_8x1x1 = 11, + CUBLASLT_CLUSTER_SHAPE_1x8x1 = 12, + CUBLASLT_CLUSTER_SHAPE_8x2x1 = 13, + CUBLASLT_CLUSTER_SHAPE_2x8x1 = 14, + CUBLASLT_CLUSTER_SHAPE_16x1x1 = 15, + CUBLASLT_CLUSTER_SHAPE_1x16x1 = 16, + CUBLASLT_CLUSTER_SHAPE_3x1x1 = 17, + CUBLASLT_CLUSTER_SHAPE_5x1x1 = 18, + CUBLASLT_CLUSTER_SHAPE_6x1x1 = 19, + CUBLASLT_CLUSTER_SHAPE_7x1x1 = 20, + CUBLASLT_CLUSTER_SHAPE_9x1x1 = 21, + CUBLASLT_CLUSTER_SHAPE_10x1x1 = 22, + CUBLASLT_CLUSTER_SHAPE_11x1x1 = 23, + CUBLASLT_CLUSTER_SHAPE_12x1x1 = 24, + CUBLASLT_CLUSTER_SHAPE_13x1x1 = 25, + CUBLASLT_CLUSTER_SHAPE_14x1x1 = 26, + CUBLASLT_CLUSTER_SHAPE_15x1x1 = 27, + CUBLASLT_CLUSTER_SHAPE_3x2x1 = 28, + CUBLASLT_CLUSTER_SHAPE_5x2x1 = 29, + CUBLASLT_CLUSTER_SHAPE_6x2x1 = 30, + CUBLASLT_CLUSTER_SHAPE_7x2x1 = 31, + CUBLASLT_CLUSTER_SHAPE_1x3x1 = 32, + CUBLASLT_CLUSTER_SHAPE_2x3x1 = 33, + CUBLASLT_CLUSTER_SHAPE_3x3x1 = 34, + CUBLASLT_CLUSTER_SHAPE_4x3x1 = 35, + CUBLASLT_CLUSTER_SHAPE_5x3x1 = 36, + CUBLASLT_CLUSTER_SHAPE_3x4x1 = 37, + CUBLASLT_CLUSTER_SHAPE_1x5x1 = 38, + CUBLASLT_CLUSTER_SHAPE_2x5x1 = 39, + CUBLASLT_CLUSTER_SHAPE_3x5x1 = 40, + CUBLASLT_CLUSTER_SHAPE_1x6x1 = 41, + CUBLASLT_CLUSTER_SHAPE_2x6x1 = 42, + CUBLASLT_CLUSTER_SHAPE_1x7x1 = 43, + CUBLASLT_CLUSTER_SHAPE_2x7x1 = 44, + CUBLASLT_CLUSTER_SHAPE_1x9x1 = 45, + CUBLASLT_CLUSTER_SHAPE_1x10x1 = 46, + CUBLASLT_CLUSTER_SHAPE_1x11x1 = 47, + CUBLASLT_CLUSTER_SHAPE_1x12x1 = 48, + CUBLASLT_CLUSTER_SHAPE_1x13x1 = 49, + CUBLASLT_CLUSTER_SHAPE_1x14x1 = 50, + CUBLASLT_CLUSTER_SHAPE_1x15x1 = 51, + CUBLASLT_CLUSTER_SHAPE_END +} 
cublasLtClusterShape_t; + +/** Inner size of the kernel + * + * Represents various aspects of internal kernel design, that don't impact CUDA grid size but may have other more subtle + * effects. + * + */ +typedef enum { + CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED = 0, + CUBLASLT_MATMUL_INNER_SHAPE_MMA884 = 1, + CUBLASLT_MATMUL_INNER_SHAPE_MMA1684 = 2, + CUBLASLT_MATMUL_INNER_SHAPE_MMA1688 = 3, + CUBLASLT_MATMUL_INNER_SHAPE_MMA16816 = 4, + CUBLASLT_MATMUL_INNER_SHAPE_END +} cublasLtMatmulInnerShape_t; + +/** Pointer mode to use for alpha/beta */ +typedef enum { + /** matches CUBLAS_POINTER_MODE_HOST, pointer targets a single value host memory */ + CUBLASLT_POINTER_MODE_HOST = CUBLAS_POINTER_MODE_HOST, + /** matches CUBLAS_POINTER_MODE_DEVICE, pointer targets a single value device memory */ + CUBLASLT_POINTER_MODE_DEVICE = CUBLAS_POINTER_MODE_DEVICE, + /** pointer targets an array in device memory */ + CUBLASLT_POINTER_MODE_DEVICE_VECTOR = 2, + /** alpha pointer targets an array in device memory, beta is zero. Note: + CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE is not supported, must be 0. */ + CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO = 3, + /** alpha pointer targets an array in device memory, beta is a single value in host memory. 
*/ + CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST = 4, +} cublasLtPointerMode_t; + +/** Mask to define and query pointer mode capability */ +typedef enum { + /** no initial filtering is performed when querying pointer mode capabilities, will use gemm pointer mode defined in + operation description **/ + CUBLASLT_POINTER_MODE_MASK_NO_FILTERING = 0, + /** see CUBLASLT_POINTER_MODE_HOST */ + CUBLASLT_POINTER_MODE_MASK_HOST = 1, + /** see CUBLASLT_POINTER_MODE_DEVICE */ + CUBLASLT_POINTER_MODE_MASK_DEVICE = 2, + /** see CUBLASLT_POINTER_MODE_DEVICE_VECTOR */ + CUBLASLT_POINTER_MODE_MASK_DEVICE_VECTOR = 4, + /** see CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO */ + CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_ZERO = 8, + /** see CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST */ + CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_HOST = 16, +} cublasLtPointerModeMask_t; + +/** Implementation details that may affect numerical behavior of algorithms. */ +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_FMA (0x01ull << 0) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_HMMA (0x02ull << 0) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_IMMA (0x04ull << 0) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_DMMA (0x08ull << 0) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_TENSOR_OP_MASK (0xfeull << 0) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_OP_TYPE_MASK (0xffull << 0) + +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_16F (0x01ull << 8) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_32F (0x02ull << 8) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_64F (0x04ull << 8) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_32I (0x08ull << 8) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_TYPE_MASK (0xffull << 8) + +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_16F (0x01ull << 16) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_16BF (0x02ull << 16) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_TF32 (0x04ull << 16) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_32F (0x08ull << 16) +#define 
CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_64F (0x10ull << 16) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8I (0x20ull << 16) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8F_E4M3 (0x40ull << 16) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8F_E5M2 (0x80ull << 16) +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_OP_INPUT_TYPE_MASK (0xffull << 16) + +#define CUBLASLT_NUMERICAL_IMPL_FLAGS_GAUSSIAN (0x01ull << 32) +typedef uint64_t cublasLtNumericalImplFlags_t; + +/** Execute matrix multiplication (D = alpha * op(A) * op(B) + beta * C). + * + * \retval CUBLAS_STATUS_NOT_INITIALIZED if cuBLASLt handle has not been initialized + * \retval CUBLAS_STATUS_INVALID_VALUE if parameters are in conflict or in an impossible configuration; e.g. + * when workspaceSizeInBytes is less than workspace required by configured + * algo + * \retval CUBLAS_STATUS_NOT_SUPPORTED if current implementation on selected device doesn't support configured + * operation + * \retval CUBLAS_STATUS_ARCH_MISMATCH if configured operation cannot be run using selected device + * \retval CUBLAS_STATUS_EXECUTION_FAILED if cuda reported execution error from the device + * \retval CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmul(cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t computeDesc, + const void* alpha, /* host or device pointer */ + const void* A, + cublasLtMatrixLayout_t Adesc, + const void* B, + cublasLtMatrixLayout_t Bdesc, + const void* beta, /* host or device pointer */ + const void* C, + cublasLtMatrixLayout_t Cdesc, + void* D, + cublasLtMatrixLayout_t Ddesc, + const cublasLtMatmulAlgo_t* algo, + void* workspace, + size_t workspaceSizeInBytes, + cudaStream_t stream); + +/** Matrix layout conversion helper (C = alpha * op(A) + beta * op(B)) + * + * Can be used to change memory order of data or to scale and shift the values. 
+ * + * \retval CUBLAS_STATUS_NOT_INITIALIZED if cuBLASLt handle has not been initialized + * \retval CUBLAS_STATUS_INVALID_VALUE if parameters are in conflict or in an impossible configuration; e.g. + * when A is not NULL, but Adesc is NULL + * \retval CUBLAS_STATUS_NOT_SUPPORTED if current implementation on selected device doesn't support configured + * operation + * \retval CUBLAS_STATUS_ARCH_MISMATCH if configured operation cannot be run using selected device + * \retval CUBLAS_STATUS_EXECUTION_FAILED if cuda reported execution error from the device + * \retval CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtMatrixTransform(cublasLtHandle_t lightHandle, + cublasLtMatrixTransformDesc_t transformDesc, + const void* alpha, /* host or device pointer */ + const void* A, + cublasLtMatrixLayout_t Adesc, + const void* beta, /* host or device pointer */ + const void* B, + cublasLtMatrixLayout_t Bdesc, + void* C, + cublasLtMatrixLayout_t Cdesc, + cudaStream_t stream); + +/* ---------------------------------------------------------------------------------------*/ +/* Helper functions for cublasLtMatrixLayout_t */ +/* ---------------------------------------------------------------------------------------*/ + +/** Enum for data ordering */ +typedef enum { + /** Column-major + * + * Leading dimension is the stride (in elements) to the beginning of next column in memory. + */ + CUBLASLT_ORDER_COL = 0, + /** Row major + * + * Leading dimension is the stride (in elements) to the beginning of next row in memory. + */ + CUBLASLT_ORDER_ROW = 1, + /** Column-major ordered tiles of 32 columns. + * + * Leading dimension is the stride (in elements) to the beginning of next group of 32-columns. E.g. if matrix has 33 + * columns and 2 rows, ld must be at least (32) * 2 = 64. 
+ */ + CUBLASLT_ORDER_COL32 = 2, + /** Column-major ordered tiles of composite tiles with total 32 columns and 8 rows, tile composed of interleaved + * inner tiles of 4 columns within 4 even or odd rows in an alternating pattern. + * + * Leading dimension is the stride (in elements) to the beginning of the first 32 column x 8 row tile for the next + * 32-wide group of columns. E.g. if matrix has 33 columns and 1 row, ld must be at least (32 * 8) * 1 = 256. + */ + CUBLASLT_ORDER_COL4_4R2_8C = 3, + /** Column-major ordered tiles of composite tiles with total 32 columns ands 32 rows. + * Element offset within the tile is calculated as (((row%8)/2*4+row/8)*2+row%2)*32+col. + * + * Leading dimension is the stride (in elements) to the beginning of the first 32 column x 32 row tile for the next + * 32-wide group of columns. E.g. if matrix has 33 columns and 1 row, ld must be at least (32*32)*1 = 1024. + */ + CUBLASLT_ORDER_COL32_2R_4R4 = 4, + +} cublasLtOrder_t; + +/** Attributes of memory layout */ +typedef enum { + /** Data type, see cudaDataType. + * + * uint32_t + */ + CUBLASLT_MATRIX_LAYOUT_TYPE = 0, + + /** Memory order of the data, see cublasLtOrder_t. + * + * int32_t, default: CUBLASLT_ORDER_COL + */ + CUBLASLT_MATRIX_LAYOUT_ORDER = 1, + + /** Number of rows. + * + * Usually only values that can be expressed as int32_t are supported. + * + * uint64_t + */ + CUBLASLT_MATRIX_LAYOUT_ROWS = 2, + + /** Number of columns. + * + * Usually only values that can be expressed as int32_t are supported. + * + * uint64_t + */ + CUBLASLT_MATRIX_LAYOUT_COLS = 3, + + /** Matrix leading dimension. + * + * For CUBLASLT_ORDER_COL this is stride (in elements) of matrix column, for more details and documentation for + * other memory orders see documentation for cublasLtOrder_t values. + * + * Currently only non-negative values are supported, must be large enough so that matrix memory locations are not + * overlapping (e.g. 
greater or equal to CUBLASLT_MATRIX_LAYOUT_ROWS in case of CUBLASLT_ORDER_COL). + * + * int64_t; + */ + CUBLASLT_MATRIX_LAYOUT_LD = 4, + + /** Number of matmul operations to perform in the batch. + * + * See also CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT + * + * int32_t, default: 1 + */ + CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT = 5, + + /** Stride (in elements) to the next matrix for strided batch operation. + * + * When matrix type is planar-complex (CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET != 0), batch stride + * is interpreted by cublasLtMatmul() in number of real valued sub-elements. E.g. for data of type CUDA_C_16F, + * offset of 1024B is encoded as a stride of value 512 (since each element of the real and imaginary matrices + * is a 2B (16bit) floating point type). + * + * NOTE: A bug in cublasLtMatrixTransform() causes it to interpret the batch stride for a planar-complex matrix + * as if it was specified in number of complex elements. Therefore an offset of 1024B must be encoded as stride + * value 256 when calling cublasLtMatrixTransform() (each complex element is 4B with real and imaginary values 2B + * each). This behavior is expected to be corrected in the next major cuBLAS version. + * + * int64_t, default: 0 + */ + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET = 6, + + /** Stride (in bytes) to the imaginary plane for planar complex layout. + * + * int64_t, default: 0 - 0 means that layout is regular (real and imaginary parts of complex numbers are interleaved + * in memory in each element) + */ + CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET = 7, +} cublasLtMatrixLayoutAttribute_t; + +/** Internal. Do not use directly. + */ +cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutInit_internal( // + cublasLtMatrixLayout_t matLayout, + size_t size, + cudaDataType type, + uint64_t rows, + uint64_t cols, + int64_t ld); + +/** Initialize matrix layout descriptor in pre-allocated space. 
+ * + * \retval CUBLAS_STATUS_ALLOC_FAILED if size of the pre-allocated space is insufficient + * \retval CUBLAS_STATUS_SUCCESS if desciptor was created successfully + */ +static inline cublasStatus_t cublasLtMatrixLayoutInit( + cublasLtMatrixLayout_t matLayout, cudaDataType type, uint64_t rows, uint64_t cols, int64_t ld) { + return cublasLtMatrixLayoutInit_internal(matLayout, sizeof(*matLayout), type, rows, cols, ld); +} + +/** Create new matrix layout descriptor. + * + * \retval CUBLAS_STATUS_ALLOC_FAILED if memory could not be allocated + * \retval CUBLAS_STATUS_SUCCESS if desciptor was created successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutCreate( // + cublasLtMatrixLayout_t* matLayout, + cudaDataType type, + uint64_t rows, + uint64_t cols, + int64_t ld); + +/** Destroy matrix layout descriptor. + * + * \retval CUBLAS_STATUS_SUCCESS if operation was successful + */ +cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutDestroy(cublasLtMatrixLayout_t matLayout); + +/** Set matrix layout descriptor attribute. + * + * \param[in] matLayout The descriptor + * \param[in] attr The attribute + * \param[in] buf memory address containing the new value + * \param[in] sizeInBytes size of buf buffer for verification (in bytes) + * + * \retval CUBLAS_STATUS_INVALID_VALUE if buf is NULL or sizeInBytes doesn't match size of internal storage for + * selected attribute + * \retval CUBLAS_STATUS_SUCCESS if attribute was set successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutSetAttribute( // + cublasLtMatrixLayout_t matLayout, + cublasLtMatrixLayoutAttribute_t attr, + const void* buf, + size_t sizeInBytes); + +/** Get matrix layout descriptor attribute. + * + * \param[in] matLayout The descriptor + * \param[in] attr The attribute + * \param[out] buf memory address containing the new value + * \param[in] sizeInBytes size of buf buffer for verification (in bytes) + * \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. 
If sizeInBytes is non-zero: number of + * bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents + * + * \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero + * and buf is NULL or sizeInBytes doesn't match size of internal storage for + * selected attribute + * \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory + */ +cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutGetAttribute( // + cublasLtMatrixLayout_t matLayout, + cublasLtMatrixLayoutAttribute_t attr, + void* buf, + size_t sizeInBytes, + size_t* sizeWritten); + +/* ---------------------------------------------------------------------------------------*/ +/* Helper functions for cublasLtMatmulDesc_t */ +/* ---------------------------------------------------------------------------------------*/ + +/** Matmul descriptor attributes to define details of the operation. */ +typedef enum { + /** Compute type, see cudaDataType. Defines data type used for multiply and accumulate operations and the + * accumulator during matrix multiplication. + * + * int32_t + */ + CUBLASLT_MATMUL_DESC_COMPUTE_TYPE = 0, + + /** Scale type, see cudaDataType. Defines data type of alpha and beta. Accumulator and value from matrix C are + * typically converted to scale type before final scaling. Value is then converted from scale type to type of matrix + * D before being stored in memory. + * + * int32_t, default: same as CUBLASLT_MATMUL_DESC_COMPUTE_TYPE + */ + CUBLASLT_MATMUL_DESC_SCALE_TYPE = 1, + + /** Pointer mode of alpha and beta, see cublasLtPointerMode_t. When CUBLASLT_POINTER_MODE_DEVICE_VECTOR is in use, + * alpha/beta vector lenghts must match number of output matrix rows. + * + * int32_t, default: CUBLASLT_POINTER_MODE_HOST + */ + CUBLASLT_MATMUL_DESC_POINTER_MODE = 2, + + /** Transform of matrix A, see cublasOperation_t. 
+ * + * int32_t, default: CUBLAS_OP_N + */ + CUBLASLT_MATMUL_DESC_TRANSA = 3, + + /** Transform of matrix B, see cublasOperation_t. + * + * int32_t, default: CUBLAS_OP_N + */ + CUBLASLT_MATMUL_DESC_TRANSB = 4, + + /** Transform of matrix C, see cublasOperation_t. + * + * Currently only CUBLAS_OP_N is supported. + * + * int32_t, default: CUBLAS_OP_N + */ + CUBLASLT_MATMUL_DESC_TRANSC = 5, + + /** Matrix fill mode, see cublasFillMode_t. + * + * int32_t, default: CUBLAS_FILL_MODE_FULL + */ + CUBLASLT_MATMUL_DESC_FILL_MODE = 6, + + /** Epilogue function, see cublasLtEpilogue_t. + * + * uint32_t, default: CUBLASLT_EPILOGUE_DEFAULT + */ + CUBLASLT_MATMUL_DESC_EPILOGUE = 7, + + /** Bias or bias gradient vector pointer in the device memory. + * + * Bias case. See CUBLASLT_EPILOGUE_BIAS. + * For bias data type see CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE. + * + * Bias vector length must match matrix D rows count. + * + * Bias gradient case. See CUBLASLT_EPILOGUE_DRELU_BGRAD and CUBLASLT_EPILOGUE_DGELU_BGRAD. + * Bias gradient vector elements are the same type as the output elements + * (Ctype) with the exception of IMMA kernels (see above). + * + * Routines that don't dereference this pointer, like cublasLtMatmulAlgoGetHeuristic() + * depend on its value to determine expected pointer alignment. + * + * Bias case: const void *, default: NULL + * Bias gradient case: void *, default: NULL + */ + CUBLASLT_MATMUL_DESC_BIAS_POINTER = 8, + + /** Batch stride for bias or bias gradient vector. + * + * Used together with CUBLASLT_MATMUL_DESC_BIAS_POINTER when matrix D's CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT > 1. + * + * int64_t, default: 0 + */ + CUBLASLT_MATMUL_DESC_BIAS_BATCH_STRIDE = 10, + + /** Pointer for epilogue auxiliary buffer. + * + * - Output vector for ReLu bit-mask in forward pass when CUBLASLT_EPILOGUE_RELU_AUX + * or CUBLASLT_EPILOGUE_RELU_AUX_BIAS epilogue is used. + * - Input vector for ReLu bit-mask in backward pass when + * CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is used. 
+ * + * - Output of GELU input matrix in forward pass when + * CUBLASLT_EPILOGUE_GELU_AUX_BIAS epilogue is used. + * - Input of GELU input matrix for backward pass when + * CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue is used. + * + * For aux data type see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE. + * + * Routines that don't dereference this pointer, like cublasLtMatmulAlgoGetHeuristic() + * depend on its value to determine expected pointer alignment. + * + * Requires setting CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD attribute. + * + * Forward pass: void *, default: NULL + * Backward pass: const void *, default: NULL + */ + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER = 11, + + /** Leading dimension for epilogue auxiliary buffer. + * + * - ReLu bit-mask matrix leading dimension in elements (i.e. bits) + * when CUBLASLT_EPILOGUE_RELU_AUX, CUBLASLT_EPILOGUE_RELU_AUX_BIAS or CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is + * used. Must be divisible by 128 and be no less than the number of rows in the output matrix. + * + * - GELU input matrix leading dimension in elements + * when CUBLASLT_EPILOGUE_GELU_AUX_BIAS or CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue used. + * Must be divisible by 8 and be no less than the number of rows in the output matrix. + * + * int64_t, default: 0 + */ + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD = 12, + + /** Batch stride for epilogue auxiliary buffer. + * + * - ReLu bit-mask matrix batch stride in elements (i.e. bits) + * when CUBLASLT_EPILOGUE_RELU_AUX, CUBLASLT_EPILOGUE_RELU_AUX_BIAS or CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is + * used. Must be divisible by 128. + * + * - GELU input matrix batch stride in elements + * when CUBLASLT_EPILOGUE_GELU_AUX_BIAS or CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue used. + * Must be divisible by 8. + * + * int64_t, default: 0 + */ + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_BATCH_STRIDE = 13, + + /** Batch stride for alpha vector. 
+ * + * Used together with CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST when matrix D's + * CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT > 1. If CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO is set then + * CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE must be set to 0 as this mode doesnt supported batched alpha vector. + * + * int64_t, default: 0 + */ + CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE = 14, + + /** Number of SMs to target for parallel execution. Optimizes heuristics for execution on a different number of SMs + * when user expects a concurrent stream to be using some of the device resources. + * + * int32_t, default: 0 - use the number reported by the device. + */ + CUBLASLT_MATMUL_DESC_SM_COUNT_TARGET = 15, + + /** Device pointer to the scale factor value that converts data in matrix A to the compute data type range. + * + * The scaling factor value must have the same type as the compute type. + * + * If not specified, or set to NULL, the scaling factor is assumed to be 1. + * + * If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul() + * will return CUBLAS_INVALID_VALUE. + * + * const void *, default: NULL + */ + CUBLASLT_MATMUL_DESC_A_SCALE_POINTER = 17, + + /** Device pointer to the scale factor value to convert data in matrix B to compute data type range. + * + * The scaling factor value must have the same type as the compute type. + * + * If not specified, or set to NULL, the scaling factor is assumed to be 1. + * + * If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul() + * will return CUBLAS_INVALID_VALUE. + * + * const void *, default: NULL + */ + CUBLASLT_MATMUL_DESC_B_SCALE_POINTER = 18, + + /** Device pointer to the scale factor value to convert data in matrix C to compute data type range. + * + * The scaling factor value must have the same type as the compute type. 
+ * + * If not specified, or set to NULL, the scaling factor is assumed to be 1. + * + * If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul() + * will return CUBLAS_INVALID_VALUE. + * + * const void *, default: NULL + */ + CUBLASLT_MATMUL_DESC_C_SCALE_POINTER = 19, + + /** Device pointer to the scale factor value to convert data in matrix D to compute data type range. + * + * The scaling factor value must have the same type as the compute type. + * + * If not specified, or set to NULL, the scaling factor is assumed to be 1. + * + * If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul() + * will return CUBLAS_INVALID_VALUE. + * + * const void *, default: NULL + */ + CUBLASLT_MATMUL_DESC_D_SCALE_POINTER = 20, + + /** Device pointer to the memory location that on completion will be set to the maximum of absolute values in the + * output matrix. + * + * The computed value has the same type as the compute type. + * + * If not specified or set to NULL, the maximum absolute value is not computed. If set for an unsupported matrix + * data, scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_INVALID_VALUE. + * + * void *, default: NULL + */ + CUBLASLT_MATMUL_DESC_AMAX_D_POINTER = 21, + + /** Type of the data to be stored to the memory pointed to by CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + * + * If unset, the data type defaults to the type of elements of the output matrix with some exceptions, see details + * below. + * + * ReLu uses a bit-mask. + * + * GELU input matrix elements type is the same as the type of elements of + * the output matrix with some exceptions, see details below. + * + * For fp8 kernels with output type CUDA_R_8F_E4M3 the aux data type can be CUDA_R_8F_E4M3 or CUDA_R_16F with some + * restrictions. See https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulDescAttributes_t for more details. 
+ * + * If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul() + * will return CUBLAS_INVALID_VALUE. + * + * int32_t based on cudaDataType, default: -1 + */ + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE = 22, + + /** Device pointer to the scaling factor value to convert results from compute type data range to storage + * data range in the auxiliary matrix that is set via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + * + * The scaling factor value must have the same type as the compute type. + * + * If not specified, or set to NULL, the scaling factor is assumed to be 1. If set for an unsupported matrix data, + * scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_INVALID_VALUE. + * + * void *, default: NULL + */ + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_POINTER = 23, + + /** Device pointer to the memory location that on completion will be set to the maximum of absolute values in the + * buffer that is set via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + * + * The computed value has the same type as the compute type. + * + * If not specified or set to NULL, the maximum absolute value is not computed. If set for an unsupported matrix + * data, scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_INVALID_VALUE. + * + * void *, default: NULL + */ + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_AMAX_POINTER = 24, + + /** Flag for managing fp8 fast accumulation mode. + * When enabled, problem execution might be faster but at the cost of lower accuracy because intermediate results + * will not periodically be promoted to a higher precision. + * + * int8_t, default: 0 - fast accumulation mode is disabled. + */ + CUBLASLT_MATMUL_DESC_FAST_ACCUM = 25, + + /** Type of bias or bias gradient vector in the device memory. + * + * Bias case: see CUBLASLT_EPILOGUE_BIAS. 
+ * + * Bias vector elements are the same type as the elements of output matrix (Dtype) with the following exceptions: + * - IMMA kernels with computeType=CUDA_R_32I and Ctype=CUDA_R_8I where the bias vector elements + * are the same type as alpha, beta (CUBLASLT_MATMUL_DESC_SCALE_TYPE=CUDA_R_32F) + * - fp8 kernels with an output type of CUDA_R_32F, CUDA_R_8F_E4M3 or CUDA_R_8F_E5M2, See + * https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmul for details. + * + * int32_t based on cudaDataType, default: -1 + */ + CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE = 26, +} cublasLtMatmulDescAttributes_t; + +/** Internal. Do not use directly. + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulDescInit_internal( // + cublasLtMatmulDesc_t matmulDesc, + size_t size, + cublasComputeType_t computeType, + cudaDataType_t scaleType); + +/** Initialize matmul operation descriptor in pre-allocated space. + * + * \retval CUBLAS_STATUS_ALLOC_FAILED if size of the pre-allocated space is insufficient + * \retval CUBLAS_STATUS_SUCCESS if desciptor was initialized successfully + */ +static inline cublasStatus_t cublasLtMatmulDescInit( // + cublasLtMatmulDesc_t matmulDesc, + cublasComputeType_t computeType, + cudaDataType_t scaleType) { + return cublasLtMatmulDescInit_internal(matmulDesc, sizeof(*matmulDesc), computeType, scaleType); +} + +/** Create new matmul operation descriptor. + * + * \retval CUBLAS_STATUS_ALLOC_FAILED if memory could not be allocated + * \retval CUBLAS_STATUS_SUCCESS if desciptor was created successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulDescCreate(cublasLtMatmulDesc_t* matmulDesc, + cublasComputeType_t computeType, + cudaDataType_t scaleType); + +/** Destroy matmul operation descriptor. + * + * \retval CUBLAS_STATUS_SUCCESS if operation was successful + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulDescDestroy(cublasLtMatmulDesc_t matmulDesc); + +/** Set matmul operation descriptor attribute. 
+ * + * \param[in] matmulDesc The descriptor + * \param[in] attr The attribute + * \param[in] buf memory address containing the new value + * \param[in] sizeInBytes size of buf buffer for verification (in bytes) + * + * \retval CUBLAS_STATUS_INVALID_VALUE if buf is NULL or sizeInBytes doesn't match size of internal storage for + * selected attribute + * \retval CUBLAS_STATUS_SUCCESS if attribute was set successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulDescSetAttribute( // + cublasLtMatmulDesc_t matmulDesc, + cublasLtMatmulDescAttributes_t attr, + const void* buf, + size_t sizeInBytes); + +/** Get matmul operation descriptor attribute. + * + * \param[in] matmulDesc The descriptor + * \param[in] attr The attribute + * \param[out] buf memory address containing the new value + * \param[in] sizeInBytes size of buf buffer for verification (in bytes) + * \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of + * bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents + * + * \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero + * and buf is NULL or sizeInBytes doesn't match size of internal storage for + * selected attribute + * \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulDescGetAttribute( // + cublasLtMatmulDesc_t matmulDesc, + cublasLtMatmulDescAttributes_t attr, + void* buf, + size_t sizeInBytes, + size_t* sizeWritten); + +/* ---------------------------------------------------------------------------------------*/ +/* Helper functions for cublasLtMatrixTransformDesc_t */ +/* ---------------------------------------------------------------------------------------*/ + +/** Matrix transform descriptor attributes to define details of the operation. + */ +typedef enum { + /** Scale type, see cudaDataType. 
Inputs are converted to scale type for scaling and summation and results are then + * converted to output type to store in memory. + * + * int32_t + */ + CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, + + /** Pointer mode of alpha and beta, see cublasLtPointerMode_t. + * + * int32_t, default: CUBLASLT_POINTER_MODE_HOST + */ + CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, + + /** Transform of matrix A, see cublasOperation_t. + * + * int32_t, default: CUBLAS_OP_N + */ + CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, + + /** Transform of matrix B, see cublasOperation_t. + * + * int32_t, default: CUBLAS_OP_N + */ + CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSB, +} cublasLtMatrixTransformDescAttributes_t; + +/** Internal. Do not use directly. + */ +cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescInit_internal(cublasLtMatrixTransformDesc_t transformDesc, + size_t size, + cudaDataType scaleType); + +/** Initialize matrix transform operation descriptor in pre-allocated space. + * + * \retval CUBLAS_STATUS_ALLOC_FAILED if size of the pre-allocated space is insufficient + * \retval CUBLAS_STATUS_SUCCESS if desciptor was created successfully + */ +static inline cublasStatus_t cublasLtMatrixTransformDescInit(cublasLtMatrixTransformDesc_t transformDesc, + cudaDataType scaleType) { + return cublasLtMatrixTransformDescInit_internal(transformDesc, sizeof(*transformDesc), scaleType); +} + +/** Create new matrix transform operation descriptor. + * + * \retval CUBLAS_STATUS_ALLOC_FAILED if memory could not be allocated + * \retval CUBLAS_STATUS_SUCCESS if desciptor was created successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescCreate(cublasLtMatrixTransformDesc_t* transformDesc, + cudaDataType scaleType); + +/** Destroy matrix transform operation descriptor. 
+ * + * \retval CUBLAS_STATUS_SUCCESS if operation was successful + */ +cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescDestroy(cublasLtMatrixTransformDesc_t transformDesc); + +/** Set matrix transform operation descriptor attribute. + * + * \param[in] transformDesc The descriptor + * \param[in] attr The attribute + * \param[in] buf memory address containing the new value + * \param[in] sizeInBytes size of buf buffer for verification (in bytes) + * + * \retval CUBLAS_STATUS_INVALID_VALUE if buf is NULL or sizeInBytes doesn't match size of internal storage for + * selected attribute + * \retval CUBLAS_STATUS_SUCCESS if attribute was set successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescSetAttribute( // + cublasLtMatrixTransformDesc_t transformDesc, + cublasLtMatrixTransformDescAttributes_t attr, + const void* buf, + size_t sizeInBytes); + +/** Get matrix transform operation descriptor attribute. + * + * \param[in] transformDesc The descriptor + * \param[in] attr The attribute + * \param[out] buf memory address containing the new value + * \param[in] sizeInBytes size of buf buffer for verification (in bytes) + * \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. 
If sizeInBytes is non-zero: number + * of bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents + * + * \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero + * and buf is NULL or sizeInBytes doesn't match size of internal storage for + * selected attribute + * \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory + */ +cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescGetAttribute( // + cublasLtMatrixTransformDesc_t transformDesc, + cublasLtMatrixTransformDescAttributes_t attr, + void* buf, + size_t sizeInBytes, + size_t* sizeWritten); + +/** For computation with complex numbers, this enum allows to apply the Gauss Complexity reduction algorithm + */ +typedef enum { + CUBLASLT_3M_MODE_DISALLOWED = 0, + CUBLASLT_3M_MODE_ALLOWED = 1, +} cublasLt3mMode_t; + +/** Reduction scheme for portions of the dot-product calculated in parallel (a. k. a. "split - K"). + */ +typedef enum { + /** No reduction scheme, dot-product shall be performed in one sequence. + */ + CUBLASLT_REDUCTION_SCHEME_NONE = 0, + + /** Reduction is performed "in place" - using the output buffer (and output data type) and counters (in workspace) to + * guarantee the sequentiality. + */ + CUBLASLT_REDUCTION_SCHEME_INPLACE = 1, + + /** Intermediate results are stored in compute type in the workspace and reduced in a separate step. + */ + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE = 2, + + /** Intermediate results are stored in output type in the workspace and reduced in a separate step. + */ + CUBLASLT_REDUCTION_SCHEME_OUTPUT_TYPE = 4, + + CUBLASLT_REDUCTION_SCHEME_MASK = 0x7, +} cublasLtReductionScheme_t; + +/** Postprocessing options for the epilogue + */ +typedef enum { + /** No special postprocessing, just scale and quantize results if necessary. + */ + CUBLASLT_EPILOGUE_DEFAULT = 1, + + /** ReLu, apply ReLu point-wise transform to the results (x:=max(x, 0)). 
+ */ + CUBLASLT_EPILOGUE_RELU = 2, + + /** ReLu, apply ReLu point-wise transform to the results (x:=max(x, 0)). + * + * This epilogue mode produces an extra output, a ReLu bit-mask matrix, + * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + */ + CUBLASLT_EPILOGUE_RELU_AUX = (CUBLASLT_EPILOGUE_RELU | 128), + + /** Bias, apply (broadcasted) Bias from bias vector. Bias vector length must match matrix D rows, it must be packed + * (stride between vector elements is 1). Bias vector is broadcasted to all columns and added before applying final + * postprocessing. + */ + CUBLASLT_EPILOGUE_BIAS = 4, + + /** ReLu and Bias, apply Bias and then ReLu transform + */ + CUBLASLT_EPILOGUE_RELU_BIAS = (CUBLASLT_EPILOGUE_RELU | CUBLASLT_EPILOGUE_BIAS), + + /** ReLu and Bias, apply Bias and then ReLu transform + * + * This epilogue mode produces an extra output, a ReLu bit-mask matrix, + * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + */ + CUBLASLT_EPILOGUE_RELU_AUX_BIAS = (CUBLASLT_EPILOGUE_RELU_AUX | CUBLASLT_EPILOGUE_BIAS), + + /* ReLu gradient. Apply ReLu gradient to matmul output. Store ReLu gradient in the output matrix. + * + * This epilogue mode requires an extra input, + * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + */ + CUBLASLT_EPILOGUE_DRELU = 8 | 128, + + /* ReLu and Bias gradients. Apply independently ReLu and Bias gradient to + * matmul output. Store ReLu gradient in the output matrix, and Bias gradient + * in the auxiliary output (see CUBLASLT_MATMUL_DESC_BIAS_POINTER). + * + * This epilogue mode requires an extra input, + * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + */ + CUBLASLT_EPILOGUE_DRELU_BGRAD = CUBLASLT_EPILOGUE_DRELU | 16, + + /** GELU, apply GELU point-wise transform to the results (x:=GELU(x)). + */ + CUBLASLT_EPILOGUE_GELU = 32, + + /** GELU, apply GELU point-wise transform to the results (x:=GELU(x)). + * + * This epilogue mode outputs GELU input as a separate matrix (useful for training). + * See CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. 
+ */ + CUBLASLT_EPILOGUE_GELU_AUX = (CUBLASLT_EPILOGUE_GELU | 128), + + /** GELU and Bias, apply Bias and then GELU transform + */ + CUBLASLT_EPILOGUE_GELU_BIAS = (CUBLASLT_EPILOGUE_GELU | CUBLASLT_EPILOGUE_BIAS), + + /** GELU and Bias, apply Bias and then GELU transform + * + * This epilogue mode outputs GELU input as a separate matrix (useful for training). + * See CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + */ + CUBLASLT_EPILOGUE_GELU_AUX_BIAS = (CUBLASLT_EPILOGUE_GELU_AUX | CUBLASLT_EPILOGUE_BIAS), + + /* GELU gradient. Apply GELU gradient to matmul output. Store GELU gradient in the output matrix. + * + * This epilogue mode requires an extra input, + * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + */ + CUBLASLT_EPILOGUE_DGELU = 64 | 128, + + /* GELU and Bias gradients. Apply independently GELU and Bias gradient to + * matmul output. Store GELU gradient in the output matrix, and Bias gradient + * in the auxiliary output (see CUBLASLT_MATMUL_DESC_BIAS_POINTER). + * + * This epilogue mode requires an extra input, + * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER. + */ + CUBLASLT_EPILOGUE_DGELU_BGRAD = CUBLASLT_EPILOGUE_DGELU | 16, + + /** Bias gradient based on the input matrix A. + * + * The bias size corresponds to the number of rows of the matrix D. + * The reduction happens over the GEMM's "k" dimension. + * + * Stores Bias gradient in the auxiliary output + * (see CUBLASLT_MATMUL_DESC_BIAS_POINTER). + */ + CUBLASLT_EPILOGUE_BGRADA = 256, + + /** Bias gradient based on the input matrix B. + * + * The bias size corresponds to the number of columns of the matrix D. + * The reduction happens over the GEMM's "k" dimension. + * + * Stores Bias gradient in the auxiliary output + * (see CUBLASLT_MATMUL_DESC_BIAS_POINTER). 
+ */ + CUBLASLT_EPILOGUE_BGRADB = 512, +} cublasLtEpilogue_t; + +/** Matmul heuristic search mode + */ +typedef enum { + /** ask heuristics for best algo for given usecase + */ + CUBLASLT_SEARCH_BEST_FIT = 0, + /** only try to find best config for preconfigured algo id + */ + CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID = 1, + /** reserved for future use + */ + CUBLASLT_SEARCH_RESERVED_02 = 2, + /** reserved for future use + */ + CUBLASLT_SEARCH_RESERVED_03 = 3, + /** reserved for future use + */ + CUBLASLT_SEARCH_RESERVED_04 = 4, + /** reserved for future use + */ + CUBLASLT_SEARCH_RESERVED_05 = 5, +} cublasLtMatmulSearch_t; + +/** Algo search preference to fine tune the heuristic function. */ +typedef enum { + /** Search mode, see cublasLtMatmulSearch_t. + * + * uint32_t, default: CUBLASLT_SEARCH_BEST_FIT + */ + CUBLASLT_MATMUL_PREF_SEARCH_MODE = 0, + + /** Maximum allowed workspace size in bytes. + * + * uint64_t, default: 0 - no workspace allowed + */ + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES = 1, + + /** Math mode mask, see cublasMath_t. + * + * Only algorithms with CUBLASLT_ALGO_CAP_MATHMODE_IMPL that is not masked out by this attribute are allowed. + * + * uint32_t, default: 1 (allows both default and tensor op math) + * DEPRECATED, will be removed in a future release, see cublasLtNumericalImplFlags_t for replacement + */ + CUBLASLT_MATMUL_PREF_MATH_MODE_MASK = 2, + + /** Reduction scheme mask, see cublasLtReductionScheme_t. Filters heuristic result to only include algo configs that + * use one of the required modes. + * + * E.g. mask value of 0x03 will allow only INPLACE and COMPUTE_TYPE reduction schemes. + * + * uint32_t, default: CUBLASLT_REDUCTION_SCHEME_MASK (allows all reduction schemes) + */ + CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK = 3, + + /** Gaussian mode mask, see cublasLt3mMode_t. + * + * Only algorithms with CUBLASLT_ALGO_CAP_GAUSSIAN_IMPL that is not masked out by this attribute are allowed. 
+ * + * uint32_t, default: CUBLASLT_3M_MODE_ALLOWED (allows both gaussian and non-gaussian algorithms) + * DEPRECATED, will be removed in a future release, see cublasLtNumericalImplFlags_t for replacement + */ + CUBLASLT_MATMUL_PREF_GAUSSIAN_MODE_MASK = 4, + + /** Minimum buffer alignment for matrix A (in bytes). + * + * Selecting a smaller value will exclude algorithms that can not work with matrix A that is not as strictly aligned + * as they need. + * + * uint32_t, default: 256 + */ + CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES = 5, + + /** Minimum buffer alignment for matrix B (in bytes). + * + * Selecting a smaller value will exclude algorithms that can not work with matrix B that is not as strictly aligned + * as they need. + * + * uint32_t, default: 256 + */ + CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES = 6, + + /** Minimum buffer alignment for matrix C (in bytes). + * + * Selecting a smaller value will exclude algorithms that can not work with matrix C that is not as strictly aligned + * as they need. + * + * uint32_t, default: 256 + */ + CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES = 7, + + /** Minimum buffer alignment for matrix D (in bytes). + * + * Selecting a smaller value will exclude algorithms that can not work with matrix D that is not as strictly aligned + * as they need. + * + * uint32_t, default: 256 + */ + CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES = 8, + + /** Maximum wave count. + * + * See cublasLtMatmulHeuristicResult_t::wavesCount. + * + * Selecting a non-zero value will exclude algorithms that report device utilization higher than specified. + * + * float, default: 0.0f + */ + CUBLASLT_MATMUL_PREF_MAX_WAVES_COUNT = 9, + + /** Pointer mode mask, see cublasLtPointerModeMask_t. Filters heuristic result to only include algorithms that support + * all required modes. 
+ * + * uint32_t, default: (CUBLASLT_POINTER_MODE_MASK_HOST | CUBLASLT_POINTER_MODE_MASK_DEVICE) (only allows algorithms + * that support both regular host and device pointers) + */ + CUBLASLT_MATMUL_PREF_POINTER_MODE_MASK = 10, + + /** Epilogue selector mask, see cublasLtEpilogue_t. Filters heuristic result to only include algorithms that support + * all required operations. + * + * uint32_t, default: CUBLASLT_EPILOGUE_DEFAULT (only allows algorithms that support default epilogue) + */ + CUBLASLT_MATMUL_PREF_EPILOGUE_MASK = 11, + + /** Numerical implementation details mask, see cublasLtNumericalImplFlags_t. Filters heuristic result to only include + * algorithms that use the allowed implementations. + * + * uint64_t, default: uint64_t(-1) (allow everything) + */ + CUBLASLT_MATMUL_PREF_IMPL_MASK = 12, + + /** Number of SMs to target for parallel execution. Optimizes heuristics for execution on a different number of SMs + * when user expects a concurrent stream to be using some of the device resources. + * + * Overrides the SM count target set in the matrix multiplication descriptor (see cublasLtMatmulDescAttributes_t). + * + * int32_t, default: 0 - use the number reported by the device. + * DEPRECATED, will be removed in a future release, see cublasLtMatmulDescAttributes_t for replacement + */ + CUBLASLT_MATMUL_PREF_SM_COUNT_TARGET = 13, +} cublasLtMatmulPreferenceAttributes_t; + +/** Internal. Do not use directly. + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceInit_internal(cublasLtMatmulPreference_t pref, size_t size); + +/** Initialize matmul heuristic search preference descriptor in pre-allocated space. 
+ * + * \retval CUBLAS_STATUS_ALLOC_FAILED if size of the pre-allocated space is insufficient + * \retval CUBLAS_STATUS_SUCCESS if descriptor was created successfully + */ +static inline cublasStatus_t cublasLtMatmulPreferenceInit(cublasLtMatmulPreference_t pref) { + return cublasLtMatmulPreferenceInit_internal(pref, sizeof(*pref)); +} + +/** Create new matmul heuristic search preference descriptor. + * + * \retval CUBLAS_STATUS_ALLOC_FAILED if memory could not be allocated + * \retval CUBLAS_STATUS_SUCCESS if descriptor was created successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceCreate(cublasLtMatmulPreference_t* pref); + +/** Destroy matmul heuristic search preference descriptor. + * + * \retval CUBLAS_STATUS_SUCCESS if operation was successful + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceDestroy(cublasLtMatmulPreference_t pref); + +/** Set matmul heuristic search preference descriptor attribute. + * + * \param[in] pref The descriptor + * \param[in] attr The attribute + * \param[in] buf memory address containing the new value + * \param[in] sizeInBytes size of buf buffer for verification (in bytes) + * + * \retval CUBLAS_STATUS_INVALID_VALUE if buf is NULL or sizeInBytes doesn't match size of internal storage for + * selected attribute + * \retval CUBLAS_STATUS_SUCCESS if attribute was set successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceSetAttribute( // + cublasLtMatmulPreference_t pref, + cublasLtMatmulPreferenceAttributes_t attr, + const void* buf, + size_t sizeInBytes); + +/** Get matmul heuristic search preference descriptor attribute. + * + * \param[in] pref The descriptor + * \param[in] attr The attribute + * \param[out] buf memory address containing the new value + * \param[in] sizeInBytes size of buf buffer for verification (in bytes) + * \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS.
If sizeInBytes is non-zero: number of + * bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents + * + * \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero + * and buf is NULL or sizeInBytes doesn't match size of internal storage for + * selected attribute + * \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceGetAttribute( // + cublasLtMatmulPreference_t pref, + cublasLtMatmulPreferenceAttributes_t attr, + void* buf, + size_t sizeInBytes, + size_t* sizeWritten); + +/** Results structure used by cublasLtMatmulAlgoGetHeuristic. + * + * Holds returned configured algo descriptor and its runtime properties. + */ +typedef struct { + /** Matmul algorithm descriptor. + * + * Must be initialized with cublasLtMatmulAlgoInit() if preferences' CUBLASLT_MATMUL_PREF_SEARCH_MODE is set to + * CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID + */ + cublasLtMatmulAlgo_t algo; + + /** Actual size of workspace memory required. + */ + size_t workspaceSize; + + /** Result status, other fields are only valid if after call to cublasLtMatmulAlgoGetHeuristic() this member is set to + * CUBLAS_STATUS_SUCCESS. + */ + cublasStatus_t state; + + /** Waves count - a device utilization metric. + * + * wavesCount value of 1.0f suggests that when kernel is launched it will fully occupy the GPU. + */ + float wavesCount; + + int reserved[4]; +} cublasLtMatmulHeuristicResult_t; + +/** Query cublasLt heuristic for algorithm appropriate for given use case. + * + * \param[in] lightHandle Pointer to the allocated cuBLASLt handle for the cuBLASLt + * context. See cublasLtHandle_t. + * \param[in] operationDesc Handle to the matrix multiplication descriptor. + * \param[in] Adesc Handle to the layout descriptors for matrix A. + * \param[in] Bdesc Handle to the layout descriptors for matrix B.
+ * \param[in] Cdesc Handle to the layout descriptors for matrix C. + * \param[in] Ddesc Handle to the layout descriptors for matrix D. + * \param[in] preference Pointer to the structure holding the heuristic search + * preferences descriptor. See cublasLtMatmulPreference_t. + * \param[in] requestedAlgoCount Size of heuristicResultsArray (in elements) and requested + * maximum number of algorithms to return. + * \param[in, out] heuristicResultsArray Output algorithms and associated runtime characteristics, + * ordered in increasing estimated compute time. + * \param[out] returnAlgoCount The number of heuristicResultsArray elements written. + * + * \retval CUBLAS_STATUS_INVALID_VALUE if requestedAlgoCount is less or equal to zero + * \retval CUBLAS_STATUS_NOT_SUPPORTED if no heuristic function available for current configuration + * \retval CUBLAS_STATUS_SUCCESS if query was successful, inspect + * heuristicResultsArray[0 to (returnAlgoCount - 1)].state + * for detailed status of results + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoGetHeuristic(cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t operationDesc, + cublasLtMatrixLayout_t Adesc, + cublasLtMatrixLayout_t Bdesc, + cublasLtMatrixLayout_t Cdesc, + cublasLtMatrixLayout_t Ddesc, + cublasLtMatmulPreference_t preference, + int requestedAlgoCount, + cublasLtMatmulHeuristicResult_t heuristicResultsArray[], + int* returnAlgoCount); + +/* ---------------------------------------------------------------------------------------*/ +/* Lower level API to be able to implement own Heuristic and Find routines */ +/* ---------------------------------------------------------------------------------------*/ + +/** Routine to get all algo IDs that can potentially run + * + * \param[in] requestedAlgoCount requested number of algos (must be less or equal to size of algoIdsArray + * (in elements)) \param[out] algoIdsArray array to write algoIds to \param[out] returnAlgoCount number of algoIds + * actually written + * + * \retval
CUBLAS_STATUS_INVALID_VALUE if requestedAlgoCount is less or equal to zero + * \retval CUBLAS_STATUS_SUCCESS if query was successful, inspect returnAlgoCount to get actual number of IDs + * available + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoGetIds(cublasLtHandle_t lightHandle, + cublasComputeType_t computeType, + cudaDataType_t scaleType, + cudaDataType_t Atype, + cudaDataType_t Btype, + cudaDataType_t Ctype, + cudaDataType_t Dtype, + int requestedAlgoCount, + int algoIdsArray[], + int* returnAlgoCount); + +/** Initialize algo structure + * + * \retval CUBLAS_STATUS_INVALID_VALUE if algo is NULL or algoId is outside of recognized range + * \retval CUBLAS_STATUS_NOT_SUPPORTED if algoId is not supported for given combination of data types + * \retval CUBLAS_STATUS_SUCCESS if the structure was successfully initialized + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoInit(cublasLtHandle_t lightHandle, + cublasComputeType_t computeType, + cudaDataType_t scaleType, + cudaDataType_t Atype, + cudaDataType_t Btype, + cudaDataType_t Ctype, + cudaDataType_t Dtype, + int algoId, + cublasLtMatmulAlgo_t* algo); + +/** Check configured algo descriptor for correctness and support on current device. + * + * Result includes required workspace size and calculated wave count. + * + * CUBLAS_STATUS_SUCCESS doesn't fully guarantee algo will run (will fail if e.g. buffers are not correctly aligned); + * but if cublasLtMatmulAlgoCheck fails, the algo will not run. 
+ * + * \param[in] algo algo configuration to check + * \param[out] result result structure to report algo runtime characteristics; algo field is never updated + * + * \retval CUBLAS_STATUS_INVALID_VALUE if matrix layout descriptors or operation descriptor don't match algo + * descriptor + * \retval CUBLAS_STATUS_NOT_SUPPORTED if algo configuration or data type combination is not currently supported on + * given device + * \retval CUBLAS_STATUS_ARCH_MISMATCH if algo configuration cannot be run using the selected device + * \retval CUBLAS_STATUS_SUCCESS if check was successful + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoCheck( // + cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t operationDesc, + cublasLtMatrixLayout_t Adesc, + cublasLtMatrixLayout_t Bdesc, + cublasLtMatrixLayout_t Cdesc, + cublasLtMatrixLayout_t Ddesc, + const cublasLtMatmulAlgo_t* algo, ///< may point to result->algo + cublasLtMatmulHeuristicResult_t* result); + +/** Capabilities Attributes that can be retrieved from an initialized Algo structure + */ +typedef enum { + /** support for split K, see CUBLASLT_ALGO_CONFIG_SPLITK_NUM + * + * int32_t, 0 means no support, supported otherwise + */ + CUBLASLT_ALGO_CAP_SPLITK_SUPPORT = 0, + /** reduction scheme mask, see cublasLtReductionScheme_t; shows supported reduction schemes, if reduction scheme is + * not masked out it is supported. + * + * e.g. int isReductionSchemeComputeTypeSupported ? (reductionSchemeMask & CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE) == + * CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE ? 
1 : 0; + * + * uint32_t + */ + CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK = 1, + /** support for cta swizzling, see CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING + * + * uint32_t, 0 means no support, 1 means supported value of 1, other values are reserved + */ + CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT = 2, + /** support strided batch + * + * int32_t, 0 means no support, supported otherwise + */ + CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT = 3, + /** support results out of place (D != C in D = alpha.A.B + beta.C) + * + * int32_t, 0 means no support, supported otherwise + */ + CUBLASLT_ALGO_CAP_OUT_OF_PLACE_RESULT_SUPPORT = 4, + /** syrk/herk support (on top of regular gemm) + * + * int32_t, 0 means no support, supported otherwise + */ + CUBLASLT_ALGO_CAP_UPLO_SUPPORT = 5, + /** tile ids possible to use, see cublasLtMatmulTile_t; if no tile ids are supported use + * CUBLASLT_MATMUL_TILE_UNDEFINED + * + * use cublasLtMatmulAlgoCapGetAttribute() with sizeInBytes=0 to query actual count + * + * array of uint32_t + */ + CUBLASLT_ALGO_CAP_TILE_IDS = 6, + /** custom option range is from 0 to CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX (inclusive), see + * CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION + * + * int32_t + */ + CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX = 7, + /** whether algorithm is using regular compute or tensor operations + * + * int32_t 0 means regular compute, 1 means tensor operations; + * DEPRECATED + */ + CUBLASLT_ALGO_CAP_MATHMODE_IMPL = 8, + /** whether algorithm implements gaussian optimization of complex matrix multiplication, see cublasMath_t + * + * int32_t 0 means regular compute, 1 means gaussian; + * DEPRECATED + */ + CUBLASLT_ALGO_CAP_GAUSSIAN_IMPL = 9, + /** whether algorithm supports custom (not COL or ROW memory order), see cublasLtOrder_t + * + * int32_t 0 means only COL and ROW memory order is allowed, non-zero means that algo might have different + * requirements; + */ + CUBLASLT_ALGO_CAP_CUSTOM_MEMORY_ORDER = 10, + + /** bitmask enumerating pointer modes algorithm supports + * 
+ * uint32_t, see cublasLtPointerModeMask_t + */ + CUBLASLT_ALGO_CAP_POINTER_MODE_MASK = 11, + + /** bitmask enumerating kinds of postprocessing algorithm supports in the epilogue + * + * uint32_t, see cublasLtEpilogue_t + */ + CUBLASLT_ALGO_CAP_EPILOGUE_MASK = 12, + /** stages ids possible to use, see cublasLtMatmulStages_t; if no stages ids are supported use + * CUBLASLT_MATMUL_STAGES_UNDEFINED + * + * use cublasLtMatmulAlgoCapGetAttribute() with sizeInBytes=0 to query actual count + * + * array of uint32_t + */ + CUBLASLT_ALGO_CAP_STAGES_IDS = 13, + /** support for negative ld for all of the matrices + * + * int32_t 0 means no support, supported otherwise + */ + CUBLASLT_ALGO_CAP_LD_NEGATIVE = 14, + /** details about algorithm's implementation that affect its numerical behavior + * + * uint64_t, see cublasLtNumericalImplFlags_t + */ + CUBLASLT_ALGO_CAP_NUMERICAL_IMPL_FLAGS = 15, + /** minimum alignment required for A matrix in bytes + * (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order) + * + * uint32_t + */ + CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_A_BYTES = 16, + /** minimum alignment required for B matrix in bytes + * (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order) + * + * uint32_t + */ + CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_B_BYTES = 17, + /** minimum alignment required for C matrix in bytes + * (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order) + * + * uint32_t + */ + CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_C_BYTES = 18, + /** minimum alignment required for D matrix in bytes + * (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order) + * + * uint32_t + */ + CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_D_BYTES = 19, +} cublasLtMatmulAlgoCapAttributes_t; + +/** Get algo capability attribute. + * + * E.g.
to get list of supported Tile IDs: + * cublasLtMatmulTile_t tiles[CUBLASLT_MATMUL_TILE_END]; + * size_t num_tiles, size_written; + * if (cublasLtMatmulAlgoCapGetAttribute(algo, CUBLASLT_ALGO_CAP_TILE_IDS, tiles, sizeof(tiles), &size_written) == + * CUBLAS_STATUS_SUCCESS) { num_tiles = size_written / sizeof(tiles[0]); + * } + * + * \param[in] algo The algo descriptor + * \param[in] attr The attribute + * \param[out] buf memory address containing the new value + * \param[in] sizeInBytes size of buf buffer for verification (in bytes) + * \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of + * bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents + * + * \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero + * and buf is NULL or sizeInBytes doesn't match size of internal storage for + * selected attribute + * \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoCapGetAttribute(const cublasLtMatmulAlgo_t* algo, + cublasLtMatmulAlgoCapAttributes_t attr, + void* buf, + size_t sizeInBytes, + size_t* sizeWritten); + +/** Algo Configuration Attributes that can be set according to the Algo capabilities + */ +typedef enum { + /** algorithm index, see cublasLtMatmulAlgoGetIds() + * + * readonly, set by cublasLtMatmulAlgoInit() + * int32_t + */ + CUBLASLT_ALGO_CONFIG_ID = 0, + /** tile id, see cublasLtMatmulTile_t + * + * uint32_t, default: CUBLASLT_MATMUL_TILE_UNDEFINED + */ + CUBLASLT_ALGO_CONFIG_TILE_ID = 1, + /** Number of K splits. If the number of K splits is greater than one, SPLITK_NUM parts + * of matrix multiplication will be computed in parallel.
The results will be accumulated + * according to CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME + * + * int32_t, default: 1 + */ + CUBLASLT_ALGO_CONFIG_SPLITK_NUM = 2, + /** reduction scheme, see cublasLtReductionScheme_t + * + * uint32_t, default: CUBLASLT_REDUCTION_SCHEME_NONE + */ + CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME = 3, + /** cta swizzling, change mapping from CUDA grid coordinates to parts of the matrices + * + * possible values: 0, 1, other values reserved + * + * uint32_t, default: 0 + */ + CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING = 4, + /** custom option, each algorithm can support some custom options that don't fit description of the other config + * attributes, see CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX to get accepted range for any specific case + * + * uint32_t, default: 0 + */ + CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION = 5, + /** stages id, see cublasLtMatmulStages_t + * + * uint32_t, default: CUBLASLT_MATMUL_STAGES_UNDEFINED + */ + CUBLASLT_ALGO_CONFIG_STAGES_ID = 6, + /** inner shape id, see cublasLtMatmulInnerShape_t + * + * uint16_t, default: 0 (CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED) + */ + CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID = 7, + /** Thread Block Cluster shape id, see cublasLtClusterShape_t. Defines cluster size to use. + * + * uint16_t, default: 0 (CUBLASLT_CLUSTER_SHAPE_AUTO) + */ + CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID = 8, +} cublasLtMatmulAlgoConfigAttributes_t; + +/** Set algo configuration attribute. 
+ * + * \param[in] algo The algo descriptor + * \param[in] attr The attribute + * \param[in] buf memory address containing the new value + * \param[in] sizeInBytes size of buf buffer for verification (in bytes) + * + * \retval CUBLAS_STATUS_INVALID_VALUE if buf is NULL or sizeInBytes doesn't match size of internal storage for + * selected attribute + * \retval CUBLAS_STATUS_SUCCESS if attribute was set successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoConfigSetAttribute(cublasLtMatmulAlgo_t* algo, + cublasLtMatmulAlgoConfigAttributes_t attr, + const void* buf, + size_t sizeInBytes); + +/** Get algo configuration attribute. + * + * \param[in] algo The algo descriptor + * \param[in] attr The attribute + * \param[out] buf memory address containing the new value + * \param[in] sizeInBytes size of buf buffer for verification (in bytes) + * \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of + * bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents + * + * \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero + * and buf is NULL or sizeInBytes doesn't match size of internal storage for + * selected attribute + * \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory + */ +cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoConfigGetAttribute(const cublasLtMatmulAlgo_t* algo, + cublasLtMatmulAlgoConfigAttributes_t attr, + void* buf, + size_t sizeInBytes, + size_t* sizeWritten); + +/** Experimental: Logger callback type. + */ +typedef void (*cublasLtLoggerCallback_t)(int logLevel, const char* functionName, const char* message); + +/** Experimental: Logger callback setter. 
+ * + * \param[in] callback a user defined callback function to be called by the logger + * + * \retval CUBLAS_STATUS_SUCCESS if callback was set successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtLoggerSetCallback(cublasLtLoggerCallback_t callback); + +/** Experimental: Log file setter. + * + * \param[in] file an open file with write permissions + * + * \retval CUBLAS_STATUS_SUCCESS if log file was set successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtLoggerSetFile(FILE* file); + +/** Experimental: Open log file. + * + * \param[in] logFile log file path. if the log file does not exist, it will be created + * + * \retval CUBLAS_STATUS_SUCCESS if log file was created successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtLoggerOpenFile(const char* logFile); + +/** Experimental: Log level setter. + * + * \param[in] level log level, should be one of the following: + * 0. Off + * 1. Errors + * 2. Performance Trace + * 3. Performance Hints + * 4. Heuristics Trace + * 5. API Trace + * + * \retval CUBLAS_STATUS_INVALID_VALUE if log level is not one of the above levels + * + * \retval CUBLAS_STATUS_SUCCESS if log level was set successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtLoggerSetLevel(int level); + +/** Experimental: Log mask setter. + * + * \param[in] mask log mask, should be a combination of the following masks: + * 0. Off + * 1. Errors + * 2. Performance Trace + * 4. Performance Hints + * 8. Heuristics Trace + * 16. API Trace + * + * \retval CUBLAS_STATUS_SUCCESS if log mask was set successfully + */ +cublasStatus_t CUBLASWINAPI cublasLtLoggerSetMask(int mask); + +/** Experimental: Disable logging for the entire session. 
+ * + * \retval CUBLAS_STATUS_SUCCESS if disabled logging + */ +cublasStatus_t CUBLASWINAPI cublasLtLoggerForceDisable(); + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h new file mode 100644 index 0000000000000000000000000000000000000000..aec546259cff3ece866faf6dd35f5909a82d4d23 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h @@ -0,0 +1,3478 @@ +/* + * Copyright 1993-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * This is the public header file for the CUBLAS library, defining the API + * + * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines) + * on top of the CUDA runtime. 
+ */ + +#if !defined(CUBLAS_API_H_) +#define CUBLAS_API_H_ + +#ifndef CUBLASWINAPI +#ifdef _WIN32 +#define CUBLASWINAPI __stdcall +#else +#define CUBLASWINAPI +#endif +#endif + +#ifndef CUBLASAPI +#error "This file should not be included without defining CUBLASAPI" +#endif + +#include "driver_types.h" +#include "cuComplex.h" /* import complex data type */ + +#include <cuda_fp16.h> +#include <cuda_bf16.h> + +#include <library_types.h> + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +#define CUBLAS_VER_MAJOR 11 +#define CUBLAS_VER_MINOR 11 +#define CUBLAS_VER_PATCH 3 +#define CUBLAS_VER_BUILD 6 +#define CUBLAS_VERSION (CUBLAS_VER_MAJOR * 10000 + CUBLAS_VER_MINOR * 100 + CUBLAS_VER_PATCH) + +/* CUBLAS status type returns */ +typedef enum { + CUBLAS_STATUS_SUCCESS = 0, + CUBLAS_STATUS_NOT_INITIALIZED = 1, + CUBLAS_STATUS_ALLOC_FAILED = 3, + CUBLAS_STATUS_INVALID_VALUE = 7, + CUBLAS_STATUS_ARCH_MISMATCH = 8, + CUBLAS_STATUS_MAPPING_ERROR = 11, + CUBLAS_STATUS_EXECUTION_FAILED = 13, + CUBLAS_STATUS_INTERNAL_ERROR = 14, + CUBLAS_STATUS_NOT_SUPPORTED = 15, + CUBLAS_STATUS_LICENSE_ERROR = 16 +} cublasStatus_t; + +typedef enum { CUBLAS_FILL_MODE_LOWER = 0, CUBLAS_FILL_MODE_UPPER = 1, CUBLAS_FILL_MODE_FULL = 2 } cublasFillMode_t; + +typedef enum { CUBLAS_DIAG_NON_UNIT = 0, CUBLAS_DIAG_UNIT = 1 } cublasDiagType_t; + +typedef enum { CUBLAS_SIDE_LEFT = 0, CUBLAS_SIDE_RIGHT = 1 } cublasSideMode_t; + +typedef enum { + CUBLAS_OP_N = 0, + CUBLAS_OP_T = 1, + CUBLAS_OP_C = 2, + CUBLAS_OP_HERMITAN = 2, /* synonym of CUBLAS_OP_C */ + CUBLAS_OP_CONJG = 3 /* conjugate, placeholder - not supported in the current release */ +} cublasOperation_t; + +typedef enum { CUBLAS_POINTER_MODE_HOST = 0, CUBLAS_POINTER_MODE_DEVICE = 1 } cublasPointerMode_t; + +typedef enum { CUBLAS_ATOMICS_NOT_ALLOWED = 0, CUBLAS_ATOMICS_ALLOWED = 1 } cublasAtomicsMode_t; + +/*For different GEMM algorithm */ +typedef enum { + CUBLAS_GEMM_DFALT = -1, + CUBLAS_GEMM_DEFAULT = -1, + CUBLAS_GEMM_ALGO0 = 0, + CUBLAS_GEMM_ALGO1 = 1, +
CUBLAS_GEMM_ALGO2 = 2, + CUBLAS_GEMM_ALGO3 = 3, + CUBLAS_GEMM_ALGO4 = 4, + CUBLAS_GEMM_ALGO5 = 5, + CUBLAS_GEMM_ALGO6 = 6, + CUBLAS_GEMM_ALGO7 = 7, + CUBLAS_GEMM_ALGO8 = 8, + CUBLAS_GEMM_ALGO9 = 9, + CUBLAS_GEMM_ALGO10 = 10, + CUBLAS_GEMM_ALGO11 = 11, + CUBLAS_GEMM_ALGO12 = 12, + CUBLAS_GEMM_ALGO13 = 13, + CUBLAS_GEMM_ALGO14 = 14, + CUBLAS_GEMM_ALGO15 = 15, + CUBLAS_GEMM_ALGO16 = 16, + CUBLAS_GEMM_ALGO17 = 17, + CUBLAS_GEMM_ALGO18 = 18, // sliced 32x32 + CUBLAS_GEMM_ALGO19 = 19, // sliced 64x32 + CUBLAS_GEMM_ALGO20 = 20, // sliced 128x32 + CUBLAS_GEMM_ALGO21 = 21, // sliced 32x32 -splitK + CUBLAS_GEMM_ALGO22 = 22, // sliced 64x32 -splitK + CUBLAS_GEMM_ALGO23 = 23, // sliced 128x32 -splitK + CUBLAS_GEMM_DEFAULT_TENSOR_OP = 99, + CUBLAS_GEMM_DFALT_TENSOR_OP = 99, + CUBLAS_GEMM_ALGO0_TENSOR_OP = 100, + CUBLAS_GEMM_ALGO1_TENSOR_OP = 101, + CUBLAS_GEMM_ALGO2_TENSOR_OP = 102, + CUBLAS_GEMM_ALGO3_TENSOR_OP = 103, + CUBLAS_GEMM_ALGO4_TENSOR_OP = 104, + CUBLAS_GEMM_ALGO5_TENSOR_OP = 105, + CUBLAS_GEMM_ALGO6_TENSOR_OP = 106, + CUBLAS_GEMM_ALGO7_TENSOR_OP = 107, + CUBLAS_GEMM_ALGO8_TENSOR_OP = 108, + CUBLAS_GEMM_ALGO9_TENSOR_OP = 109, + CUBLAS_GEMM_ALGO10_TENSOR_OP = 110, + CUBLAS_GEMM_ALGO11_TENSOR_OP = 111, + CUBLAS_GEMM_ALGO12_TENSOR_OP = 112, + CUBLAS_GEMM_ALGO13_TENSOR_OP = 113, + CUBLAS_GEMM_ALGO14_TENSOR_OP = 114, + CUBLAS_GEMM_ALGO15_TENSOR_OP = 115 +} cublasGemmAlgo_t; + +/*Enum for default math mode/tensor operation*/ +typedef enum { + CUBLAS_DEFAULT_MATH = 0, + + /* deprecated, same effect as using CUBLAS_COMPUTE_32F_FAST_16F, will be removed in a future release */ + CUBLAS_TENSOR_OP_MATH = 1, + + /* same as using matching _PEDANTIC compute type when using cublas<T>routine calls or cublasEx() calls with + cudaDataType as compute type */ + CUBLAS_PEDANTIC_MATH = 2, + + /* allow accelerating single precision routines using TF32 tensor cores */ + CUBLAS_TF32_TENSOR_OP_MATH = 3, + + /* flag to force any reductions to use the accumulator type and not output type in case of
mixed precision routines + with lower size output type */ + CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION = 16, +} cublasMath_t; + +/* For backward compatibility purposes */ +typedef cudaDataType cublasDataType_t; + +/* Enum for compute type + * + * - default types provide best available performance using all available hardware features + * and guarantee internal storage precision with at least the same precision and range; + * - _PEDANTIC types ensure standard arithmetic and exact specified internal storage format; + * - _FAST types allow for some loss of precision to enable higher throughput arithmetic. + */ +typedef enum { + CUBLAS_COMPUTE_16F = 64, /* half - default */ + CUBLAS_COMPUTE_16F_PEDANTIC = 65, /* half - pedantic */ + CUBLAS_COMPUTE_32F = 68, /* float - default */ + CUBLAS_COMPUTE_32F_PEDANTIC = 69, /* float - pedantic */ + CUBLAS_COMPUTE_32F_FAST_16F = 74, /* float - fast, allows down-converting inputs to half or TF32 */ + CUBLAS_COMPUTE_32F_FAST_16BF = 75, /* float - fast, allows down-converting inputs to bfloat16 or TF32 */ + CUBLAS_COMPUTE_32F_FAST_TF32 = 77, /* float - fast, allows down-converting inputs to TF32 */ + CUBLAS_COMPUTE_64F = 70, /* double - default */ + CUBLAS_COMPUTE_64F_PEDANTIC = 71, /* double - pedantic */ + CUBLAS_COMPUTE_32I = 72, /* signed 32-bit int - default */ + CUBLAS_COMPUTE_32I_PEDANTIC = 73, /* signed 32-bit int - pedantic */ +} cublasComputeType_t; + +/* Opaque structure holding CUBLAS library context */ +struct cublasContext; +typedef struct cublasContext* cublasHandle_t; + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t* handle); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, int* version); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, int* value); +CUBLASAPI size_t CUBLASWINAPI cublasGetCudartVersion(void); + +CUBLASAPI cublasStatus_t 
CUBLASWINAPI cublasSetWorkspace_v2(cublasHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle, cudaStream_t* streamId); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t* mode); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t mode); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t* mode); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, cublasMath_t* mode); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetSmCountTarget(cublasHandle_t handle, int* smCountTarget); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetSmCountTarget(cublasHandle_t handle, int smCountTarget); + +CUBLASAPI const char* CUBLASWINAPI cublasGetStatusName(cublasStatus_t status); +CUBLASAPI const char* CUBLASWINAPI cublasGetStatusString(cublasStatus_t status); + +/* Cublas logging */ +typedef void (*cublasLogCallback)(const char* msg); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasLoggerConfigure(int logIsOn, + int logToStdOut, + int logToStdErr, + const char* logFileName); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetLoggerCallback(cublasLogCallback userCallback); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetLoggerCallback(cublasLogCallback* userCallback); + +/* + * cublasStatus_t + * cublasSetVector (int n, int elemSize, const void *x, int incx, + * void *y, int incy) + * + * copies n elements from a vector x in CPU memory space to a vector y + * in GPU memory space. 
Elements in both vectors are assumed to have a + * size of elemSize bytes. Storage spacing between consecutive elements + * is incx for the source vector x and incy for the destination vector + * y. In general, y points to an object, or part of an object, allocated + * via cublasAlloc(). Column major format for two-dimensional matrices + * is assumed throughout CUBLAS. Therefore, if the increment for a vector + * is equal to 1, this accesses a column vector while using an increment + * equal to the leading dimension of the respective matrix accesses a + * row vector. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void* x, int incx, void* devicePtr, int incy); + +/* + * cublasStatus_t + * cublasGetVector (int n, int elemSize, const void *x, int incx, + * void *y, int incy) + * + * copies n elements from a vector x in GPU memory space to a vector y + * in CPU memory space. Elements in both vectors are assumed to have a + * size of elemSize bytes. Storage spacing between consecutive elements + * is incx for the source vector x and incy for the destination vector + * y. In general, x points to an object, or part of an object, allocated + * via cublasAlloc(). Column major format for two-dimensional matrices + * is assumed throughout CUBLAS. Therefore, if the increment for a vector + * is equal to 1, this accesses a column vector while using an increment + * equal to the leading dimension of the respective matrix accesses a + * row vector. 
+ * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void* x, int incx, void* y, int incy); + +/* + * cublasStatus_t + * cublasSetMatrix (int rows, int cols, int elemSize, const void *A, + * int lda, void *B, int ldb) + * + * copies a tile of rows x cols elements from a matrix A in CPU memory + * space to a matrix B in GPU memory space. Each element requires storage + * of elemSize bytes. Both matrices are assumed to be stored in column + * major format, with the leading dimension (i.e. number of rows) of + * source matrix A provided in lda, and the leading dimension of matrix B + * provided in ldb. In general, B points to an object, or part of an + * object, that was allocated via cublasAlloc(). + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if rows or cols < 0, or elemSize, lda, or + * ldb <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb); + +/* + * cublasStatus_t + * cublasGetMatrix (int rows, int cols, int elemSize, const void *A, + * int lda, void *B, int ldb) + * + * copies a tile of rows x cols elements from a matrix A in GPU memory + * space to a matrix B in CPU memory space. Each element requires storage + * of elemSize bytes. Both matrices are assumed to be stored in column + * major format, with the leading dimension (i.e. 
number of rows) of + * source matrix A provided in lda, and the leading dimension of matrix B + * provided in ldb. In general, A points to an object, or part of an + * object, that was allocated via cublasAlloc(). + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if rows, cols, elemSize, lda, or ldb <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb); + +/* + * cublasStatus_t + * cublasSetVectorAsync ( int n, int elemSize, const void *x, int incx, + * void *y, int incy, cudaStream_t stream ); + * + * cublasSetVectorAsync has the same functionality as cublasSetVector + * but the transfer is done asynchronously within the CUDA stream passed + * in parameter. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasSetVectorAsync( + int n, int elemSize, const void* hostPtr, int incx, void* devicePtr, int incy, cudaStream_t stream); +/* + * cublasStatus_t + * cublasGetVectorAsync( int n, int elemSize, const void *x, int incx, + * void *y, int incy, cudaStream_t stream) + * + * cublasGetVectorAsync has the same functionality as cublasGetVector + * but the transfer is done asynchronously within the CUDA stream passed + * in parameter. 
+ * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasGetVectorAsync( + int n, int elemSize, const void* devicePtr, int incx, void* hostPtr, int incy, cudaStream_t stream); + +/* + * cublasStatus_t + * cublasSetMatrixAsync (int rows, int cols, int elemSize, const void *A, + * int lda, void *B, int ldb, cudaStream_t stream) + * + * cublasSetMatrixAsync has the same functionality as cublasSetMatrix + * but the transfer is done asynchronously within the CUDA stream passed + * in parameter. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if rows or cols < 0, or elemSize, lda, or + * ldb <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI +cublasSetMatrixAsync(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb, cudaStream_t stream); + +/* + * cublasStatus_t + * cublasGetMatrixAsync (int rows, int cols, int elemSize, const void *A, + * int lda, void *B, int ldb, cudaStream_t stream) + * + * cublasGetMatrixAsync has the same functionality as cublasGetMatrix + * but the transfer is done asynchronously within the CUDA stream passed + * in parameter. 
+ * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if rows, cols, elemSize, lda, or ldb <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI +cublasGetMatrixAsync(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb, cudaStream_t stream); + +CUBLASAPI void CUBLASWINAPI cublasXerbla(const char* srName, int info); +/* ---------------- CUBLAS BLAS1 functions ---------------- */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, + int n, + const void* x, + cudaDataType xType, + int incx, + void* result, + cudaDataType resultType, + cudaDataType executionType); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasSnrm2_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasDnrm2_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasScnrm2_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDznrm2_v2( + cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotEx(cublasHandle_t handle, + int n, + const void* x, + cudaDataType xType, + int incx, + const void* y, + cudaDataType yType, + int incy, + void* result, + cudaDataType resultType, + cudaDataType executionType); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotcEx(cublasHandle_t handle, + int n, + const void* x, + cudaDataType xType, + int incx, + const void* y, + cudaDataType yType, + int incy, + void* result, + cudaDataType 
resultType, + cudaDataType executionType); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdot_v2(cublasHandle_t handle, + int n, + const float* x, + int incx, + const float* y, + int incy, + float* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdot_v2(cublasHandle_t handle, + int n, + const double* x, + int incx, + const double* y, + int incy, + double* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotu_v2(cublasHandle_t handle, + int n, + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotc_v2(cublasHandle_t handle, + int n, + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScalEx(cublasHandle_t handle, + int n, + const void* alpha, /* host or device pointer */ + cudaDataType alphaType, + void* x, + cudaDataType xType, + int incx, + cudaDataType executionType); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSscal_v2(cublasHandle_t handle, + int n, + const float* alpha, /* host or device pointer */ + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDscal_v2(cublasHandle_t handle, + int n, + const double* alpha, /* host or device pointer */ + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCscal_v2(cublasHandle_t handle, + int n, + const cuComplex* alpha, /* host or device 
pointer */ + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsscal_v2(cublasHandle_t handle, + int n, + const float* alpha, /* host or device pointer */ + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZscal_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + cuDoubleComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdscal_v2(cublasHandle_t handle, + int n, + const double* alpha, /* host or device pointer */ + cuDoubleComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAxpyEx(cublasHandle_t handle, + int n, + const void* alpha, /* host or device pointer */ + cudaDataType alphaType, + const void* x, + cudaDataType xType, + int incx, + void* y, + cudaDataType yType, + int incy, + cudaDataType executiontype); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSaxpy_v2(cublasHandle_t handle, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + float* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDaxpy_v2(cublasHandle_t handle, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + double* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCaxpy_v2(cublasHandle_t handle, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZaxpy_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + cuDoubleComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCopyEx( + cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, void* y, cudaDataType yType, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasScopy_v2(cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy); + +CUBLASAPI 
cublasStatus_t CUBLASWINAPI +cublasDcopy_v2(cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasCcopy_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, cuComplex* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasZcopy_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasSswap_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasDswap_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasCswap_v2(cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasZswap_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSwapEx( + cublasHandle_t handle, int n, void* x, cudaDataType xType, int incx, void* y, cudaDataType yType, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasIsamax_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasIdamax_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasIcamax_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIzamax_v2( + cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIamaxEx( + cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result /* host or device pointer */ +); + +CUBLASAPI cublasStatus_t CUBLASWINAPI 
+cublasIsamin_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasIdamin_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasIcamin_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIzamin_v2( + cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIaminEx( + cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result /* host or device pointer */ +); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAsumEx(cublasHandle_t handle, + int n, + const void* x, + cudaDataType xType, + int incx, + void* result, + cudaDataType resultType, /* host or device pointer */ + cudaDataType executiontype); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasSasum_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasDasum_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasScasum_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDzasum_v2( + cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrot_v2(cublasHandle_t handle, + int n, + float* x, + int incx, + float* y, + int incy, + const float* c, /* host or device pointer */ + const float* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrot_v2(cublasHandle_t handle, + int n, + 
double* x, + int incx, + double* y, + int incy, + const double* c, /* host or device pointer */ + const double* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrot_v2(cublasHandle_t handle, + int n, + cuComplex* x, + int incx, + cuComplex* y, + int incy, + const float* c, /* host or device pointer */ + const cuComplex* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsrot_v2(cublasHandle_t handle, + int n, + cuComplex* x, + int incx, + cuComplex* y, + int incy, + const float* c, /* host or device pointer */ + const float* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrot_v2(cublasHandle_t handle, + int n, + cuDoubleComplex* x, + int incx, + cuDoubleComplex* y, + int incy, + const double* c, /* host or device pointer */ + const cuDoubleComplex* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdrot_v2(cublasHandle_t handle, + int n, + cuDoubleComplex* x, + int incx, + cuDoubleComplex* y, + int incy, + const double* c, /* host or device pointer */ + const double* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotEx(cublasHandle_t handle, + int n, + void* x, + cudaDataType xType, + int incx, + void* y, + cudaDataType yType, + int incy, + const void* c, /* host or device pointer */ + const void* s, + cudaDataType csType, + cudaDataType executiontype); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotg_v2(cublasHandle_t handle, + float* a, /* host or device pointer */ + float* b, /* host or device pointer */ + float* c, /* host or device pointer */ + float* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotg_v2(cublasHandle_t handle, + double* a, /* host or device pointer */ + double* b, /* host or device pointer */ + double* c, /* host or device pointer */ + double* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrotg_v2(cublasHandle_t 
handle, + cuComplex* a, /* host or device pointer */ + cuComplex* b, /* host or device pointer */ + float* c, /* host or device pointer */ + cuComplex* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrotg_v2(cublasHandle_t handle, + cuDoubleComplex* a, /* host or device pointer */ + cuDoubleComplex* b, /* host or device pointer */ + double* c, /* host or device pointer */ + cuDoubleComplex* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotgEx(cublasHandle_t handle, + void* a, /* host or device pointer */ + void* b, /* host or device pointer */ + cudaDataType abType, + void* c, /* host or device pointer */ + void* s, /* host or device pointer */ + cudaDataType csType, + cudaDataType executiontype); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, + int n, + float* x, + int incx, + float* y, + int incy, + const float* param); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, + int n, + double* x, + int incx, + double* y, + int incy, + const double* param); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotmEx(cublasHandle_t handle, + int n, + void* x, + cudaDataType xType, + int incx, + void* y, + cudaDataType yType, + int incy, + const void* param, /* host or device pointer */ + cudaDataType paramType, + cudaDataType executiontype); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotmg_v2(cublasHandle_t handle, + float* d1, /* host or device pointer */ + float* d2, /* host or device pointer */ + float* x1, /* host or device pointer */ + const float* y1, /* host or device pointer */ + float* param); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotmg_v2(cublasHandle_t handle, + double* d1, /* host or device pointer */ + double* d2, /* host or device pointer */ + double* x1, /* host or device pointer */ + const double* y1, /* host or device pointer */ + 
double* param); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotmgEx(cublasHandle_t handle, + void* d1, /* host or device pointer */ + cudaDataType d1Type, + void* d2, /* host or device pointer */ + cudaDataType d2Type, + void* x1, /* host or device pointer */ + cudaDataType x1Type, + const void* y1, /* host or device pointer */ + cudaDataType y1Type, + void* param, /* host or device pointer */ + cudaDataType paramType, + cudaDataType executiontype); +/* --------------- CUBLAS BLAS2 functions ---------------- */ + +/* GEMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* x, + int incx, + const float* beta, /* host or device pointer */ + float* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* x, + int incx, + const double* beta, /* host or device pointer */ + double* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy); +/* GBMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgbmv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + 
int n, + int kl, + int ku, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* x, + int incx, + const float* beta, /* host or device pointer */ + float* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgbmv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* x, + int incx, + const double* beta, /* host or device pointer */ + double* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgbmv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgbmv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy); + +/* TRMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float* A, + int lda, + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double* A, + int lda, + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex* A, + int lda, + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI 
cublasZtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex* A, + int lda, + cuDoubleComplex* x, + int incx); + +/* TBMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const float* A, + int lda, + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const double* A, + int lda, + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuComplex* A, + int lda, + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuDoubleComplex* A, + int lda, + cuDoubleComplex* x, + int incx); + +/* TPMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float* AP, + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double* AP, + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex* AP, + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex* AP, + 
cuDoubleComplex* x, + int incx); + +/* TRSV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float* A, + int lda, + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double* A, + int lda, + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex* A, + int lda, + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex* A, + int lda, + cuDoubleComplex* x, + int incx); + +/* TPSV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float* AP, + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double* AP, + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex* AP, + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex* AP, + cuDoubleComplex* x, + int incx); +/* TBSV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + 
int n, + int k, + const float* A, + int lda, + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const double* A, + int lda, + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuComplex* A, + int lda, + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuDoubleComplex* A, + int lda, + cuDoubleComplex* x, + int incx); + +/* SYMV/HEMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* x, + int incx, + const float* beta, /* host or device pointer */ + float* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* x, + int incx, + const double* beta, /* host or device pointer */ + double* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* 
beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy); + +/* SBMV/HBMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* x, + int incx, + const float* beta, /* host or device pointer */ + float* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* x, + int incx, + const double* beta, /* host or device pointer */ + double* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* beta, /* host 
or device pointer */ + cuDoubleComplex* y, + int incy); + +/* SPMV/HPMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const float* AP, + const float* x, + int incx, + const float* beta, /* host or device pointer */ + float* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const double* AP, + const double* x, + int incx, + const double* beta, /* host or device pointer */ + double* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* AP, + const cuComplex* x, + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* AP, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy); + +/* GER */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSger_v2(cublasHandle_t handle, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + const float* y, + int incy, + float* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDger_v2(cublasHandle_t handle, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + const double* y, + int incy, + double* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeru_v2(cublasHandle_t handle, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + 
cuComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgerc_v2(cublasHandle_t handle, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeru_v2(cublasHandle_t handle, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgerc_v2(cublasHandle_t handle, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* A, + int lda); + +/* SYR/HER */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + float* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + double* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + cuComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + cuDoubleComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + cuComplex* A, + int lda); + +CUBLASAPI cublasStatus_t 
CUBLASWINAPI cublasZher_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + cuDoubleComplex* A, + int lda); + +/* SPR/HPR */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + float* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + double* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + cuComplex* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + cuDoubleComplex* AP); + +/* SYR2/HER2 */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + const float* y, + int incy, + float* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + const double* y, + int incy, + double* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const 
cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* A, + int lda); + +/* SPR2/HPR2 */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + const float* y, + int incy, + float* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + const double* y, + int incy, + double* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* AP); +/* BATCH GEMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* const Aarray[], + int lda, + const float* 
const xarray[], + int incx, + const float* beta, /* host or device pointer */ + float* const yarray[], + int incy, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* const Aarray[], + int lda, + const double* const xarray[], + int incx, + const double* beta, /* host or device pointer */ + double* const yarray[], + int incy, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* const Aarray[], + int lda, + const cuComplex* const xarray[], + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* const yarray[], + int incy, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* const Aarray[], + int lda, + const cuDoubleComplex* const xarray[], + int incx, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* const yarray[], + int incy, + int batchCount); + +#if defined(__cplusplus) +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __half* const Aarray[], + int lda, + const __half* const xarray[], + int incx, + const float* beta, /* host or device pointer */ + __half* const yarray[], + int incy, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __half* const Aarray[], + int lda, + const __half* const xarray[], + int incx, + const 
float* beta, /* host or device pointer */ + float* const yarray[], + int incy, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __nv_bfloat16* const Aarray[], + int lda, + const __nv_bfloat16* const xarray[], + int incx, + const float* beta, /* host or device pointer */ + __nv_bfloat16* const yarray[], + int incy, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __nv_bfloat16* const Aarray[], + int lda, + const __nv_bfloat16* const xarray[], + int incx, + const float* beta, /* host or device pointer */ + float* const yarray[], + int incy, + int batchCount); +#endif + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + long long int strideA, /* purposely signed */ + const float* x, + int incx, + long long int stridex, + const float* beta, /* host or device pointer */ + float* y, + int incy, + long long int stridey, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + long long int strideA, /* purposely signed */ + const double* x, + int incx, + long long int stridex, + const double* beta, /* host or device pointer */ + double* y, + int incy, + long long int stridey, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + long 
long int strideA, /* purposely signed */ + const cuComplex* x, + int incx, + long long int stridex, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy, + long long int stridey, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasZgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + long long int strideA, /* purposely signed */ + const cuDoubleComplex* x, + int incx, + long long int stridex, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy, + long long int stridey, + int batchCount); + +#if defined(__cplusplus) +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __half* A, + int lda, + long long int strideA, /* purposely signed */ + const __half* x, + int incx, + long long int stridex, + const float* beta, /* host or device pointer */ + __half* y, + int incy, + long long int stridey, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __half* A, + int lda, + long long int strideA, /* purposely signed */ + const __half* x, + int incx, + long long int stridex, + const float* beta, /* host or device pointer */ + float* y, + int incy, + long long int stridey, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __nv_bfloat16* A, + int lda, + long long int strideA, /* purposely signed */ + const __nv_bfloat16* x, + int incx, + long long int stridex, + const float* beta, /* host or device 
pointer */ + __nv_bfloat16* y, + int incy, + long long int stridey, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __nv_bfloat16* A, + int lda, + long long int strideA, /* purposely signed */ + const __nv_bfloat16* x, + int incx, + long long int stridex, + const float* beta, /* host or device pointer */ + float* y, + int incy, + long long int stridey, + int batchCount); +#endif +/* ---------------- CUBLAS BLAS3 functions ---------------- */ + +/* GEMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, /* host or device pointer */ + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, /* host or device pointer */ + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3m(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const cuComplex* beta, /* host 
or device pointer */ + cuComplex* C, + int ldc); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, + const void* A, + cudaDataType Atype, + int lda, + const void* B, + cudaDataType Btype, + int ldb, + const cuComplex* beta, + void* C, + cudaDataType Ctype, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm3m(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); + +#if defined(__cplusplus) +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const __half* alpha, /* host or device pointer */ + const __half* A, + int lda, + const __half* B, + int ldb, + const __half* beta, /* host or device pointer */ + __half* C, + int ldc); +#endif +/* IO in FP16/FP32, computation in float */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + const void* B, + cudaDataType Btype, + int ldb, + const float* beta, /* host or device pointer */ + void* C, + 
cudaDataType Ctype, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + const void* B, + cudaDataType Btype, + int ldb, + const void* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc, + cublasComputeType_t computeType, + cublasGemmAlgo_t algo); + +/* IO in Int8 complex/cuComplex, computation in cuComplex */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, + const void* A, + cudaDataType Atype, + int lda, + const void* B, + cudaDataType Btype, + int ldb, + const cuComplex* beta, + void* C, + cudaDataType Ctype, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasUint8gemmBias(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + cublasOperation_t transc, + int m, + int n, + int k, + const unsigned char* A, + int A_bias, + int lda, + const unsigned char* B, + int B_bias, + int ldb, + unsigned char* C, + int C_bias, + int ldc, + int C_mult, + int C_shift); + +/* SYRK */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrk_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* beta, /* host or device pointer */ + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrk_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* beta, /* host or device pointer */ + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk_v2(cublasHandle_t handle, + 
cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrk_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); +/* IO in Int8 complex/cuComplex, computation in cuComplex */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkEx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + const cuComplex* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc); + +/* IO in Int8 complex/cuComplex, computation in cuComplex, Gaussian math */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, + const void* A, + cudaDataType Atype, + int lda, + const cuComplex* beta, + void* C, + cudaDataType Ctype, + int ldc); + +/* HERK */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const float* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherk_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const double* beta, /* host or device pointer */ + cuDoubleComplex* 
C, + int ldc); + +/* IO in Int8 complex/cuComplex, computation in cuComplex */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkEx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + const float* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc); + +/* IO in Int8 complex/cuComplex, computation in cuComplex, Gaussian math */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk3mEx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, + const void* A, + cudaDataType Atype, + int lda, + const float* beta, + void* C, + cudaDataType Ctype, + int ldc); + +/* SYR2K */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, /* host or device pointer */ + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, /* host or device pointer */ + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex* 
alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); +/* HER2K */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2k_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const float* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2k_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const double* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); +/* SYRKX : eXtended SYRK*/ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrkx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, /* host or device pointer */ + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrkx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, /* host or device pointer */ + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI 
cublasStatus_t CUBLASWINAPI cublasZsyrkx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); +/* HERKX : eXtended HERK */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const float* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherkx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const double* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); +/* SYMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, /* host or device pointer */ + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, /* host or device pointer */ + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const 
cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); + +/* HEMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); + +/* TRSM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + float* B, + int ldb); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + double* B, + int ldb); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(cublasHandle_t handle, + cublasSideMode_t side, + 
cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + cuComplex* B, + int ldb); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + cuDoubleComplex* B, + int ldb); + +/* TRMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* B, + int ldb, + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* B, + int ldb, + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + cuDoubleComplex* C, + int ldc); +/* BATCH GEMM */ +#if defined(__cplusplus) +CUBLASAPI 
cublasStatus_t CUBLASWINAPI cublasHgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const __half* alpha, /* host or device pointer */ + const __half* const Aarray[], + int lda, + const __half* const Barray[], + int ldb, + const __half* beta, /* host or device pointer */ + __half* const Carray[], + int ldc, + int batchCount); +#endif +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* const Aarray[], + int lda, + const float* const Barray[], + int ldb, + const float* beta, /* host or device pointer */ + float* const Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* const Aarray[], + int lda, + const double* const Barray[], + int ldb, + const double* beta, /* host or device pointer */ + double* const Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* const Aarray[], + int lda, + const cuComplex* const Barray[], + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* const Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* const Aarray[], + int lda, + const cuComplex* const Barray[], + int ldb, + const cuComplex* beta, /* host or 
device pointer */ + cuComplex* const Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* const Aarray[], + int lda, + const cuDoubleComplex* const Barray[], + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* const Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void* alpha, /* host or device pointer */ + const void* const Aarray[], + cudaDataType Atype, + int lda, + const void* const Barray[], + cudaDataType Btype, + int ldb, + const void* beta, /* host or device pointer */ + void* const Carray[], + cudaDataType Ctype, + int ldc, + int batchCount, + cublasComputeType_t computeType, + cublasGemmAlgo_t algo); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + long long int strideA, /* purposely signed */ + const void* B, + cudaDataType Btype, + int ldb, + long long int strideB, + const void* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc, + long long int strideC, + int batchCount, + cublasComputeType_t computeType, + cublasGemmAlgo_t algo); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + long long int strideA, /* purposely signed */ + const float* B, + int ldb, + long long 
int strideB, + const float* beta, /* host or device pointer */ + float* C, + int ldc, + long long int strideC, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + long long int strideA, /* purposely signed */ + const double* B, + int ldb, + long long int strideB, + const double* beta, /* host or device pointer */ + double* C, + int ldc, + long long int strideC, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + long long int strideA, /* purposely signed */ + const cuComplex* B, + int ldb, + long long int strideB, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc, + long long int strideC, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + long long int strideA, /* purposely signed */ + const cuComplex* B, + int ldb, + long long int strideB, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc, + long long int strideC, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasZgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + long long int strideA, /* purposely signed */ + const cuDoubleComplex* B, + int ldb, + long long int strideB, + const 
cuDoubleComplex* beta, /* host or device poi */ + cuDoubleComplex* C, + int ldc, + long long int strideC, + int batchCount); + +#if defined(__cplusplus) +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const __half* alpha, /* host or device pointer */ + const __half* A, + int lda, + long long int strideA, /* purposely signed */ + const __half* B, + int ldb, + long long int strideB, + const __half* beta, /* host or device pointer */ + __half* C, + int ldc, + long long int strideC, + int batchCount); +#endif +/* ---------------- CUBLAS BLAS-like extension ---------------- */ +/* GEAM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* beta, /* host or device pointer */ + const float* B, + int ldb, + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* beta, /* host or device pointer */ + const double* B, + int ldb, + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* beta, /* host or device pointer */ + const cuComplex* B, + int ldb, + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const 
cuDoubleComplex* beta, /* host or device pointer */ + const cuDoubleComplex* B, + int ldb, + cuDoubleComplex* C, + int ldc); + +/* Batched LU - GETRF*/ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(cublasHandle_t handle, + int n, + float* const A[], /*Device pointer*/ + int lda, + int* P, /*Device Pointer*/ + int* info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(cublasHandle_t handle, + int n, + double* const A[], /*Device pointer*/ + int lda, + int* P, /*Device Pointer*/ + int* info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(cublasHandle_t handle, + int n, + cuComplex* const A[], /*Device pointer*/ + int lda, + int* P, /*Device Pointer*/ + int* info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(cublasHandle_t handle, + int n, + cuDoubleComplex* const A[], /*Device pointer*/ + int lda, + int* P, /*Device Pointer*/ + int* info, /*Device Pointer*/ + int batchSize); + +/* Batched inversion based on LU factorization from getrf */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetriBatched(cublasHandle_t handle, + int n, + const float* const A[], /*Device pointer*/ + int lda, + const int* P, /*Device pointer*/ + float* const C[], /*Device pointer*/ + int ldc, + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetriBatched(cublasHandle_t handle, + int n, + const double* const A[], /*Device pointer*/ + int lda, + const int* P, /*Device pointer*/ + double* const C[], /*Device pointer*/ + int ldc, + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetriBatched(cublasHandle_t handle, + int n, + const cuComplex* const A[], /*Device pointer*/ + int lda, + const int* P, /*Device pointer*/ + cuComplex* const C[], /*Device pointer*/ + int ldc, + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetriBatched(cublasHandle_t 
handle, + int n, + const cuDoubleComplex* const A[], /*Device pointer*/ + int lda, + const int* P, /*Device pointer*/ + cuDoubleComplex* const C[], /*Device pointer*/ + int ldc, + int* info, + int batchSize); + +/* Batched solver based on LU factorization from getrf */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const float* const Aarray[], + int lda, + const int* devIpiv, + float* const Barray[], + int ldb, + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const double* const Aarray[], + int lda, + const int* devIpiv, + double* const Barray[], + int ldb, + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const cuComplex* const Aarray[], + int lda, + const int* devIpiv, + cuComplex* const Barray[], + int ldb, + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const cuDoubleComplex* const Aarray[], + int lda, + const int* devIpiv, + cuDoubleComplex* const Barray[], + int ldb, + int* info, + int batchSize); + +/* TRSM - Batched Triangular Solver */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsmBatched(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float* alpha, /*Host or Device Pointer*/ + const float* const A[], + int lda, + float* const B[], + int ldb, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsmBatched(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double* alpha, /*Host or Device Pointer*/ + const double* 
const A[], + int lda, + double* const B[], + int ldb, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsmBatched(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuComplex* alpha, /*Host or Device Pointer*/ + const cuComplex* const A[], + int lda, + cuComplex* const B[], + int ldb, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsmBatched(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuDoubleComplex* alpha, /*Host or Device Pointer*/ + const cuDoubleComplex* const A[], + int lda, + cuDoubleComplex* const B[], + int ldb, + int batchCount); + +/* Batched - MATINV*/ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(cublasHandle_t handle, + int n, + const float* const A[], /*Device pointer*/ + int lda, + float* const Ainv[], /*Device pointer*/ + int lda_inv, + int* info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(cublasHandle_t handle, + int n, + const double* const A[], /*Device pointer*/ + int lda, + double* const Ainv[], /*Device pointer*/ + int lda_inv, + int* info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(cublasHandle_t handle, + int n, + const cuComplex* const A[], /*Device pointer*/ + int lda, + cuComplex* const Ainv[], /*Device pointer*/ + int lda_inv, + int* info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZmatinvBatched(cublasHandle_t handle, + int n, + const cuDoubleComplex* const A[], /*Device pointer*/ + int lda, + cuDoubleComplex* const Ainv[], /*Device pointer*/ + int lda_inv, + int* info, /*Device Pointer*/ + int batchSize); + +/* Batch QR Factorization */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeqrfBatched(cublasHandle_t handle, + int m, + 
int n, + float* const Aarray[], /*Device pointer*/ + int lda, + float* const TauArray[], /*Device pointer*/ + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeqrfBatched(cublasHandle_t handle, + int m, + int n, + double* const Aarray[], /*Device pointer*/ + int lda, + double* const TauArray[], /*Device pointer*/ + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeqrfBatched(cublasHandle_t handle, + int m, + int n, + cuComplex* const Aarray[], /*Device pointer*/ + int lda, + cuComplex* const TauArray[], /*Device pointer*/ + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeqrfBatched(cublasHandle_t handle, + int m, + int n, + cuDoubleComplex* const Aarray[], /*Device pointer*/ + int lda, + cuDoubleComplex* const TauArray[], /*Device pointer*/ + int* info, + int batchSize); +/* Least Square Min only m >= n and Non-transpose supported */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + float* const Aarray[], /*Device pointer*/ + int lda, + float* const Carray[], /*Device pointer*/ + int ldc, + int* info, + int* devInfoArray, /*Device pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + double* const Aarray[], /*Device pointer*/ + int lda, + double* const Carray[], /*Device pointer*/ + int ldc, + int* info, + int* devInfoArray, /*Device pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + cuComplex* const Aarray[], /*Device pointer*/ + int lda, + cuComplex* const Carray[], /*Device pointer*/ + int ldc, + int* info, + int* devInfoArray, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, + 
int m, + int n, + int nrhs, + cuDoubleComplex* const Aarray[], /*Device pointer*/ + int lda, + cuDoubleComplex* const Carray[], /*Device pointer*/ + int ldc, + int* info, + int* devInfoArray, + int batchSize); +/* DGMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const float* A, + int lda, + const float* x, + int incx, + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const double* A, + int lda, + const double* x, + int incx, + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + cuDoubleComplex* C, + int ldc); + +/* TPTTR : Triangular Pack format to Triangular format */ +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* AP, float* A, int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* AP, double* A, int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* AP, cuComplex* A, int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpttr( + cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* AP, cuDoubleComplex* A, int lda); +/* TRTTP : Triangular format to Triangular Pack format */ +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* A, int lda, float* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI 
+cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* A, int lda, double* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* A, int lda, cuComplex* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrttp( + cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* AP); + +#if defined(__cplusplus) +} + +static inline cublasStatus_t cublasMigrateComputeType(cublasHandle_t handle, + cudaDataType_t dataType, + cublasComputeType_t* computeType) { + cublasMath_t mathMode = CUBLAS_DEFAULT_MATH; + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + + status = cublasGetMathMode(handle, &mathMode); + if (status != CUBLAS_STATUS_SUCCESS) { + return status; + } + + bool isPedantic = ((mathMode & 0xf) == CUBLAS_PEDANTIC_MATH); + + switch (dataType) { + case CUDA_R_32F: + case CUDA_C_32F: + *computeType = isPedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F; + return CUBLAS_STATUS_SUCCESS; + case CUDA_R_64F: + case CUDA_C_64F: + *computeType = isPedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F; + return CUBLAS_STATUS_SUCCESS; + case CUDA_R_16F: + *computeType = isPedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F; + return CUBLAS_STATUS_SUCCESS; + case CUDA_R_32I: + *computeType = isPedantic ? 
CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I; + return CUBLAS_STATUS_SUCCESS; + default: + return CUBLAS_STATUS_NOT_SUPPORTED; + } +} +/* wrappers to accept old code with cudaDataType computeType when referenced from c++ code */ +static inline cublasStatus_t cublasGemmEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + const void* B, + cudaDataType Btype, + int ldb, + const void* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc, + cudaDataType computeType, + cublasGemmAlgo_t algo) { + cublasComputeType_t migratedComputeType = CUBLAS_COMPUTE_32F; + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + status = cublasMigrateComputeType(handle, computeType, &migratedComputeType); + if (status != CUBLAS_STATUS_SUCCESS) { + return status; + } + + return cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + migratedComputeType, + algo); +} + +static inline cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void* alpha, /* host or device pointer */ + const void* const Aarray[], + cudaDataType Atype, + int lda, + const void* const Barray[], + cudaDataType Btype, + int ldb, + const void* beta, /* host or device pointer */ + void* const Carray[], + cudaDataType Ctype, + int ldc, + int batchCount, + cudaDataType computeType, + cublasGemmAlgo_t algo) { + cublasComputeType_t migratedComputeType; + cublasStatus_t status; + status = cublasMigrateComputeType(handle, computeType, &migratedComputeType); + if (status != CUBLAS_STATUS_SUCCESS) { + return status; + } + + return cublasGemmBatchedEx(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + Atype, + lda, + Barray, + Btype, + ldb, + beta, + Carray, + 
Ctype, + ldc, + batchCount, + migratedComputeType, + algo); +} + +static inline cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + long long int strideA, /* purposely signed */ + const void* B, + cudaDataType Btype, + int ldb, + long long int strideB, + const void* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc, + long long int strideC, + int batchCount, + cudaDataType computeType, + cublasGemmAlgo_t algo) { + cublasComputeType_t migratedComputeType; + cublasStatus_t status; + status = cublasMigrateComputeType(handle, computeType, &migratedComputeType); + if (status != CUBLAS_STATUS_SUCCESS) { + return status; + } + + return cublasGemmStridedBatchedEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + strideA, + B, + Btype, + ldb, + strideB, + beta, + C, + Ctype, + ldc, + strideC, + batchCount, + migratedComputeType, + algo); +} +#endif /* __cplusplus */ + +#endif /* !defined(CUBLAS_API_H_) */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_v2.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_v2.h new file mode 100644 index 0000000000000000000000000000000000000000..34d6b77ccfb485d39e5bb261a3ebb1ad592ea281 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_v2.h @@ -0,0 +1,273 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. 
Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * This is the public header file for the new CUBLAS library API; it maps the generic + * CUBLAS function names to the actual _v2 implementations. + */ + +#if !defined(CUBLAS_V2_H_) +#define CUBLAS_V2_H_ + +#undef CUBLASAPI +#ifdef __CUDACC__ +#define CUBLASAPI __host__ __device__ +#else +#define CUBLASAPI +#endif + +#include "cublas_api.h" + +#define cublasCreate cublasCreate_v2 +#define cublasDestroy cublasDestroy_v2 +#define cublasGetVersion cublasGetVersion_v2 +#define cublasSetWorkspace cublasSetWorkspace_v2 +#define cublasSetStream cublasSetStream_v2 +#define cublasGetStream cublasGetStream_v2 +#define cublasGetPointerMode cublasGetPointerMode_v2 +#define cublasSetPointerMode cublasSetPointerMode_v2 + +/* Blas1 Routines */ + +#define cublasSnrm2 cublasSnrm2_v2 +#define cublasDnrm2 cublasDnrm2_v2 +#define cublasScnrm2 cublasScnrm2_v2 +#define cublasDznrm2 cublasDznrm2_v2 + +#define cublasSdot cublasSdot_v2 +#define cublasDdot cublasDdot_v2 +#define cublasCdotu cublasCdotu_v2 +#define cublasCdotc cublasCdotc_v2 +#define cublasZdotu cublasZdotu_v2 +#define cublasZdotc cublasZdotc_v2 + +#define cublasSscal cublasSscal_v2 +#define cublasDscal cublasDscal_v2 +#define cublasCscal cublasCscal_v2 +#define cublasCsscal cublasCsscal_v2 +#define cublasZscal cublasZscal_v2 +#define cublasZdscal cublasZdscal_v2 + +#define cublasSaxpy cublasSaxpy_v2 +#define cublasDaxpy cublasDaxpy_v2 +#define cublasCaxpy cublasCaxpy_v2 +#define cublasZaxpy cublasZaxpy_v2 + +#define cublasScopy cublasScopy_v2 +#define cublasDcopy cublasDcopy_v2 +#define cublasCcopy cublasCcopy_v2 +#define cublasZcopy cublasZcopy_v2 + +#define cublasSswap cublasSswap_v2 
+#define cublasDswap cublasDswap_v2 +#define cublasCswap cublasCswap_v2 +#define cublasZswap cublasZswap_v2 + +#define cublasIsamax cublasIsamax_v2 +#define cublasIdamax cublasIdamax_v2 +#define cublasIcamax cublasIcamax_v2 +#define cublasIzamax cublasIzamax_v2 + +#define cublasIsamin cublasIsamin_v2 +#define cublasIdamin cublasIdamin_v2 +#define cublasIcamin cublasIcamin_v2 +#define cublasIzamin cublasIzamin_v2 + +#define cublasSasum cublasSasum_v2 +#define cublasDasum cublasDasum_v2 +#define cublasScasum cublasScasum_v2 +#define cublasDzasum cublasDzasum_v2 + +#define cublasSrot cublasSrot_v2 +#define cublasDrot cublasDrot_v2 +#define cublasCrot cublasCrot_v2 +#define cublasCsrot cublasCsrot_v2 +#define cublasZrot cublasZrot_v2 +#define cublasZdrot cublasZdrot_v2 + +#define cublasSrotg cublasSrotg_v2 +#define cublasDrotg cublasDrotg_v2 +#define cublasCrotg cublasCrotg_v2 +#define cublasZrotg cublasZrotg_v2 + +#define cublasSrotm cublasSrotm_v2 +#define cublasDrotm cublasDrotm_v2 + +#define cublasSrotmg cublasSrotmg_v2 +#define cublasDrotmg cublasDrotmg_v2 + +/* Blas2 Routines */ + +#define cublasSgemv cublasSgemv_v2 +#define cublasDgemv cublasDgemv_v2 +#define cublasCgemv cublasCgemv_v2 +#define cublasZgemv cublasZgemv_v2 + +#define cublasSgbmv cublasSgbmv_v2 +#define cublasDgbmv cublasDgbmv_v2 +#define cublasCgbmv cublasCgbmv_v2 +#define cublasZgbmv cublasZgbmv_v2 + +#define cublasStrmv cublasStrmv_v2 +#define cublasDtrmv cublasDtrmv_v2 +#define cublasCtrmv cublasCtrmv_v2 +#define cublasZtrmv cublasZtrmv_v2 + +#define cublasStbmv cublasStbmv_v2 +#define cublasDtbmv cublasDtbmv_v2 +#define cublasCtbmv cublasCtbmv_v2 +#define cublasZtbmv cublasZtbmv_v2 + +#define cublasStpmv cublasStpmv_v2 +#define cublasDtpmv cublasDtpmv_v2 +#define cublasCtpmv cublasCtpmv_v2 +#define cublasZtpmv cublasZtpmv_v2 + +#define cublasStrsv cublasStrsv_v2 +#define cublasDtrsv cublasDtrsv_v2 +#define cublasCtrsv cublasCtrsv_v2 +#define cublasZtrsv cublasZtrsv_v2 + +#define cublasStpsv 
cublasStpsv_v2 +#define cublasDtpsv cublasDtpsv_v2 +#define cublasCtpsv cublasCtpsv_v2 +#define cublasZtpsv cublasZtpsv_v2 + +#define cublasStbsv cublasStbsv_v2 +#define cublasDtbsv cublasDtbsv_v2 +#define cublasCtbsv cublasCtbsv_v2 +#define cublasZtbsv cublasZtbsv_v2 + +#define cublasSsymv cublasSsymv_v2 +#define cublasDsymv cublasDsymv_v2 +#define cublasCsymv cublasCsymv_v2 +#define cublasZsymv cublasZsymv_v2 +#define cublasChemv cublasChemv_v2 +#define cublasZhemv cublasZhemv_v2 + +#define cublasSsbmv cublasSsbmv_v2 +#define cublasDsbmv cublasDsbmv_v2 +#define cublasChbmv cublasChbmv_v2 +#define cublasZhbmv cublasZhbmv_v2 + +#define cublasSspmv cublasSspmv_v2 +#define cublasDspmv cublasDspmv_v2 +#define cublasChpmv cublasChpmv_v2 +#define cublasZhpmv cublasZhpmv_v2 + +#define cublasSger cublasSger_v2 +#define cublasDger cublasDger_v2 +#define cublasCgeru cublasCgeru_v2 +#define cublasCgerc cublasCgerc_v2 +#define cublasZgeru cublasZgeru_v2 +#define cublasZgerc cublasZgerc_v2 + +#define cublasSsyr cublasSsyr_v2 +#define cublasDsyr cublasDsyr_v2 +#define cublasCsyr cublasCsyr_v2 +#define cublasZsyr cublasZsyr_v2 +#define cublasCher cublasCher_v2 +#define cublasZher cublasZher_v2 + +#define cublasSspr cublasSspr_v2 +#define cublasDspr cublasDspr_v2 +#define cublasChpr cublasChpr_v2 +#define cublasZhpr cublasZhpr_v2 + +#define cublasSsyr2 cublasSsyr2_v2 +#define cublasDsyr2 cublasDsyr2_v2 +#define cublasCsyr2 cublasCsyr2_v2 +#define cublasZsyr2 cublasZsyr2_v2 +#define cublasCher2 cublasCher2_v2 +#define cublasZher2 cublasZher2_v2 + +#define cublasSspr2 cublasSspr2_v2 +#define cublasDspr2 cublasDspr2_v2 +#define cublasChpr2 cublasChpr2_v2 +#define cublasZhpr2 cublasZhpr2_v2 + +/* Blas3 Routines */ + +#define cublasSgemm cublasSgemm_v2 +#define cublasDgemm cublasDgemm_v2 +#define cublasCgemm cublasCgemm_v2 +#define cublasZgemm cublasZgemm_v2 + +#define cublasSsyrk cublasSsyrk_v2 +#define cublasDsyrk cublasDsyrk_v2 +#define cublasCsyrk cublasCsyrk_v2 +#define 
cublasZsyrk cublasZsyrk_v2 +#define cublasCherk cublasCherk_v2 +#define cublasZherk cublasZherk_v2 + +#define cublasSsyr2k cublasSsyr2k_v2 +#define cublasDsyr2k cublasDsyr2k_v2 +#define cublasCsyr2k cublasCsyr2k_v2 +#define cublasZsyr2k cublasZsyr2k_v2 +#define cublasCher2k cublasCher2k_v2 +#define cublasZher2k cublasZher2k_v2 + +#define cublasSsymm cublasSsymm_v2 +#define cublasDsymm cublasDsymm_v2 +#define cublasCsymm cublasCsymm_v2 +#define cublasZsymm cublasZsymm_v2 +#define cublasChemm cublasChemm_v2 +#define cublasZhemm cublasZhemm_v2 + +#define cublasStrsm cublasStrsm_v2 +#define cublasDtrsm cublasDtrsm_v2 +#define cublasCtrsm cublasCtrsm_v2 +#define cublasZtrsm cublasZtrsm_v2 + +#define cublasStrmm cublasStrmm_v2 +#define cublasDtrmm cublasDtrmm_v2 +#define cublasCtrmm cublasCtrmm_v2 +#define cublasZtrmm cublasZtrmm_v2 + +#endif /* !defined(CUBLAS_V2_H_) */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling.h new file mode 100644 index 0000000000000000000000000000000000000000..ed965bb5d663bea7ef20593fb4de5cff86136cee --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling.h @@ -0,0 +1,923 @@ +/* + * 
Copyright 2020-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(_CUPTI_PCSAMPLING_H_) +#define _CUPTI_PCSAMPLING_H_ + +#include +#include +#include +#include "cupti_result.h" + +#ifndef CUPTIAPI +#ifdef _WIN32 +#define CUPTIAPI __stdcall +#else +#define CUPTIAPI +#endif +#endif + +#define ACTIVITY_RECORD_ALIGNMENT 8 +#if defined(_WIN32) // Windows 32- and 64-bit +#define START_PACKED_ALIGNMENT __pragma(pack(push,1)) // exact fit - no padding +#define PACKED_ALIGNMENT __declspec(align(ACTIVITY_RECORD_ALIGNMENT)) +#define END_PACKED_ALIGNMENT __pragma(pack(pop)) +#elif defined(__GNUC__) // GCC +#define START_PACKED_ALIGNMENT +#define PACKED_ALIGNMENT __attribute__ ((__packed__)) __attribute__ ((aligned (ACTIVITY_RECORD_ALIGNMENT))) +#define END_PACKED_ALIGNMENT +#else // all other compilers +#define START_PACKED_ALIGNMENT +#define PACKED_ALIGNMENT +#define END_PACKED_ALIGNMENT +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility push(default) +#endif + +/** + * \defgroup CUPTI_PCSAMPLING_API CUPTI PC Sampling API + * Functions, types, and enums that implement the CUPTI PC Sampling API. 
+ * @{ + */ + +#ifndef CUPTI_PCSAMPLING_STRUCT_SIZE +#define CUPTI_PCSAMPLING_STRUCT_SIZE(type_, lastfield_) (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_)) +#endif + +#ifndef CUPTI_STALL_REASON_STRING_SIZE +#define CUPTI_STALL_REASON_STRING_SIZE 128 +#endif + +/** + * \brief PC Sampling collection mode + */ +typedef enum +{ + /** + * INVALID Value + */ + CUPTI_PC_SAMPLING_COLLECTION_MODE_INVALID = 0, + /** + * Continuous mode. Kernels are not serialized in this mode. + */ + CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS = 1, + /** + * Serialized mode. Kernels are serialized in this mode. + */ + CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED = 2, +} CUpti_PCSamplingCollectionMode; + +/** + * \brief PC Sampling stall reasons + */ +typedef struct PACKED_ALIGNMENT +{ + /** + * [r] Collected stall reason index + */ + uint32_t pcSamplingStallReasonIndex; + /** + * [r] Number of times the PC was sampled with the stallReason. + */ + uint32_t samples; +} CUpti_PCSamplingStallReason; + +/** + * \brief PC Sampling data + */ +typedef struct PACKED_ALIGNMENT +{ + /** + * [w] Size of the data structure. + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * [r] Unique cubin id + */ + uint64_t cubinCrc; + /** + * [r] PC offset + */ + uint64_t pcOffset; + /** + * The function's unique symbol index in the module. + */ + uint32_t functionIndex; + /** + * Padding + */ + uint32_t pad; + /** + * [r] The function name. This name string might be shared across all the records + * including records from activity APIs representing the same function, and so it should not be + * modified or freed until post processing of all the records is done. Once done, it is user’s responsibility to + * free the memory using free() function. 
+ */ + char* functionName; + /** + * [r] Collected stall reason count + */ + size_t stallReasonCount; + /** + * [r] Stall reason id + * Total samples + */ + CUpti_PCSamplingStallReason *stallReason; +} CUpti_PCSamplingPCData; + +/** + * \brief PC Sampling output data format + */ +typedef enum +{ + CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_INVALID = 0, + /** + * HW buffer data will be parsed during collection of data + */ + CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED = 1, +} CUpti_PCSamplingOutputDataFormat; + +/** + * \brief Collected PC Sampling data + * + */ +typedef struct PACKED_ALIGNMENT +{ + /** + * [w] Size of the data structure. + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * [w] Number of PCs to be collected + */ + size_t collectNumPcs; + /** + * [r] Number of samples collected across all PCs. + * It includes samples for user modules, samples for non-user kernels and dropped samples. + * It includes counts for all non selected stall reasons. + * CUPTI does not provide PC records for non-user kernels. + * CUPTI does not provide PC records for instructions for which all selected stall reason metrics counts are zero. + */ + uint64_t totalSamples; + /** + * [r] Number of samples that were dropped by hardware due to backpressure/overflow. + */ + uint64_t droppedSamples; + /** + * [r] Number of PCs collected + */ + size_t totalNumPcs; + /** + * [r] Number of PCs available for collection + */ + size_t remainingNumPcs; + /** + * [r] Unique identifier for each range. + * Data collected across multiple ranges in multiple buffers can be identified using range id. 
+ */ + uint64_t rangeId; + /** + * [r] Profiled PC data + * This data struct should have enough memory to collect number of PCs mentioned in \brief collectNumPcs + */ + CUpti_PCSamplingPCData *pPcData; + /** + * [r] Number of samples collected across all non user kernels PCs. + * It includes samples for non-user kernels. + * It includes counts for all non selected stall reasons as well. + * CUPTI does not provide PC records for non-user kernels. + */ + uint64_t nonUsrKernelsTotalSamples; +} CUpti_PCSamplingData; + +/** + * \brief PC Sampling configuration attributes + * + * PC Sampling configuration attribute types. These attributes can be read + * using \ref cuptiPCSamplingGetConfigurationAttribute and can be written + * using \ref cuptiPCSamplingSetConfigurationAttribute. Attributes marked + * [r] can only be read using \ref cuptiPCSamplingGetConfigurationAttribute + * [w] can only be written using \ref cuptiPCSamplingSetConfigurationAttribute + * [rw] can be read using \ref cuptiPCSamplingGetConfigurationAttribute and + * written using \ref cuptiPCSamplingSetConfigurationAttribute + */ +typedef enum +{ + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_INVALID = 0, + /** + * [rw] Sampling period for PC Sampling. + * DEFAULT - CUPTI defined value based on number of SMs + * Valid values for the sampling + * periods are between 5 to 31 both inclusive. This will set the + * sampling period to (2^samplingPeriod) cycles. + * For e.g. for sampling period = 5 to 31, cycles = 32, 64, 128,..., 2^31 + * Value is a uint32_t + */ + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD = 1, + /** + * [w] Number of stall reasons to collect. + * DEFAULT - All stall reasons will be collected + * Value is a size_t + * [w] Stall reasons to collect + * DEFAULT - All stall reasons will be collected + * Input value should be a pointer pointing to array of stall reason indexes + * containing all the stall reason indexes to collect. 
+ */ + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON = 2, + /** + * [rw] Size of SW buffer for raw PC counter data downloaded from HW buffer + * DEFAULT - 1 MB, which can accommodate approximately 5500 PCs + * with all stall reasons + * Approximately it takes 16 Bytes (and some fixed size memory) + * to accommodate one PC with one stall reason + * For e.g. 1 PC with 1 stall reason = 32 Bytes + * 1 PC with 2 stall reason = 48 Bytes + * 1 PC with 4 stall reason = 96 Bytes + * Value is a size_t + */ + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE = 3, + /** + * [rw] Size of HW buffer in bytes + * DEFAULT - 512 MB + * If sampling period is too low, HW buffer can overflow + * and drop PC data + * Value is a size_t + */ + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE = 4, + /** + * [rw] PC Sampling collection mode + * DEFAULT - CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS + * Input value should be of type \ref CUpti_PCSamplingCollectionMode. + */ + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE = 5, + /** + * [rw] Control over PC Sampling data collection range + * Default - 0 + * 1 - Allows user to start and stop PC Sampling using APIs - + * \ref cuptiPCSamplingStart() - Start PC Sampling + * \ref cuptiPCSamplingStop() - Stop PC Sampling + * Value is a uint32_t + */ + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL = 6, + /** + * [w] Value for output data format + * Default - CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED + * Input value should be of type \ref CUpti_PCSamplingOutputDataFormat. + */ + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT = 7, + /** + * [w] Data buffer to hold collected PC Sampling data PARSED_DATA + * Default - none. 
+ * Buffer type is void * which can point to PARSED_DATA + * Refer \ref CUpti_PCSamplingData for buffer format for PARSED_DATA + */ + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER = 8, + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_FORCE_INT = 0x7fffffff, +} CUpti_PCSamplingConfigurationAttributeType; + +/** + * \brief PC sampling configuration information structure + * + * This structure provides \ref CUpti_PCSamplingConfigurationAttributeType which can be configured + * or queried for PC sampling configuration + */ +typedef struct +{ + /** + * Refer \ref CUpti_PCSamplingConfigurationAttributeType for all supported attribute types + */ + CUpti_PCSamplingConfigurationAttributeType attributeType; + /* + * Configure or query status for \p attributeType + * CUPTI_SUCCESS for valid \p attributeType and \p attributeData + * CUPTI_ERROR_INVALID_OPERATION if \p attributeData is not valid + * CUPTI_ERROR_INVALID_PARAMETER if \p attributeType is not valid + */ + CUptiResult attributeStatus; + union + { + /** + * Invalid Value + */ + struct + { + uint64_t data[3]; + } invalidData; + /** + * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD + */ + struct + { + uint32_t samplingPeriod; + } samplingPeriodData; + /** + * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON + */ + struct + { + size_t stallReasonCount; + uint32_t *pStallReasonIndex; + } stallReasonData; + /** + * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE + */ + struct + { + size_t scratchBufferSize; + } scratchBufferSizeData; + /** + * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE + */ + struct + { + size_t hardwareBufferSize; + } hardwareBufferSizeData; + /** + * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE + */ + struct + { + CUpti_PCSamplingCollectionMode collectionMode; + } collectionModeData; + /** + * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL + 
*/ + struct + { + uint32_t enableStartStopControl; + } enableStartStopControlData; + /** + * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT + */ + struct + { + CUpti_PCSamplingOutputDataFormat outputDataFormat; + } outputDataFormatData; + /** + * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER + */ + struct + { + void *samplingDataBuffer; + } samplingDataBufferData; + } attributeData; +} CUpti_PCSamplingConfigurationInfo; + +/** + * \brief PC sampling configuration structure + * + * This structure configures PC sampling using \ref cuptiPCSamplingSetConfigurationAttribute + * and queries PC sampling default configuration using \ref cuptiPCSamplingGetConfigurationAttribute + */ +typedef struct +{ + /** + * [w] Size of the data structure i.e. CUpti_PCSamplingConfigurationInfoParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * [w] Assign to NULL + */ + void* pPriv; + /** + * [w] CUcontext + */ + CUcontext ctx; + /** + * [w] Number of attributes to configure using \ref cuptiPCSamplingSetConfigurationAttribute or query + * using \ref cuptiPCSamplingGetConfigurationAttribute + */ + size_t numAttributes; + /** + * Refer \ref CUpti_PCSamplingConfigurationInfo + */ + CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo; +} CUpti_PCSamplingConfigurationInfoParams; +#define CUpti_PCSamplingConfigurationInfoParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingConfigurationInfoParams,pPCSamplingConfigurationInfo) + +/** + * \brief Write PC Sampling configuration attribute. + * + * \param pParams A pointer to \ref CUpti_PCSamplingConfigurationInfoParams + * containing PC sampling configuration. + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with + * some invalid \p attrib. 
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if attribute \p value is not valid + * or any \p pParams is not valid + * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device + * does not support the API + */ +CUptiResult CUPTIAPI cuptiPCSamplingSetConfigurationAttribute(CUpti_PCSamplingConfigurationInfoParams *pParams); + +/** + * \brief Read PC Sampling configuration attribute. + * + * \param pParams A pointer to \ref CUpti_PCSamplingConfigurationInfoParams + * containing PC sampling configuration. + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with + * some invalid attribute. + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p attrib is not valid + * or any \p pParams is not valid + * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT indicates that + * the \p value buffer is too small to hold the attribute value + * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device + * does not support the API + */ +CUptiResult CUPTIAPI cuptiPCSamplingGetConfigurationAttribute(CUpti_PCSamplingConfigurationInfoParams *pParams); + +/** + * \brief Params for cuptiPCSamplingGetData + */ +typedef struct +{ + /** + * [w] Size of the data structure i.e. CUpti_PCSamplingGetDataParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility.
+ */ + size_t size; + /** + * [w] Assign to NULL + */ + void* pPriv; + /** + * [w] CUcontext + */ + CUcontext ctx; + /** + * \param pcSamplingData Data buffer to hold collected PC Sampling data PARSED_DATA + * Buffer type is void * which can point to PARSED_DATA + * Refer \ref CUpti_PCSamplingData for buffer format for PARSED_DATA + */ + void *pcSamplingData; +} CUpti_PCSamplingGetDataParams; +#define CUpti_PCSamplingGetDataParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetDataParams, pcSamplingData) +/** + * \brief Flush GPU PC sampling data periodically. + * + * Flushing of GPU PC Sampling data is required at the following points to maintain uniqueness of PCs: + * For \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS, after every module load-unload-load + * For \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED, after every kernel ends + * If configuration option \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL + * is enabled, then after every range end i.e. \ref cuptiPCSamplingStop() + * + * If application is profiled in \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS, with disabled + * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL, and there is no module unload, + * user can collect data in two ways: + * Use \ref cuptiPCSamplingGetData() API periodically + * Use \ref cuptiPCSamplingDisable() on application exit and read GPU PC sampling data from sampling + * data buffer passed during configuration. + * Note: In case, \ref cuptiPCSamplingGetData() API is not called periodically, then sampling data buffer + * passed during configuration should be large enough to hold all PCs data. + * \ref cuptiPCSamplingGetData() API never does device synchronization. + * It is possible that when the API is called there is some unconsumed data from the HW buffer. In this case + * CUPTI provides only the data available with it at that moment.
+ * + * \param pParams Refer \ref CUpti_PCSamplingGetDataParams + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called without + * enabling PC sampling. + * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid + * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device + * does not support the API + */ +CUptiResult CUPTIAPI cuptiPCSamplingGetData(CUpti_PCSamplingGetDataParams *pParams); + +/** + * \brief Params for cuptiPCSamplingEnable + */ +typedef struct +{ + /** + * [w] Size of the data structure i.e. CUpti_PCSamplingEnableParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * [w] Assign to NULL + */ + void* pPriv; + /** + * [w] CUcontext + */ + CUcontext ctx; +} CUpti_PCSamplingEnableParams; +#define CUpti_PCSamplingEnableParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingEnableParams, ctx) + +/** + * \brief Enable PC sampling. + * + * \param pParams Refer \ref CUpti_PCSamplingEnableParams + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid + * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device + * does not support the API + */ +CUptiResult CUPTIAPI cuptiPCSamplingEnable(CUpti_PCSamplingEnableParams *pParams); + +/** + * \brief Params for cuptiPCSamplingDisable + */ +typedef struct +{ + /** + * [w] Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility.
+ */ + size_t size; + /** + * [w] Assign to NULL + */ + void* pPriv; + /** + * [w] CUcontext + */ + CUcontext ctx; +} CUpti_PCSamplingDisableParams; +#define CUpti_PCSamplingDisableParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingDisableParams, ctx) + +/** + * \brief Disable PC sampling. + * + * For application which doesn't destroy the CUDA context explicitly, + * this API does the PC Sampling tear-down, joins threads and copies PC records in the buffer provided + * during the PC sampling configuration. PC records which can't be accommodated in the buffer are discarded. + * + * \param pParams Refer \ref CUpti_PCSamplingDisableParams + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid + * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device + * does not support the API + */ +CUptiResult CUPTIAPI cuptiPCSamplingDisable(CUpti_PCSamplingDisableParams *pParams); + +/** + * \brief Params for cuptiPCSamplingStart + */ +typedef struct +{ + /** + * [w] Size of the data structure i.e. CUpti_PCSamplingStartParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * [w] Assign to NULL + */ + void* pPriv; + /** + * [w] CUcontext + */ + CUcontext ctx; +} CUpti_PCSamplingStartParams; +#define CUpti_PCSamplingStartParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingStartParams, ctx) + +/** + * \brief Start PC sampling. + * + * User can collect PC Sampling data for user-defined range specified by Start/Stop APIs. + * This API can be used to mark starting of range. Set configuration option + * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL to use this API.
+ * + * \param pParams Refer \ref CUpti_PCSamplingStartParams + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with + * incorrect PC Sampling configuration. + * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid + * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device + * does not support the API + */ +CUptiResult CUPTIAPI cuptiPCSamplingStart(CUpti_PCSamplingStartParams *pParams); + +/** + * \brief Params for cuptiPCSamplingStop + */ +typedef struct +{ + /** + * [w] Size of the data structure i.e. CUpti_PCSamplingStopParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * [w] Assign to NULL + */ + void* pPriv; + /** + * [w] CUcontext + */ + CUcontext ctx; +} CUpti_PCSamplingStopParams; +#define CUpti_PCSamplingStopParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingStopParams, ctx) + +/** + * \brief Stop PC sampling. + * + * User can collect PC Sampling data for user-defined range specified by Start/Stop APIs. + * This API can be used to mark end of range. Set configuration option + * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL to use this API. + * + * \param pParams Refer \ref CUpti_PCSamplingStopParams + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with + * incorrect PC Sampling configuration. + * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid + * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device + * does not support the API + */ +CUptiResult CUPTIAPI cuptiPCSamplingStop(CUpti_PCSamplingStopParams *pParams); + +/** + * \brief Params for cuptiPCSamplingGetNumStallReasons + */ +typedef struct +{ + /** + * [w] Size of the data structure i.e.
CUpti_PCSamplingGetNumStallReasonsParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * [w] Assign to NULL + */ + void* pPriv; + /** + * [w] CUcontext + */ + CUcontext ctx; + /** + * [r] Number of stall reasons + */ + size_t *numStallReasons; +} CUpti_PCSamplingGetNumStallReasonsParams; +#define CUpti_PCSamplingGetNumStallReasonsParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetNumStallReasonsParams, numStallReasons) + +/** + * \brief Get PC sampling stall reason count. + * + * \param Refer \ref CUpti_PCSamplingGetNumStallReasonsParams + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid + * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device + * does not support the API + */ +CUptiResult CUPTIAPI cuptiPCSamplingGetNumStallReasons(CUpti_PCSamplingGetNumStallReasonsParams *pParams); + +/** + * \brief Params for cuptiPCSamplingGetStallReasons + */ +typedef struct +{ + /** + * [w] Size of the data structure i.e. CUpti_PCSamplingGetStallReasonsParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * [w] Assign to NULL + */ + void* pPriv; + /** + * [w] CUcontext + */ + CUcontext ctx; + /** + * [w] Number of stall reasons + */ + size_t numStallReasons; + /** + * [r] Stall reason index + */ + uint32_t *stallReasonIndex; + /** + * [r] Stall reasons name + */ + char **stallReasons; +} CUpti_PCSamplingGetStallReasonsParams; +#define CUpti_PCSamplingGetStallReasonsParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetStallReasonsParams, stallReasons) + +/** + * \brief Get PC sampling stall reasons. 
+ * + * \param Refer \ref CUpti_PCSamplingGetStallReasonsParams + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid + * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device + * does not support the API + */ +CUptiResult CUPTIAPI cuptiPCSamplingGetStallReasons(CUpti_PCSamplingGetStallReasonsParams *pParams); + +/** + * \brief Params for cuptiGetSassToSourceCorrelation + */ +typedef struct { + /** + * [w] Size of the data structure i.e. CUpti_GetSassToSourceCorrelationParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * [w] Pointer to cubin binary where function belongs. + */ + const void* cubin; + /** + * [w] Function name to which PC belongs. + */ + const char *functionName; + /** + * [w] Size of cubin binary. + */ + size_t cubinSize; + /** + * [r] Line number in the source code. + */ + uint32_t lineNumber; + /** + * [w] PC offset + */ + uint64_t pcOffset; + /** + * [r] Path for the source file. + */ + char *fileName; + /** + * [r] Path for the directory of source file. + */ + char *dirName; +} CUpti_GetSassToSourceCorrelationParams; +#define CUpti_GetSassToSourceCorrelationParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_GetSassToSourceCorrelationParams, dirName) + +/** + * \brief SASS to Source correlation. + * + * \param Refer \ref CUpti_GetSassToSourceCorrelationParams + * + * It is expected from user to free allocated memory for fileName and dirName after use. + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if either of the parameters cubin or functionName + * is NULL or cubinSize is zero or size field is not set correctly. + * \retval CUPTI_ERROR_INVALID_MODULE provided cubin is invalid. + * \retval CUPTI_ERROR_UNKNOWN an internal error occurred. 
+ * This error code is also used for cases when the function is not present in the module. + * A better error code will be returned in the future release. + */ +CUptiResult CUPTIAPI cuptiGetSassToSourceCorrelation(CUpti_GetSassToSourceCorrelationParams *pParams); + +/** + * \brief Params for cuptiGetCubinCrc + */ +typedef struct { + /** + * [w] Size of configuration structure. + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * [w] Size of cubin binary. + */ + size_t cubinSize; + /** + * [w] Pointer to cubin binary + */ + const void* cubin; + /** + * [r] Computed CRC will be stored in it. + */ + uint64_t cubinCrc; +} CUpti_GetCubinCrcParams; +#define CUpti_GetCubinCrcParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_GetCubinCrcParams, cubinCrc) + +/** + * \brief Get the CRC of cubin. + * + * This function returns the CRC of provided cubin binary. + * + * \param pParams Refer \ref CUpti_GetCubinCrcParams + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if parameter cubin is NULL or + * provided cubinSize is zero or size field is not set. + */ +CUptiResult CUPTIAPI cuptiGetCubinCrc(CUpti_GetCubinCrcParams *pParams); + +/** + * \brief Function type for callback used by CUPTI to request crc of + * loaded module. + * + * This callback function asks for the crc of the provided module. + * The provided crc will be stored in PC sampling records i.e. in the field 'cubinCrc' of the PC sampling + * struct CUpti_PCSamplingPCData. The CRC is used during the offline source correlation to uniquely identify the module. + * + * \param cubin The pointer to cubin binary + * \param cubinSize The size of cubin binary. + * \param cubinCrc Returns the computed crc of cubin.
+ */ +typedef void (CUPTIAPI *CUpti_ComputeCrcCallbackFunc)( + const void* cubin, + size_t cubinSize, + uint64_t *cubinCrc); + +/** + * \brief Register callback function with CUPTI to use + * your own algorithm to compute cubin crc. + * + * This function registers a callback function and it gets called + * from CUPTI when a CUDA module is loaded. + * + * \param funcComputeCubinCrc callback is invoked when a CUDA module + * is loaded. + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p funcComputeCubinCrc is NULL. + */ +CUptiResult CUPTIAPI cuptiRegisterComputeCrcCallback(CUpti_ComputeCrcCallbackFunc funcComputeCubinCrc); + +/** @} */ /* END CUPTI_PCSAMPLING_API */ + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility pop +#endif + +#if defined(__cplusplus) +} +#endif + +#endif /*_CUPTI_PCSAMPLING_H_*/ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86372ad53bf0e76bca82e189915b4989d29e2180 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_infer_v8.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_infer_v8.h new file mode 100644 index 0000000000000000000000000000000000000000..e24cfcbba4d93b57f15a4bd60fbe60a99b493f66 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_infer_v8.h @@ -0,0 +1,571 @@ +/* + * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
+ * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. 
Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_cnn_infer : cuDNN's basic definitions and inference CNN functions. + */ + +#if !defined(CUDNN_CNN_INFER_H_) +#define CUDNN_CNN_INFER_H_ + +#pragma once +#include +#include + +#include "cudnn_version.h" +#include "cudnn_ops_infer.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_CNN_INFER_MAJOR 8 +#define CUDNN_CNN_INFER_MINOR 7 +#define CUDNN_CNN_INFER_PATCH 0 + +#if (CUDNN_CNN_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_INFER_MINOR != CUDNN_MINOR) || \ + (CUDNN_CNN_INFER_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN CNN INFER!!! 
+#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +typedef struct cudnnConvolutionStruct *cudnnConvolutionDescriptor_t; + +/* + * convolution mode + */ +typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t; + +/* + * CUDNN Reorder + */ +typedef enum { + CUDNN_DEFAULT_REORDER = 0, + CUDNN_NO_REORDER = 1, +} cudnnReorderType_t; + +typedef struct cudnnConvolutionFwdAlgoPerfStruct { + cudnnConvolutionFwdAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionFwdAlgoPerf_t; + +/* Create an instance of convolution descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc); + +/* Destroy an instance of convolution descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType); + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType); + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount); + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount); + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType); + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType); + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, + int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride */ + int dilation_h, /* filter dilation in the 
vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType); + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, /* zero-padding height */ + int *pad_w, /* zero-padding width */ + int *u, /* vertical filter stride */ + int *v, /* horizontal filter stride */ + int *dilation_h, /* filter dilation in the vertical dimension */ + int *dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType); + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, + int arrayLength, /* nbDims-2 size */ + const int padA[], + const int filterStrideA[], + const int dilationA[], + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType); /* convolution data type */ + +/* Retrieve the padding, stride, dilation, mode and compute type held by an N-d convolution descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc, + int arrayLengthRequested, + int *arrayLength, + int padA[], + int strideA[], + int dilationA[], + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType); /* convolution data type */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int *n, + int *c, + int *h, + int *w); + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int nbDims, + int tensorOuputDimA[]); + +/* helper function to provide the convolution forward algo that fit best the
requirement */ +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnIm2Col(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + void *colBuffer); + +cudnnStatus_t CUDNNWINAPI +cudnnReorderFilterAndBias(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + cudnnReorderType_t reorderType, + const void *filterData, + void *reorderedFilterData, + int reorderBias, + const void *biasData, + void *reorderedBiasData); + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +cudnnStatus_t CUDNNWINAPI 
+cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes); + +/* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform the forward pass for batch convolution */ +cudnnStatus_t CUDNNWINAPI +cudnnConvolutionForward(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */ +cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBiasActivationForward(cudnnHandle_t handle, + const void *alpha1, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *alpha2, + const cudnnTensorDescriptor_t zDesc, + const void *z, + const cudnnTensorDescriptor_t biasDesc, + const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* helper function to provide the convolution backward data algo that fit best the requirement */ + +typedef struct cudnnConvolutionBwdDataAlgoPerfStruct { + cudnnConvolutionBwdDataAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionBwdDataAlgoPerf_t; + +cudnnStatus_t CUDNNWINAPI 
+cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + +/* + * convolution algorithm (which requires potentially some workspace) + */ + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardData(cudnnHandle_t handle, + const void *alpha, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const 
cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Helper function to calculate folding descriptors for dgrad */ +cudnnStatus_t CUDNNWINAPI +cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const cudnnTensorFormat_t transformFormat, + cudnnFilterDescriptor_t foldedFilterDesc, + cudnnTensorDescriptor_t paddedDiffDesc, + cudnnConvolutionDescriptor_t foldedConvDesc, + cudnnTensorDescriptor_t foldedGradDesc, + cudnnTensorTransformDescriptor_t filterFoldTransDesc, + cudnnTensorTransformDescriptor_t diffPadTransDesc, + cudnnTensorTransformDescriptor_t gradFoldTransDesc, + cudnnTensorTransformDescriptor_t gradUnfoldTransDesc); + +/* cudnnFusedOps... 
*/ +struct cudnnFusedOpsConstParamStruct; +typedef struct cudnnFusedOpsConstParamStruct *cudnnFusedOpsConstParamPack_t; + +struct cudnnFusedOpsVariantParamStruct; +typedef struct cudnnFusedOpsVariantParamStruct *cudnnFusedOpsVariantParamPack_t; + +struct cudnnFusedOpsPlanStruct; +typedef struct cudnnFusedOpsPlanStruct *cudnnFusedOpsPlan_t; + +typedef enum { + /* each op in [ ] can be disabled by passing NULL ptr */ + /* [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] */ + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0, + /* [per channel scale], [per channel bias], [activation], convolutionBackwardWeights */ + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1, + /* utility for BN training in BN-conv fusion */ + /* computes the equivalent scale and bias from ySum ySqSum and learned scale, bias */ + /* optionally update running stats and generate saved stats */ + CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2, + /* utility for BN inference in BN-conv fusion */ + /* computes the equivalent scale and bias from learned running stats and learned scale, bias */ + CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3, + /* reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] */ + CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4, + /* reserved for future use: [per channel scale], [per channel bias], [residual add], activation, bitmask */ + CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5, + /* reserved for future use */ + CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6, +} cudnnFusedOps_t; + +typedef enum { + /* set XDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get XDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_XDESC = 0, + /* set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_XDATA_PLACEHOLDER = 1, + /* set/get BN_MODE: pass cudnnBatchNormMode_t* */ + CUDNN_PARAM_BN_MODE = 2, + /* set 
CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3, + /* set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4, + /* set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5, + /* set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t */ + /* get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t */ + CUDNN_PARAM_ACTIVATION_DESC = 6, + /* set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t */ + /* get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t */ + CUDNN_PARAM_CONV_DESC = 7, + /* set WDESC: pass previously initialized cudnnFilterDescriptor_t */ + /* get WDESC: pass previously created cudnnFilterDescriptor_t */ + CUDNN_PARAM_WDESC = 8, + /* set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_WDATA_PLACEHOLDER = 9, + /* set DWDESC: pass previously initialized cudnnFilterDescriptor_t */ + /* get DWDESC: pass previously created cudnnFilterDescriptor_t */ + CUDNN_PARAM_DWDESC = 10, + /* set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DWDATA_PLACEHOLDER = 11, + /* set YDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get YDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_YDESC = 12, + /* set/get YDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YDATA_PLACEHOLDER = 13, + /* set DYDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DYDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DYDESC = 14, + /* set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DYDATA_PLACEHOLDER = 15, + /* set YSTATS_DESC: pass 
previously initialized cudnnTensorDescriptor_t */ + /* get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_YSTATS_DESC = 16, + /* set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YSUM_PLACEHOLDER = 17, + /* set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18, + /* set CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19, + /* set/get CUDNN_PARAM_BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20, + /* set/get CUDNN_PARAM_BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21, + /* set/get CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22, + /* set/get CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23, + /* set/get CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24, + /* set/get CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25, + + /* set ZDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get ZDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_ZDESC = 26, + /* set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_ZDATA_PLACEHOLDER = 27, + /* set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28, + /* set/get BN_Z_EQSCALE_PLACEHOLDER: pass 
cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29, + /* set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30, + + /* set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31, + /* set/get ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32, + + /* set DXDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DXDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DXDESC = 33, + /* set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DXDATA_PLACEHOLDER = 34, + /* set DZDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DZDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DZDESC = 35, + /* set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DZDATA_PLACEHOLDER = 36, + /* set/get CUDNN_PARAM_BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37, + /* set/get CUDNN_PARAM_BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38, +} cudnnFusedOpsConstParamLabel_t; + +typedef enum { + CUDNN_PTR_NULL = 0, + CUDNN_PTR_ELEM_ALIGNED = 1, + CUDNN_PTR_16B_ALIGNED = 2, +} cudnnFusedOpsPointerPlaceHolder_t; + +typedef enum { + /* set: pass void* pointing to dev memory */ + /* get: pass void** pointing to host memory */ + CUDNN_PTR_XDATA = 0, + CUDNN_PTR_BN_EQSCALE = 1, + CUDNN_PTR_BN_EQBIAS = 2, + CUDNN_PTR_WDATA = 3, + CUDNN_PTR_DWDATA = 4, + CUDNN_PTR_YDATA = 5, + CUDNN_PTR_DYDATA = 6, + CUDNN_PTR_YSUM = 7, + CUDNN_PTR_YSQSUM = 8, + CUDNN_PTR_WORKSPACE = 9, + CUDNN_PTR_BN_SCALE = 10, + CUDNN_PTR_BN_BIAS = 11, + 
CUDNN_PTR_BN_SAVED_MEAN = 12, + CUDNN_PTR_BN_SAVED_INVSTD = 13, + CUDNN_PTR_BN_RUNNING_MEAN = 14, + CUDNN_PTR_BN_RUNNING_VAR = 15, + CUDNN_PTR_ZDATA = 16, + CUDNN_PTR_BN_Z_EQSCALE = 17, + CUDNN_PTR_BN_Z_EQBIAS = 18, + CUDNN_PTR_ACTIVATION_BITMASK = 19, + CUDNN_PTR_DXDATA = 20, + CUDNN_PTR_DZDATA = 21, + CUDNN_PTR_BN_DSCALE = 22, + CUDNN_PTR_BN_DBIAS = 23, + + /* set/get: pass size_t* pointing to host memory */ + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100, + /* set/get: pass int64_t* pointing to host memory */ + CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101, + /* set/get: pass double* pointing to host memory */ + CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102, + /* set/get: pass double* pointing to host memory */ + CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103, +} cudnnFusedOpsVariantParamLabel_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCnnInferVersionCheck(void); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_CNN_INFER_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version.h new file mode 100644 index 0000000000000000000000000000000000000000..a6ff223dbf1791512913b378c42f3695cf9bb86a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version.h @@ -0,0 +1,70 @@ +/* + * Copyright 2017-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +/** + * \file: The master cuDNN version file. + */ + +#ifndef CUDNN_VERSION_H_ +#define CUDNN_VERSION_H_ + +#define CUDNN_MAJOR 8 +#define CUDNN_MINOR 7 +#define CUDNN_PATCHLEVEL 0 + +#define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL) + +/* cannot use constexpr here since this is a C-only file */ +/* Below is the max SM version this cuDNN library is aware of and supports natively */ + +#define CUDNN_MAX_SM_MAJOR_NUMBER 9 +#define CUDNN_MAX_SM_MINOR_NUMBER 0 +#define CUDNN_MAX_DEVICE_VERSION (CUDNN_MAX_SM_MAJOR_NUMBER * 100) + (CUDNN_MAX_SM_MINOR_NUMBER * 10) + +#endif /* CUDNN_VERSION_H */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..311087ab956a7e5226e441587728ed45a3ee2e03 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cudalibxt.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cudalibxt.h new file mode 100644 index 0000000000000000000000000000000000000000..94fcf4745fafa04f57678ba5ee64103f8ebd6444 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cudalibxt.h @@ -0,0 +1,97 @@ + /* Copyright 2013,2014 NVIDIA Corporation. All rights reserved. 
+ * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. 
Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/*! +* \file cudalibxt.h +* \brief Public header file for the NVIDIA library multi-GPU support structures +*/ + +#ifndef _CUDA_LIB_XT_H_ +#define _CUDA_LIB_XT_H_ +#include + +#define CUDA_XT_DESCRIPTOR_VERSION 0x01000000 // This is added to CUDART_VERSION + +enum cudaXtCopyType_t { + LIB_XT_COPY_HOST_TO_DEVICE, + LIB_XT_COPY_DEVICE_TO_HOST, + LIB_XT_COPY_DEVICE_TO_DEVICE +} ; +typedef enum cudaXtCopyType_t cudaLibXtCopyType; + +enum libFormat_t { + LIB_FORMAT_CUFFT = 0x0, + LIB_FORMAT_UNDEFINED = 0x1 +}; + +typedef enum libFormat_t libFormat; + +#define MAX_CUDA_DESCRIPTOR_GPUS 64 + +struct cudaXtDesc_t{ + int version; //descriptor version + int nGPUs; //number of GPUs + int GPUs[MAX_CUDA_DESCRIPTOR_GPUS]; //array of device IDs + void *data[MAX_CUDA_DESCRIPTOR_GPUS]; //array of pointers to data, one per GPU + size_t size[MAX_CUDA_DESCRIPTOR_GPUS]; //array of data sizes, one per GPU + void *cudaXtState; //opaque CUDA utility structure +}; +typedef struct cudaXtDesc_t cudaXtDesc; + +struct cudaLibXtDesc_t{ + int version; //descriptor version + cudaXtDesc *descriptor; //multi-GPU memory descriptor + libFormat library; //which library recognizes the format + int subFormat; //library specific enumerator of sub formats + void *libDescriptor; //library specific descriptor e.g. 
FFT transform plan object +}; +typedef struct cudaLibXtDesc_t cudaLibXtDesc; + + +#endif + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftXt.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftXt.h new file mode 100644 index 0000000000000000000000000000000000000000..511f5c7445d2f5f4bf9b84ebd766099b41837627 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftXt.h @@ -0,0 +1,269 @@ + + /* Copyright 2005-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/*! +* \file cufftXt.h +* \brief Public header file for the NVIDIA CUDA FFT library (CUFFT) +*/ + +#ifndef _CUFFTXT_H_ +#define _CUFFTXT_H_ +#include "cudalibxt.h" +#include "cufft.h" + + +#ifndef CUFFTAPI +#ifdef _WIN32 +#define CUFFTAPI __stdcall +#else +#define CUFFTAPI +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// +// cufftXtSubFormat identifies the data layout of +// a memory descriptor owned by cufft. 
+// note that multi GPU cufft does not yet support out-of-place transforms +// + +typedef enum cufftXtSubFormat_t { + CUFFT_XT_FORMAT_INPUT = 0x00, //by default input is in linear order across GPUs + CUFFT_XT_FORMAT_OUTPUT = 0x01, //by default output is in scrambled order depending on transform + CUFFT_XT_FORMAT_INPLACE = 0x02, //by default inplace is input order, which is linear across GPUs + CUFFT_XT_FORMAT_INPLACE_SHUFFLED = 0x03, //shuffled output order after execution of the transform + CUFFT_XT_FORMAT_1D_INPUT_SHUFFLED = 0x04, //shuffled input order prior to execution of 1D transforms + CUFFT_XT_FORMAT_DISTRIBUTED_INPUT = 0x05, + CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT = 0x06, + CUFFT_FORMAT_UNDEFINED = 0x07 +} cufftXtSubFormat; + +// +// cufftXtCopyType specifies the type of copy for cufftXtMemcpy +// +typedef enum cufftXtCopyType_t { + CUFFT_COPY_HOST_TO_DEVICE = 0x00, + CUFFT_COPY_DEVICE_TO_HOST = 0x01, + CUFFT_COPY_DEVICE_TO_DEVICE = 0x02, + CUFFT_COPY_UNDEFINED = 0x03 +} cufftXtCopyType; + +// +// cufftXtQueryType specifies the type of query for cufftXtQueryPlan +// +typedef enum cufftXtQueryType_t { + CUFFT_QUERY_1D_FACTORS = 0x00, + CUFFT_QUERY_UNDEFINED = 0x01 +} cufftXtQueryType; + +typedef struct cufftXt1dFactors_t { + long long int size; + long long int stringCount; + long long int stringLength; + long long int substringLength; + long long int factor1; + long long int factor2; + long long int stringMask; + long long int substringMask; + long long int factor1Mask; + long long int factor2Mask; + int stringShift; + int substringShift; + int factor1Shift; + int factor2Shift; +} cufftXt1dFactors; + +// +// cufftXtWorkAreaPolicy specifies policy for cufftXtSetWorkAreaPolicy +// +typedef enum cufftXtWorkAreaPolicy_t { + CUFFT_WORKAREA_MINIMAL = 0, /* maximum reduction */ + CUFFT_WORKAREA_USER = 1, /* use workSize parameter as limit */ + CUFFT_WORKAREA_PERFORMANCE = 2, /* default - 1x overhead or more, maximum performance */ +} cufftXtWorkAreaPolicy; + +// 
multi-GPU routines +cufftResult CUFFTAPI cufftXtSetGPUs(cufftHandle handle, int nGPUs, int *whichGPUs); + +cufftResult CUFFTAPI cufftXtMalloc(cufftHandle plan, + cudaLibXtDesc ** descriptor, + cufftXtSubFormat format); + +cufftResult CUFFTAPI cufftXtMemcpy(cufftHandle plan, + void *dstPointer, + void *srcPointer, + cufftXtCopyType type); + +cufftResult CUFFTAPI cufftXtFree(cudaLibXtDesc *descriptor); + +cufftResult CUFFTAPI cufftXtSetWorkArea(cufftHandle plan, void **workArea); + +cufftResult CUFFTAPI cufftXtExecDescriptorC2C(cufftHandle plan, + cudaLibXtDesc *input, + cudaLibXtDesc *output, + int direction); + +cufftResult CUFFTAPI cufftXtExecDescriptorR2C(cufftHandle plan, + cudaLibXtDesc *input, + cudaLibXtDesc *output); + +cufftResult CUFFTAPI cufftXtExecDescriptorC2R(cufftHandle plan, + cudaLibXtDesc *input, + cudaLibXtDesc *output); + +cufftResult CUFFTAPI cufftXtExecDescriptorZ2Z(cufftHandle plan, + cudaLibXtDesc *input, + cudaLibXtDesc *output, + int direction); + +cufftResult CUFFTAPI cufftXtExecDescriptorD2Z(cufftHandle plan, + cudaLibXtDesc *input, + cudaLibXtDesc *output); + +cufftResult CUFFTAPI cufftXtExecDescriptorZ2D(cufftHandle plan, + cudaLibXtDesc *input, + cudaLibXtDesc *output); + +// Utility functions + +cufftResult CUFFTAPI cufftXtQueryPlan(cufftHandle plan, void *queryStruct, cufftXtQueryType queryType); + + +// callbacks + + +typedef enum cufftXtCallbackType_t { + CUFFT_CB_LD_COMPLEX = 0x0, + CUFFT_CB_LD_COMPLEX_DOUBLE = 0x1, + CUFFT_CB_LD_REAL = 0x2, + CUFFT_CB_LD_REAL_DOUBLE = 0x3, + CUFFT_CB_ST_COMPLEX = 0x4, + CUFFT_CB_ST_COMPLEX_DOUBLE = 0x5, + CUFFT_CB_ST_REAL = 0x6, + CUFFT_CB_ST_REAL_DOUBLE = 0x7, + CUFFT_CB_UNDEFINED = 0x8 + +} cufftXtCallbackType; + +typedef cufftComplex (*cufftCallbackLoadC)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer); +typedef cufftDoubleComplex (*cufftCallbackLoadZ)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer); +typedef cufftReal (*cufftCallbackLoadR)(void *dataIn, 
size_t offset, void *callerInfo, void *sharedPointer); +typedef cufftDoubleReal(*cufftCallbackLoadD)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer); + +typedef void (*cufftCallbackStoreC)(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer); +typedef void (*cufftCallbackStoreZ)(void *dataOut, size_t offset, cufftDoubleComplex element, void *callerInfo, void *sharedPointer); +typedef void (*cufftCallbackStoreR)(void *dataOut, size_t offset, cufftReal element, void *callerInfo, void *sharedPointer); +typedef void (*cufftCallbackStoreD)(void *dataOut, size_t offset, cufftDoubleReal element, void *callerInfo, void *sharedPointer); + + +cufftResult CUFFTAPI cufftXtSetCallback(cufftHandle plan, void **callback_routine, cufftXtCallbackType cbType, void **caller_info); +cufftResult CUFFTAPI cufftXtClearCallback(cufftHandle plan, cufftXtCallbackType cbType); +cufftResult CUFFTAPI cufftXtSetCallbackSharedSize(cufftHandle plan, cufftXtCallbackType cbType, size_t sharedSize); + +cufftResult CUFFTAPI cufftXtMakePlanMany(cufftHandle plan, + int rank, + long long int *n, + long long int *inembed, + long long int istride, + long long int idist, + cudaDataType inputtype, + long long int *onembed, + long long int ostride, + long long int odist, + cudaDataType outputtype, + long long int batch, + size_t *workSize, + cudaDataType executiontype); + +cufftResult CUFFTAPI cufftXtGetSizeMany(cufftHandle plan, + int rank, + long long int *n, + long long int *inembed, + long long int istride, + long long int idist, + cudaDataType inputtype, + long long int *onembed, + long long int ostride, + long long int odist, + cudaDataType outputtype, + long long int batch, + size_t *workSize, + cudaDataType executiontype); + + +cufftResult CUFFTAPI cufftXtExec(cufftHandle plan, + void *input, + void *output, + int direction); + +cufftResult CUFFTAPI cufftXtExecDescriptor(cufftHandle plan, + cudaLibXtDesc *input, + cudaLibXtDesc *output, + int 
direction); + +cufftResult CUFFTAPI cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolicy policy, size_t *workSize); + +typedef struct cufftBox3d_t { + size_t lower[3]; + size_t upper[3]; + size_t strides[3]; +} cufftBox3d; + +cufftResult CUFFTAPI cufftXtSetDistribution(cufftHandle plan, + const cufftBox3d *box_in, + const cufftBox3d *box_out); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_nvtx_cu11-11.8.86.dist-info/METADATA b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_nvtx_cu11-11.8.86.dist-info/METADATA new file mode 100644 index 0000000000000000000000000000000000000000..b33560afbd4186ef1f1cab5be9f46b014e6bba0a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_nvtx_cu11-11.8.86.dist-info/METADATA @@ -0,0 +1,35 @@ +Metadata-Version: 2.1 +Name: nvidia-nvtx-cu11 +Version: 11.8.86 +Summary: NVIDIA Tools Extension +Home-page: https://developer.nvidia.com/cuda-zone +Author: Nvidia CUDA Installer Team 
+Author-email: cuda_installer@nvidia.com +License: NVIDIA Proprietary Software +Keywords: cuda,nvidia,runtime,machine learning,deep learning +Classifier: Development Status :: 4 - Beta +Classifier: Intended Audience :: Developers +Classifier: Intended Audience :: Education +Classifier: Intended Audience :: Science/Research +Classifier: License :: Other/Proprietary License +Classifier: Natural Language :: English +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3 :: Only +Classifier: Topic :: Scientific/Engineering +Classifier: Topic :: Scientific/Engineering :: Mathematics +Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence +Classifier: Topic :: Software Development +Classifier: Topic :: Software Development :: Libraries +Classifier: Operating System :: Microsoft :: Windows +Classifier: Operating System :: POSIX :: Linux +Requires-Python: >=3 +License-File: License.txt + +A C-based API for annotating events, code ranges, and resources in your applications. Applications which integrate NVTX can use the Visual Profiler to capture and visualize these events and ranges. 
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_nvtx_cu11-11.8.86.dist-info/WHEEL b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_nvtx_cu11-11.8.86.dist-info/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..06e355fe0e3ed7077903f119ae6928a17da8eb6f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_nvtx_cu11-11.8.86.dist-info/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel (0.37.1) +Root-Is-Purelib: true +Tag: py3-none-manylinux1_x86_64 + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/__pycache__/_elffile.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/__pycache__/_elffile.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf3ac7b3d5071040de6d0c23ea365a6b5f46d613 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/__pycache__/_elffile.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/_parser.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..c1238c06eab95f8c90c393383a703aa3b8c366a5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/_parser.py @@ -0,0 +1,354 @@ +"""Handwritten parser of dependency specifiers. + +The docstring for each __parse_* function contains EBNF-inspired grammar representing +the implementation. 
+""" + +from __future__ import annotations + +import ast +from typing import NamedTuple, Sequence, Tuple, Union + +from ._tokenizer import DEFAULT_RULES, Tokenizer + + +class Node: + def __init__(self, value: str) -> None: + self.value = value + + def __str__(self) -> str: + return self.value + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}('{self}')>" + + def serialize(self) -> str: + raise NotImplementedError + + +class Variable(Node): + def serialize(self) -> str: + return str(self) + + +class Value(Node): + def serialize(self) -> str: + return f'"{self}"' + + +class Op(Node): + def serialize(self) -> str: + return str(self) + + +MarkerVar = Union[Variable, Value] +MarkerItem = Tuple[MarkerVar, Op, MarkerVar] +MarkerAtom = Union[MarkerItem, Sequence["MarkerAtom"]] +MarkerList = Sequence[Union["MarkerList", MarkerAtom, str]] + + +class ParsedRequirement(NamedTuple): + name: str + url: str + extras: list[str] + specifier: str + marker: MarkerList | None + + +# -------------------------------------------------------------------------------------- +# Recursive descent parser for dependency specifier +# -------------------------------------------------------------------------------------- +def parse_requirement(source: str) -> ParsedRequirement: + return _parse_requirement(Tokenizer(source, rules=DEFAULT_RULES)) + + +def _parse_requirement(tokenizer: Tokenizer) -> ParsedRequirement: + """ + requirement = WS? IDENTIFIER WS? extras WS? 
requirement_details + """ + tokenizer.consume("WS") + + name_token = tokenizer.expect( + "IDENTIFIER", expected="package name at the start of dependency specifier" + ) + name = name_token.text + tokenizer.consume("WS") + + extras = _parse_extras(tokenizer) + tokenizer.consume("WS") + + url, specifier, marker = _parse_requirement_details(tokenizer) + tokenizer.expect("END", expected="end of dependency specifier") + + return ParsedRequirement(name, url, extras, specifier, marker) + + +def _parse_requirement_details( + tokenizer: Tokenizer, +) -> tuple[str, str, MarkerList | None]: + """ + requirement_details = AT URL (WS requirement_marker?)? + | specifier WS? (requirement_marker)? + """ + + specifier = "" + url = "" + marker = None + + if tokenizer.check("AT"): + tokenizer.read() + tokenizer.consume("WS") + + url_start = tokenizer.position + url = tokenizer.expect("URL", expected="URL after @").text + if tokenizer.check("END", peek=True): + return (url, specifier, marker) + + tokenizer.expect("WS", expected="whitespace after URL") + + # The input might end after whitespace. + if tokenizer.check("END", peek=True): + return (url, specifier, marker) + + marker = _parse_requirement_marker( + tokenizer, span_start=url_start, after="URL and whitespace" + ) + else: + specifier_start = tokenizer.position + specifier = _parse_specifier(tokenizer) + tokenizer.consume("WS") + + if tokenizer.check("END", peek=True): + return (url, specifier, marker) + + marker = _parse_requirement_marker( + tokenizer, + span_start=specifier_start, + after=( + "version specifier" + if specifier + else "name and no valid version specifier" + ), + ) + + return (url, specifier, marker) + + +def _parse_requirement_marker( + tokenizer: Tokenizer, *, span_start: int, after: str +) -> MarkerList: + """ + requirement_marker = SEMICOLON marker WS? 
+ """ + + if not tokenizer.check("SEMICOLON"): + tokenizer.raise_syntax_error( + f"Expected end or semicolon (after {after})", + span_start=span_start, + ) + tokenizer.read() + + marker = _parse_marker(tokenizer) + tokenizer.consume("WS") + + return marker + + +def _parse_extras(tokenizer: Tokenizer) -> list[str]: + """ + extras = (LEFT_BRACKET wsp* extras_list? wsp* RIGHT_BRACKET)? + """ + if not tokenizer.check("LEFT_BRACKET", peek=True): + return [] + + with tokenizer.enclosing_tokens( + "LEFT_BRACKET", + "RIGHT_BRACKET", + around="extras", + ): + tokenizer.consume("WS") + extras = _parse_extras_list(tokenizer) + tokenizer.consume("WS") + + return extras + + +def _parse_extras_list(tokenizer: Tokenizer) -> list[str]: + """ + extras_list = identifier (wsp* ',' wsp* identifier)* + """ + extras: list[str] = [] + + if not tokenizer.check("IDENTIFIER"): + return extras + + extras.append(tokenizer.read().text) + + while True: + tokenizer.consume("WS") + if tokenizer.check("IDENTIFIER", peek=True): + tokenizer.raise_syntax_error("Expected comma between extra names") + elif not tokenizer.check("COMMA"): + break + + tokenizer.read() + tokenizer.consume("WS") + + extra_token = tokenizer.expect("IDENTIFIER", expected="extra name after comma") + extras.append(extra_token.text) + + return extras + + +def _parse_specifier(tokenizer: Tokenizer) -> str: + """ + specifier = LEFT_PARENTHESIS WS? version_many WS? RIGHT_PARENTHESIS + | WS? version_many WS? + """ + with tokenizer.enclosing_tokens( + "LEFT_PARENTHESIS", + "RIGHT_PARENTHESIS", + around="version specifier", + ): + tokenizer.consume("WS") + parsed_specifiers = _parse_version_many(tokenizer) + tokenizer.consume("WS") + + return parsed_specifiers + + +def _parse_version_many(tokenizer: Tokenizer) -> str: + """ + version_many = (SPECIFIER (WS? COMMA WS? SPECIFIER)*)? 
+ """ + parsed_specifiers = "" + while tokenizer.check("SPECIFIER"): + span_start = tokenizer.position + parsed_specifiers += tokenizer.read().text + if tokenizer.check("VERSION_PREFIX_TRAIL", peek=True): + tokenizer.raise_syntax_error( + ".* suffix can only be used with `==` or `!=` operators", + span_start=span_start, + span_end=tokenizer.position + 1, + ) + if tokenizer.check("VERSION_LOCAL_LABEL_TRAIL", peek=True): + tokenizer.raise_syntax_error( + "Local version label can only be used with `==` or `!=` operators", + span_start=span_start, + span_end=tokenizer.position, + ) + tokenizer.consume("WS") + if not tokenizer.check("COMMA"): + break + parsed_specifiers += tokenizer.read().text + tokenizer.consume("WS") + + return parsed_specifiers + + +# -------------------------------------------------------------------------------------- +# Recursive descent parser for marker expression +# -------------------------------------------------------------------------------------- +def parse_marker(source: str) -> MarkerList: + return _parse_full_marker(Tokenizer(source, rules=DEFAULT_RULES)) + + +def _parse_full_marker(tokenizer: Tokenizer) -> MarkerList: + retval = _parse_marker(tokenizer) + tokenizer.expect("END", expected="end of marker expression") + return retval + + +def _parse_marker(tokenizer: Tokenizer) -> MarkerList: + """ + marker = marker_atom (BOOLOP marker_atom)+ + """ + expression = [_parse_marker_atom(tokenizer)] + while tokenizer.check("BOOLOP"): + token = tokenizer.read() + expr_right = _parse_marker_atom(tokenizer) + expression.extend((token.text, expr_right)) + return expression + + +def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom: + """ + marker_atom = WS? LEFT_PARENTHESIS WS? marker WS? RIGHT_PARENTHESIS WS? + | WS? marker_item WS? 
+ """ + + tokenizer.consume("WS") + if tokenizer.check("LEFT_PARENTHESIS", peek=True): + with tokenizer.enclosing_tokens( + "LEFT_PARENTHESIS", + "RIGHT_PARENTHESIS", + around="marker expression", + ): + tokenizer.consume("WS") + marker: MarkerAtom = _parse_marker(tokenizer) + tokenizer.consume("WS") + else: + marker = _parse_marker_item(tokenizer) + tokenizer.consume("WS") + return marker + + +def _parse_marker_item(tokenizer: Tokenizer) -> MarkerItem: + """ + marker_item = WS? marker_var WS? marker_op WS? marker_var WS? + """ + tokenizer.consume("WS") + marker_var_left = _parse_marker_var(tokenizer) + tokenizer.consume("WS") + marker_op = _parse_marker_op(tokenizer) + tokenizer.consume("WS") + marker_var_right = _parse_marker_var(tokenizer) + tokenizer.consume("WS") + return (marker_var_left, marker_op, marker_var_right) + + +def _parse_marker_var(tokenizer: Tokenizer) -> MarkerVar: + """ + marker_var = VARIABLE | QUOTED_STRING + """ + if tokenizer.check("VARIABLE"): + return process_env_var(tokenizer.read().text.replace(".", "_")) + elif tokenizer.check("QUOTED_STRING"): + return process_python_str(tokenizer.read().text) + else: + tokenizer.raise_syntax_error( + message="Expected a marker variable or quoted string" + ) + + +def process_env_var(env_var: str) -> Variable: + if env_var in ("platform_python_implementation", "python_implementation"): + return Variable("platform_python_implementation") + else: + return Variable(env_var) + + +def process_python_str(python_str: str) -> Value: + value = ast.literal_eval(python_str) + return Value(str(value)) + + +def _parse_marker_op(tokenizer: Tokenizer) -> Op: + """ + marker_op = IN | NOT IN | OP + """ + if tokenizer.check("IN"): + tokenizer.read() + return Op("in") + elif tokenizer.check("NOT"): + tokenizer.read() + tokenizer.expect("WS", expected="whitespace after 'not'") + tokenizer.expect("IN", expected="'in' after 'not'") + return Op("not in") + elif tokenizer.check("OP"): + return Op(tokenizer.read().text) + 
else: + return tokenizer.raise_syntax_error( + "Expected marker operator, one of " + "<=, <, !=, ==, >=, >, ~=, ===, in, not in" + ) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/markers.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/markers.py new file mode 100644 index 0000000000000000000000000000000000000000..fb7f49cf8cd43ffae71e3e8d15174d7536f9da02 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/markers.py @@ -0,0 +1,331 @@ +# This file is dual licensed under the terms of the Apache License, Version +# 2.0, and the BSD License. See the LICENSE file in the root of this repository +# for complete details. + +from __future__ import annotations + +import operator +import os +import platform +import sys +from typing import Any, Callable, TypedDict, cast + +from ._parser import MarkerAtom, MarkerList, Op, Value, Variable +from ._parser import parse_marker as _parse_marker +from ._tokenizer import ParserSyntaxError +from .specifiers import InvalidSpecifier, Specifier +from .utils import canonicalize_name + +__all__ = [ + "InvalidMarker", + "Marker", + "UndefinedComparison", + "UndefinedEnvironmentName", + "default_environment", +] + +Operator = Callable[[str, str], bool] + + +class InvalidMarker(ValueError): + """ + An invalid marker was found, users should refer to PEP 508. + """ + + +class UndefinedComparison(ValueError): + """ + An invalid operation was attempted on a value that doesn't support it. + """ + + +class UndefinedEnvironmentName(ValueError): + """ + A name was attempted to be used that does not exist inside of the + environment. + """ + + +class Environment(TypedDict): + implementation_name: str + """The implementation's identifier, e.g. ``'cpython'``.""" + + implementation_version: str + """ + The implementation's version, e.g. ``'3.13.0a2'`` for CPython 3.13.0a2, or + ``'7.3.13'`` for PyPy3.10 v7.3.13. 
+ """ + + os_name: str + """ + The value of :py:data:`os.name`. The name of the operating system dependent module + imported, e.g. ``'posix'``. + """ + + platform_machine: str + """ + Returns the machine type, e.g. ``'i386'``. + + An empty string if the value cannot be determined. + """ + + platform_release: str + """ + The system's release, e.g. ``'2.2.0'`` or ``'NT'``. + + An empty string if the value cannot be determined. + """ + + platform_system: str + """ + The system/OS name, e.g. ``'Linux'``, ``'Windows'`` or ``'Java'``. + + An empty string if the value cannot be determined. + """ + + platform_version: str + """ + The system's release version, e.g. ``'#3 on degas'``. + + An empty string if the value cannot be determined. + """ + + python_full_version: str + """ + The Python version as string ``'major.minor.patchlevel'``. + + Note that unlike the Python :py:data:`sys.version`, this value will always include + the patchlevel (it defaults to 0). + """ + + platform_python_implementation: str + """ + A string identifying the Python implementation, e.g. ``'CPython'``. + """ + + python_version: str + """The Python version as string ``'major.minor'``.""" + + sys_platform: str + """ + This string contains a platform identifier that can be used to append + platform-specific components to :py:data:`sys.path`, for instance. + + For Unix systems, except on Linux and AIX, this is the lowercased OS name as + returned by ``uname -s`` with the first part of the version as returned by + ``uname -r`` appended, e.g. ``'sunos5'`` or ``'freebsd8'``, at the time when Python + was built. + """ + + +def _normalize_extra_values(results: Any) -> Any: + """ + Normalize extra values. 
+ """ + if isinstance(results[0], tuple): + lhs, op, rhs = results[0] + if isinstance(lhs, Variable) and lhs.value == "extra": + normalized_extra = canonicalize_name(rhs.value) + rhs = Value(normalized_extra) + elif isinstance(rhs, Variable) and rhs.value == "extra": + normalized_extra = canonicalize_name(lhs.value) + lhs = Value(normalized_extra) + results[0] = lhs, op, rhs + return results + + +def _format_marker( + marker: list[str] | MarkerAtom | str, first: bool | None = True +) -> str: + assert isinstance(marker, (list, tuple, str)) + + # Sometimes we have a structure like [[...]] which is a single item list + # where the single item is itself it's own list. In that case we want skip + # the rest of this function so that we don't get extraneous () on the + # outside. + if ( + isinstance(marker, list) + and len(marker) == 1 + and isinstance(marker[0], (list, tuple)) + ): + return _format_marker(marker[0]) + + if isinstance(marker, list): + inner = (_format_marker(m, first=False) for m in marker) + if first: + return " ".join(inner) + else: + return "(" + " ".join(inner) + ")" + elif isinstance(marker, tuple): + return " ".join([m.serialize() for m in marker]) + else: + return marker + + +_operators: dict[str, Operator] = { + "in": lambda lhs, rhs: lhs in rhs, + "not in": lambda lhs, rhs: lhs not in rhs, + "<": operator.lt, + "<=": operator.le, + "==": operator.eq, + "!=": operator.ne, + ">=": operator.ge, + ">": operator.gt, +} + + +def _eval_op(lhs: str, op: Op, rhs: str) -> bool: + try: + spec = Specifier("".join([op.serialize(), rhs])) + except InvalidSpecifier: + pass + else: + return spec.contains(lhs, prereleases=True) + + oper: Operator | None = _operators.get(op.serialize()) + if oper is None: + raise UndefinedComparison(f"Undefined {op!r} on {lhs!r} and {rhs!r}.") + + return oper(lhs, rhs) + + +def _normalize(*values: str, key: str) -> tuple[str, ...]: + # PEP 685 – Comparison of extra names for optional distribution dependencies + # 
https://peps.python.org/pep-0685/ + # > When comparing extra names, tools MUST normalize the names being + # > compared using the semantics outlined in PEP 503 for names + if key == "extra": + return tuple(canonicalize_name(v) for v in values) + + # other environment markers don't have such standards + return values + + +def _evaluate_markers(markers: MarkerList, environment: dict[str, str]) -> bool: + groups: list[list[bool]] = [[]] + + for marker in markers: + assert isinstance(marker, (list, tuple, str)) + + if isinstance(marker, list): + groups[-1].append(_evaluate_markers(marker, environment)) + elif isinstance(marker, tuple): + lhs, op, rhs = marker + + if isinstance(lhs, Variable): + environment_key = lhs.value + lhs_value = environment[environment_key] + rhs_value = rhs.value + else: + lhs_value = lhs.value + environment_key = rhs.value + rhs_value = environment[environment_key] + + lhs_value, rhs_value = _normalize(lhs_value, rhs_value, key=environment_key) + groups[-1].append(_eval_op(lhs_value, op, rhs_value)) + else: + assert marker in ["and", "or"] + if marker == "or": + groups.append([]) + + return any(all(item) for item in groups) + + +def format_full_version(info: sys._version_info) -> str: + version = f"{info.major}.{info.minor}.{info.micro}" + kind = info.releaselevel + if kind != "final": + version += kind[0] + str(info.serial) + return version + + +def default_environment() -> Environment: + iver = format_full_version(sys.implementation.version) + implementation_name = sys.implementation.name + return { + "implementation_name": implementation_name, + "implementation_version": iver, + "os_name": os.name, + "platform_machine": platform.machine(), + "platform_release": platform.release(), + "platform_system": platform.system(), + "platform_version": platform.version(), + "python_full_version": platform.python_version(), + "platform_python_implementation": platform.python_implementation(), + "python_version": 
".".join(platform.python_version_tuple()[:2]), + "sys_platform": sys.platform, + } + + +class Marker: + def __init__(self, marker: str) -> None: + # Note: We create a Marker object without calling this constructor in + # packaging.requirements.Requirement. If any additional logic is + # added here, make sure to mirror/adapt Requirement. + try: + self._markers = _normalize_extra_values(_parse_marker(marker)) + # The attribute `_markers` can be described in terms of a recursive type: + # MarkerList = List[Union[Tuple[Node, ...], str, MarkerList]] + # + # For example, the following expression: + # python_version > "3.6" or (python_version == "3.6" and os_name == "unix") + # + # is parsed into: + # [ + # (, ')>, ), + # 'and', + # [ + # (, , ), + # 'or', + # (, , ) + # ] + # ] + except ParserSyntaxError as e: + raise InvalidMarker(str(e)) from e + + def __str__(self) -> str: + return _format_marker(self._markers) + + def __repr__(self) -> str: + return f"" + + def __hash__(self) -> int: + return hash((self.__class__.__name__, str(self))) + + def __eq__(self, other: Any) -> bool: + if not isinstance(other, Marker): + return NotImplemented + + return str(self) == str(other) + + def evaluate(self, environment: dict[str, str] | None = None) -> bool: + """Evaluate a marker. + + Return the boolean from evaluating the given marker against the + environment. environment is an optional argument to override all or + part of the determined environment. + + The environment is determined from the current Python process. + """ + current_environment = cast("dict[str, str]", default_environment()) + current_environment["extra"] = "" + if environment is not None: + current_environment.update(environment) + # The API used to allow setting extra to None. We need to handle this + # case for backwards compatibility. 
+ if current_environment["extra"] is None: + current_environment["extra"] = "" + + return _evaluate_markers( + self._markers, _repair_python_full_version(current_environment) + ) + + +def _repair_python_full_version(env: dict[str, str]) -> dict[str, str]: + """ + Work around platform.python_version() returning something that is not PEP 440 + compliant for non-tagged Python builds. + """ + if env["python_full_version"].endswith("+"): + env["python_full_version"] += "local" + return env diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/metadata.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..721f411cfc44f6d24c13112e4246b5ad776a5e0b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/metadata.py @@ -0,0 +1,863 @@ +from __future__ import annotations + +import email.feedparser +import email.header +import email.message +import email.parser +import email.policy +import pathlib +import sys +import typing +from typing import ( + Any, + Callable, + Generic, + Literal, + TypedDict, + cast, +) + +from . import licenses, requirements, specifiers, utils +from . import version as version_module +from .licenses import NormalizedLicenseExpression + +T = typing.TypeVar("T") + + +if sys.version_info >= (3, 11): # pragma: no cover + ExceptionGroup = ExceptionGroup +else: # pragma: no cover + + class ExceptionGroup(Exception): + """A minimal implementation of :external:exc:`ExceptionGroup` from Python 3.11. + + If :external:exc:`ExceptionGroup` is already defined by Python itself, + that version is used instead. 
+ """ + + message: str + exceptions: list[Exception] + + def __init__(self, message: str, exceptions: list[Exception]) -> None: + self.message = message + self.exceptions = exceptions + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.message!r}, {self.exceptions!r})" + + +class InvalidMetadata(ValueError): + """A metadata field contains invalid data.""" + + field: str + """The name of the field that contains invalid data.""" + + def __init__(self, field: str, message: str) -> None: + self.field = field + super().__init__(message) + + +# The RawMetadata class attempts to make as few assumptions about the underlying +# serialization formats as possible. The idea is that as long as a serialization +# formats offer some very basic primitives in *some* way then we can support +# serializing to and from that format. +class RawMetadata(TypedDict, total=False): + """A dictionary of raw core metadata. + + Each field in core metadata maps to a key of this dictionary (when data is + provided). The key is lower-case and underscores are used instead of dashes + compared to the equivalent core metadata field. Any core metadata field that + can be specified multiple times or can hold multiple values in a single + field have a key with a plural name. See :class:`Metadata` whose attributes + match the keys of this dictionary. + + Core metadata fields that can be specified multiple times are stored as a + list or dict depending on which is appropriate for the field. Any fields + which hold multiple values in a single field are stored as a list. 
+ + """ + + # Metadata 1.0 - PEP 241 + metadata_version: str + name: str + version: str + platforms: list[str] + summary: str + description: str + keywords: list[str] + home_page: str + author: str + author_email: str + license: str + + # Metadata 1.1 - PEP 314 + supported_platforms: list[str] + download_url: str + classifiers: list[str] + requires: list[str] + provides: list[str] + obsoletes: list[str] + + # Metadata 1.2 - PEP 345 + maintainer: str + maintainer_email: str + requires_dist: list[str] + provides_dist: list[str] + obsoletes_dist: list[str] + requires_python: str + requires_external: list[str] + project_urls: dict[str, str] + + # Metadata 2.0 + # PEP 426 attempted to completely revamp the metadata format + # but got stuck without ever being able to build consensus on + # it and ultimately ended up withdrawn. + # + # However, a number of tools had started emitting METADATA with + # `2.0` Metadata-Version, so for historical reasons, this version + # was skipped. + + # Metadata 2.1 - PEP 566 + description_content_type: str + provides_extra: list[str] + + # Metadata 2.2 - PEP 643 + dynamic: list[str] + + # Metadata 2.3 - PEP 685 + # No new fields were added in PEP 685, just some edge case were + # tightened up to provide better interoptability. 
+ + # Metadata 2.4 - PEP 639 + license_expression: str + license_files: list[str] + + +_STRING_FIELDS = { + "author", + "author_email", + "description", + "description_content_type", + "download_url", + "home_page", + "license", + "license_expression", + "maintainer", + "maintainer_email", + "metadata_version", + "name", + "requires_python", + "summary", + "version", +} + +_LIST_FIELDS = { + "classifiers", + "dynamic", + "license_files", + "obsoletes", + "obsoletes_dist", + "platforms", + "provides", + "provides_dist", + "provides_extra", + "requires", + "requires_dist", + "requires_external", + "supported_platforms", +} + +_DICT_FIELDS = { + "project_urls", +} + + +def _parse_keywords(data: str) -> list[str]: + """Split a string of comma-separated keywords into a list of keywords.""" + return [k.strip() for k in data.split(",")] + + +def _parse_project_urls(data: list[str]) -> dict[str, str]: + """Parse a list of label/URL string pairings separated by a comma.""" + urls = {} + for pair in data: + # Our logic is slightly tricky here as we want to try and do + # *something* reasonable with malformed data. + # + # The main thing that we have to worry about, is data that does + # not have a ',' at all to split the label from the Value. There + # isn't a singular right answer here, and we will fail validation + # later on (if the caller is validating) so it doesn't *really* + # matter, but since the missing value has to be an empty str + # and our return value is dict[str, str], if we let the key + # be the missing value, then they'd have multiple '' values that + # overwrite each other in a accumulating dict. + # + # The other potentional issue is that it's possible to have the + # same label multiple times in the metadata, with no solid "right" + # answer with what to do in that case. As such, we'll do the only + # thing we can, which is treat the field as unparseable and add it + # to our list of unparsed fields. 
+ parts = [p.strip() for p in pair.split(",", 1)] + parts.extend([""] * (max(0, 2 - len(parts)))) # Ensure 2 items + + # TODO: The spec doesn't say anything about if the keys should be + # considered case sensitive or not... logically they should + # be case-preserving and case-insensitive, but doing that + # would open up more cases where we might have duplicate + # entries. + label, url = parts + if label in urls: + # The label already exists in our set of urls, so this field + # is unparseable, and we can just add the whole thing to our + # unparseable data and stop processing it. + raise KeyError("duplicate labels in project urls") + urls[label] = url + + return urls + + +def _get_payload(msg: email.message.Message, source: bytes | str) -> str: + """Get the body of the message.""" + # If our source is a str, then our caller has managed encodings for us, + # and we don't need to deal with it. + if isinstance(source, str): + payload = msg.get_payload() + assert isinstance(payload, str) + return payload + # If our source is a bytes, then we're managing the encoding and we need + # to deal with it. + else: + bpayload = msg.get_payload(decode=True) + assert isinstance(bpayload, bytes) + try: + return bpayload.decode("utf8", "strict") + except UnicodeDecodeError as exc: + raise ValueError("payload in an invalid encoding") from exc + + +# The various parse_FORMAT functions here are intended to be as lenient as +# possible in their parsing, while still returning a correctly typed +# RawMetadata. +# +# To aid in this, we also generally want to do as little touching of the +# data as possible, except where there are possibly some historic holdovers +# that make valid data awkward to work with. +# +# While this is a lower level, intermediate format than our ``Metadata`` +# class, some light touch ups can make a massive difference in usability. + +# Map METADATA fields to RawMetadata. 
+_EMAIL_TO_RAW_MAPPING = { + "author": "author", + "author-email": "author_email", + "classifier": "classifiers", + "description": "description", + "description-content-type": "description_content_type", + "download-url": "download_url", + "dynamic": "dynamic", + "home-page": "home_page", + "keywords": "keywords", + "license": "license", + "license-expression": "license_expression", + "license-file": "license_files", + "maintainer": "maintainer", + "maintainer-email": "maintainer_email", + "metadata-version": "metadata_version", + "name": "name", + "obsoletes": "obsoletes", + "obsoletes-dist": "obsoletes_dist", + "platform": "platforms", + "project-url": "project_urls", + "provides": "provides", + "provides-dist": "provides_dist", + "provides-extra": "provides_extra", + "requires": "requires", + "requires-dist": "requires_dist", + "requires-external": "requires_external", + "requires-python": "requires_python", + "summary": "summary", + "supported-platform": "supported_platforms", + "version": "version", +} +_RAW_TO_EMAIL_MAPPING = {raw: email for email, raw in _EMAIL_TO_RAW_MAPPING.items()} + + +def parse_email(data: bytes | str) -> tuple[RawMetadata, dict[str, list[str]]]: + """Parse a distribution's metadata stored as email headers (e.g. from ``METADATA``). + + This function returns a two-item tuple of dicts. The first dict is of + recognized fields from the core metadata specification. Fields that can be + parsed and translated into Python's built-in types are converted + appropriately. All other fields are left as-is. Fields that are allowed to + appear multiple times are stored as lists. + + The second dict contains all other fields from the metadata. This includes + any unrecognized fields. It also includes any fields which are expected to + be parsed into a built-in type but were not formatted appropriately. Finally, + any fields that are expected to appear only once but are repeated are + included in this dict. 
+ + """ + raw: dict[str, str | list[str] | dict[str, str]] = {} + unparsed: dict[str, list[str]] = {} + + if isinstance(data, str): + parsed = email.parser.Parser(policy=email.policy.compat32).parsestr(data) + else: + parsed = email.parser.BytesParser(policy=email.policy.compat32).parsebytes(data) + + # We have to wrap parsed.keys() in a set, because in the case of multiple + # values for a key (a list), the key will appear multiple times in the + # list of keys, but we're avoiding that by using get_all(). + for name in frozenset(parsed.keys()): + # Header names in RFC are case insensitive, so we'll normalize to all + # lower case to make comparisons easier. + name = name.lower() + + # We use get_all() here, even for fields that aren't multiple use, + # because otherwise someone could have e.g. two Name fields, and we + # would just silently ignore it rather than doing something about it. + headers = parsed.get_all(name) or [] + + # The way the email module works when parsing bytes is that it + # unconditionally decodes the bytes as ascii using the surrogateescape + # handler. When you pull that data back out (such as with get_all() ), + # it looks to see if the str has any surrogate escapes, and if it does + # it wraps it in a Header object instead of returning the string. + # + # As such, we'll look for those Header objects, and fix up the encoding. + value = [] + # Flag if we have run into any issues processing the headers, thus + # signalling that the data belongs in 'unparsed'. + valid_encoding = True + for h in headers: + # It's unclear if this can return more types than just a Header or + # a str, so we'll just assert here to make sure. + assert isinstance(h, (email.header.Header, str)) + + # If it's a header object, we need to do our little dance to get + # the real data out of it. 
In cases where there is invalid data + # we're going to end up with mojibake, but there's no obvious, good + # way around that without reimplementing parts of the Header object + # ourselves. + # + # That should be fine since, if mojibacked happens, this key is + # going into the unparsed dict anyways. + if isinstance(h, email.header.Header): + # The Header object stores it's data as chunks, and each chunk + # can be independently encoded, so we'll need to check each + # of them. + chunks: list[tuple[bytes, str | None]] = [] + for bin, encoding in email.header.decode_header(h): + try: + bin.decode("utf8", "strict") + except UnicodeDecodeError: + # Enable mojibake. + encoding = "latin1" + valid_encoding = False + else: + encoding = "utf8" + chunks.append((bin, encoding)) + + # Turn our chunks back into a Header object, then let that + # Header object do the right thing to turn them into a + # string for us. + value.append(str(email.header.make_header(chunks))) + # This is already a string, so just add it. + else: + value.append(h) + + # We've processed all of our values to get them into a list of str, + # but we may have mojibake data, in which case this is an unparsed + # field. + if not valid_encoding: + unparsed[name] = value + continue + + raw_name = _EMAIL_TO_RAW_MAPPING.get(name) + if raw_name is None: + # This is a bit of a weird situation, we've encountered a key that + # we don't know what it means, so we don't know whether it's meant + # to be a list or not. + # + # Since we can't really tell one way or another, we'll just leave it + # as a list, even though it may be a single item list, because that's + # what makes the most sense for email headers. + unparsed[name] = value + continue + + # If this is one of our string fields, then we'll check to see if our + # value is a list of a single item. If it is then we'll assume that + # it was emitted as a single string, and unwrap the str from inside + # the list. 
+ # + # If it's any other kind of data, then we haven't the faintest clue + # what we should parse it as, and we have to just add it to our list + # of unparsed stuff. + if raw_name in _STRING_FIELDS and len(value) == 1: + raw[raw_name] = value[0] + # If this is one of our list of string fields, then we can just assign + # the value, since email *only* has strings, and our get_all() call + # above ensures that this is a list. + elif raw_name in _LIST_FIELDS: + raw[raw_name] = value + # Special Case: Keywords + # The keywords field is implemented in the metadata spec as a str, + # but it conceptually is a list of strings, and is serialized using + # ", ".join(keywords), so we'll do some light data massaging to turn + # this into what it logically is. + elif raw_name == "keywords" and len(value) == 1: + raw[raw_name] = _parse_keywords(value[0]) + # Special Case: Project-URL + # The project urls is implemented in the metadata spec as a list of + # specially-formatted strings that represent a key and a value, which + # is fundamentally a mapping, however the email format doesn't support + # mappings in a sane way, so it was crammed into a list of strings + # instead. + # + # We will do a little light data massaging to turn this into a map as + # it logically should be. + elif raw_name == "project_urls": + try: + raw[raw_name] = _parse_project_urls(value) + except KeyError: + unparsed[name] = value + # Nothing that we've done has managed to parse this, so it'll just + # throw it in our unparseable data and move on. + else: + unparsed[name] = value + + # We need to support getting the Description from the message payload in + # addition to getting it from the the headers. This does mean, though, there + # is the possibility of it being set both ways, in which case we put both + # in 'unparsed' since we don't know which is right. 
+ try: + payload = _get_payload(parsed, data) + except ValueError: + unparsed.setdefault("description", []).append( + parsed.get_payload(decode=isinstance(data, bytes)) # type: ignore[call-overload] + ) + else: + if payload: + # Check to see if we've already got a description, if so then both + # it, and this body move to unparseable. + if "description" in raw: + description_header = cast(str, raw.pop("description")) + unparsed.setdefault("description", []).extend( + [description_header, payload] + ) + elif "description" in unparsed: + unparsed["description"].append(payload) + else: + raw["description"] = payload + + # We need to cast our `raw` to a metadata, because a TypedDict only support + # literal key names, but we're computing our key names on purpose, but the + # way this function is implemented, our `TypedDict` can only have valid key + # names. + return cast(RawMetadata, raw), unparsed + + +_NOT_FOUND = object() + + +# Keep the two values in sync. +_VALID_METADATA_VERSIONS = ["1.0", "1.1", "1.2", "2.1", "2.2", "2.3", "2.4"] +_MetadataVersion = Literal["1.0", "1.1", "1.2", "2.1", "2.2", "2.3", "2.4"] + +_REQUIRED_ATTRS = frozenset(["metadata_version", "name", "version"]) + + +class _Validator(Generic[T]): + """Validate a metadata field. + + All _process_*() methods correspond to a core metadata field. The method is + called with the field's raw value. If the raw value is valid it is returned + in its "enriched" form (e.g. ``version.Version`` for the ``Version`` field). + If the raw value is invalid, :exc:`InvalidMetadata` is raised (with a cause + as appropriate). 
+ """ + + name: str + raw_name: str + added: _MetadataVersion + + def __init__( + self, + *, + added: _MetadataVersion = "1.0", + ) -> None: + self.added = added + + def __set_name__(self, _owner: Metadata, name: str) -> None: + self.name = name + self.raw_name = _RAW_TO_EMAIL_MAPPING[name] + + def __get__(self, instance: Metadata, _owner: type[Metadata]) -> T: + # With Python 3.8, the caching can be replaced with functools.cached_property(). + # No need to check the cache as attribute lookup will resolve into the + # instance's __dict__ before __get__ is called. + cache = instance.__dict__ + value = instance._raw.get(self.name) + + # To make the _process_* methods easier, we'll check if the value is None + # and if this field is NOT a required attribute, and if both of those + # things are true, we'll skip the the converter. This will mean that the + # converters never have to deal with the None union. + if self.name in _REQUIRED_ATTRS or value is not None: + try: + converter: Callable[[Any], T] = getattr(self, f"_process_{self.name}") + except AttributeError: + pass + else: + value = converter(value) + + cache[self.name] = value + try: + del instance._raw[self.name] # type: ignore[misc] + except KeyError: + pass + + return cast(T, value) + + def _invalid_metadata( + self, msg: str, cause: Exception | None = None + ) -> InvalidMetadata: + exc = InvalidMetadata( + self.raw_name, msg.format_map({"field": repr(self.raw_name)}) + ) + exc.__cause__ = cause + return exc + + def _process_metadata_version(self, value: str) -> _MetadataVersion: + # Implicitly makes Metadata-Version required. + if value not in _VALID_METADATA_VERSIONS: + raise self._invalid_metadata(f"{value!r} is not a valid metadata version") + return cast(_MetadataVersion, value) + + def _process_name(self, value: str) -> str: + if not value: + raise self._invalid_metadata("{field} is a required field") + # Validate the name as a side-effect. 
+ try: + utils.canonicalize_name(value, validate=True) + except utils.InvalidName as exc: + raise self._invalid_metadata( + f"{value!r} is invalid for {{field}}", cause=exc + ) from exc + else: + return value + + def _process_version(self, value: str) -> version_module.Version: + if not value: + raise self._invalid_metadata("{field} is a required field") + try: + return version_module.parse(value) + except version_module.InvalidVersion as exc: + raise self._invalid_metadata( + f"{value!r} is invalid for {{field}}", cause=exc + ) from exc + + def _process_summary(self, value: str) -> str: + """Check the field contains no newlines.""" + if "\n" in value: + raise self._invalid_metadata("{field} must be a single line") + return value + + def _process_description_content_type(self, value: str) -> str: + content_types = {"text/plain", "text/x-rst", "text/markdown"} + message = email.message.EmailMessage() + message["content-type"] = value + + content_type, parameters = ( + # Defaults to `text/plain` if parsing failed. + message.get_content_type().lower(), + message["content-type"].params, + ) + # Check if content-type is valid or defaulted to `text/plain` and thus was + # not parseable. + if content_type not in content_types or content_type not in value.lower(): + raise self._invalid_metadata( + f"{{field}} must be one of {list(content_types)}, not {value!r}" + ) + + charset = parameters.get("charset", "UTF-8") + if charset != "UTF-8": + raise self._invalid_metadata( + f"{{field}} can only specify the UTF-8 charset, not {list(charset)}" + ) + + markdown_variants = {"GFM", "CommonMark"} + variant = parameters.get("variant", "GFM") # Use an acceptable default. 
+ if content_type == "text/markdown" and variant not in markdown_variants: + raise self._invalid_metadata( + f"valid Markdown variants for {{field}} are {list(markdown_variants)}, " + f"not {variant!r}", + ) + return value + + def _process_dynamic(self, value: list[str]) -> list[str]: + for dynamic_field in map(str.lower, value): + if dynamic_field in {"name", "version", "metadata-version"}: + raise self._invalid_metadata( + f"{dynamic_field!r} is not allowed as a dynamic field" + ) + elif dynamic_field not in _EMAIL_TO_RAW_MAPPING: + raise self._invalid_metadata( + f"{dynamic_field!r} is not a valid dynamic field" + ) + return list(map(str.lower, value)) + + def _process_provides_extra( + self, + value: list[str], + ) -> list[utils.NormalizedName]: + normalized_names = [] + try: + for name in value: + normalized_names.append(utils.canonicalize_name(name, validate=True)) + except utils.InvalidName as exc: + raise self._invalid_metadata( + f"{name!r} is invalid for {{field}}", cause=exc + ) from exc + else: + return normalized_names + + def _process_requires_python(self, value: str) -> specifiers.SpecifierSet: + try: + return specifiers.SpecifierSet(value) + except specifiers.InvalidSpecifier as exc: + raise self._invalid_metadata( + f"{value!r} is invalid for {{field}}", cause=exc + ) from exc + + def _process_requires_dist( + self, + value: list[str], + ) -> list[requirements.Requirement]: + reqs = [] + try: + for req in value: + reqs.append(requirements.Requirement(req)) + except requirements.InvalidRequirement as exc: + raise self._invalid_metadata( + f"{req!r} is invalid for {{field}}", cause=exc + ) from exc + else: + return reqs + + def _process_license_expression( + self, value: str + ) -> NormalizedLicenseExpression | None: + try: + return licenses.canonicalize_license_expression(value) + except ValueError as exc: + raise self._invalid_metadata( + f"{value!r} is invalid for {{field}}", cause=exc + ) from exc + + def _process_license_files(self, value: 
list[str]) -> list[str]: + paths = [] + for path in value: + if ".." in path: + raise self._invalid_metadata( + f"{path!r} is invalid for {{field}}, " + "parent directory indicators are not allowed" + ) + if "*" in path: + raise self._invalid_metadata( + f"{path!r} is invalid for {{field}}, paths must be resolved" + ) + if ( + pathlib.PurePosixPath(path).is_absolute() + or pathlib.PureWindowsPath(path).is_absolute() + ): + raise self._invalid_metadata( + f"{path!r} is invalid for {{field}}, paths must be relative" + ) + if pathlib.PureWindowsPath(path).as_posix() != path: + raise self._invalid_metadata( + f"{path!r} is invalid for {{field}}, " + "paths must use '/' delimiter" + ) + paths.append(path) + return paths + + +class Metadata: + """Representation of distribution metadata. + + Compared to :class:`RawMetadata`, this class provides objects representing + metadata fields instead of only using built-in types. Any invalid metadata + will cause :exc:`InvalidMetadata` to be raised (with a + :py:attr:`~BaseException.__cause__` attribute as appropriate). + """ + + _raw: RawMetadata + + @classmethod + def from_raw(cls, data: RawMetadata, *, validate: bool = True) -> Metadata: + """Create an instance from :class:`RawMetadata`. + + If *validate* is true, all metadata will be validated. All exceptions + related to validation will be gathered and raised as an :class:`ExceptionGroup`. + """ + ins = cls() + ins._raw = data.copy() # Mutations occur due to caching enriched values. + + if validate: + exceptions: list[Exception] = [] + try: + metadata_version = ins.metadata_version + metadata_age = _VALID_METADATA_VERSIONS.index(metadata_version) + except InvalidMetadata as metadata_version_exc: + exceptions.append(metadata_version_exc) + metadata_version = None + + # Make sure to check for the fields that are present, the required + # fields (so their absence can be reported). 
+ fields_to_check = frozenset(ins._raw) | _REQUIRED_ATTRS + # Remove fields that have already been checked. + fields_to_check -= {"metadata_version"} + + for key in fields_to_check: + try: + if metadata_version: + # Can't use getattr() as that triggers descriptor protocol which + # will fail due to no value for the instance argument. + try: + field_metadata_version = cls.__dict__[key].added + except KeyError: + exc = InvalidMetadata(key, f"unrecognized field: {key!r}") + exceptions.append(exc) + continue + field_age = _VALID_METADATA_VERSIONS.index( + field_metadata_version + ) + if field_age > metadata_age: + field = _RAW_TO_EMAIL_MAPPING[key] + exc = InvalidMetadata( + field, + f"{field} introduced in metadata version " + f"{field_metadata_version}, not {metadata_version}", + ) + exceptions.append(exc) + continue + getattr(ins, key) + except InvalidMetadata as exc: + exceptions.append(exc) + + if exceptions: + raise ExceptionGroup("invalid metadata", exceptions) + + return ins + + @classmethod + def from_email(cls, data: bytes | str, *, validate: bool = True) -> Metadata: + """Parse metadata from email headers. + + If *validate* is true, the metadata will be validated. All exceptions + related to validation will be gathered and raised as an :class:`ExceptionGroup`. 
+ """ + raw, unparsed = parse_email(data) + + if validate: + exceptions: list[Exception] = [] + for unparsed_key in unparsed: + if unparsed_key in _EMAIL_TO_RAW_MAPPING: + message = f"{unparsed_key!r} has invalid data" + else: + message = f"unrecognized field: {unparsed_key!r}" + exceptions.append(InvalidMetadata(unparsed_key, message)) + + if exceptions: + raise ExceptionGroup("unparsed", exceptions) + + try: + return cls.from_raw(raw, validate=validate) + except ExceptionGroup as exc_group: + raise ExceptionGroup( + "invalid or unparsed metadata", exc_group.exceptions + ) from None + + metadata_version: _Validator[_MetadataVersion] = _Validator() + """:external:ref:`core-metadata-metadata-version` + (required; validated to be a valid metadata version)""" + # `name` is not normalized/typed to NormalizedName so as to provide access to + # the original/raw name. + name: _Validator[str] = _Validator() + """:external:ref:`core-metadata-name` + (required; validated using :func:`~packaging.utils.canonicalize_name` and its + *validate* parameter)""" + version: _Validator[version_module.Version] = _Validator() + """:external:ref:`core-metadata-version` (required)""" + dynamic: _Validator[list[str] | None] = _Validator( + added="2.2", + ) + """:external:ref:`core-metadata-dynamic` + (validated against core metadata field names and lowercased)""" + platforms: _Validator[list[str] | None] = _Validator() + """:external:ref:`core-metadata-platform`""" + supported_platforms: _Validator[list[str] | None] = _Validator(added="1.1") + """:external:ref:`core-metadata-supported-platform`""" + summary: _Validator[str | None] = _Validator() + """:external:ref:`core-metadata-summary` (validated to contain no newlines)""" + description: _Validator[str | None] = _Validator() # TODO 2.1: can be in body + """:external:ref:`core-metadata-description`""" + description_content_type: _Validator[str | None] = _Validator(added="2.1") + """:external:ref:`core-metadata-description-content-type` 
(validated)""" + keywords: _Validator[list[str] | None] = _Validator() + """:external:ref:`core-metadata-keywords`""" + home_page: _Validator[str | None] = _Validator() + """:external:ref:`core-metadata-home-page`""" + download_url: _Validator[str | None] = _Validator(added="1.1") + """:external:ref:`core-metadata-download-url`""" + author: _Validator[str | None] = _Validator() + """:external:ref:`core-metadata-author`""" + author_email: _Validator[str | None] = _Validator() + """:external:ref:`core-metadata-author-email`""" + maintainer: _Validator[str | None] = _Validator(added="1.2") + """:external:ref:`core-metadata-maintainer`""" + maintainer_email: _Validator[str | None] = _Validator(added="1.2") + """:external:ref:`core-metadata-maintainer-email`""" + license: _Validator[str | None] = _Validator() + """:external:ref:`core-metadata-license`""" + license_expression: _Validator[NormalizedLicenseExpression | None] = _Validator( + added="2.4" + ) + """:external:ref:`core-metadata-license-expression`""" + license_files: _Validator[list[str] | None] = _Validator(added="2.4") + """:external:ref:`core-metadata-license-file`""" + classifiers: _Validator[list[str] | None] = _Validator(added="1.1") + """:external:ref:`core-metadata-classifier`""" + requires_dist: _Validator[list[requirements.Requirement] | None] = _Validator( + added="1.2" + ) + """:external:ref:`core-metadata-requires-dist`""" + requires_python: _Validator[specifiers.SpecifierSet | None] = _Validator( + added="1.2" + ) + """:external:ref:`core-metadata-requires-python`""" + # Because `Requires-External` allows for non-PEP 440 version specifiers, we + # don't do any processing on the values. 
+ requires_external: _Validator[list[str] | None] = _Validator(added="1.2") + """:external:ref:`core-metadata-requires-external`""" + project_urls: _Validator[dict[str, str] | None] = _Validator(added="1.2") + """:external:ref:`core-metadata-project-url`""" + # PEP 685 lets us raise an error if an extra doesn't pass `Name` validation + # regardless of metadata version. + provides_extra: _Validator[list[utils.NormalizedName] | None] = _Validator( + added="2.1", + ) + """:external:ref:`core-metadata-provides-extra`""" + provides_dist: _Validator[list[str] | None] = _Validator(added="1.2") + """:external:ref:`core-metadata-provides-dist`""" + obsoletes_dist: _Validator[list[str] | None] = _Validator(added="1.2") + """:external:ref:`core-metadata-obsoletes-dist`""" + requires: _Validator[list[str] | None] = _Validator(added="1.1") + """``Requires`` (deprecated)""" + provides: _Validator[list[str] | None] = _Validator(added="1.1") + """``Provides`` (deprecated)""" + obsoletes: _Validator[list[str] | None] = _Validator(added="1.1") + """``Obsoletes`` (deprecated)""" diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/py.typed b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/specifiers.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/specifiers.py new file mode 100644 index 0000000000000000000000000000000000000000..b30926af8bf4f47efe98eea44d5ded4cb6f7e07d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/specifiers.py @@ -0,0 +1,1020 @@ +# This file is dual licensed under the terms of the Apache License, Version +# 2.0, and the BSD License. See the LICENSE file in the root of this repository +# for complete details. +""" +.. 
testsetup:: + + from packaging.specifiers import Specifier, SpecifierSet, InvalidSpecifier + from packaging.version import Version +""" + +from __future__ import annotations + +import abc +import itertools +import re +from typing import Callable, Iterable, Iterator, TypeVar, Union + +from .utils import canonicalize_version +from .version import Version + +UnparsedVersion = Union[Version, str] +UnparsedVersionVar = TypeVar("UnparsedVersionVar", bound=UnparsedVersion) +CallableOperator = Callable[[Version, str], bool] + + +def _coerce_version(version: UnparsedVersion) -> Version: + if not isinstance(version, Version): + version = Version(version) + return version + + +class InvalidSpecifier(ValueError): + """ + Raised when attempting to create a :class:`Specifier` with a specifier + string that is invalid. + + >>> Specifier("lolwat") + Traceback (most recent call last): + ... + packaging.specifiers.InvalidSpecifier: Invalid specifier: 'lolwat' + """ + + +class BaseSpecifier(metaclass=abc.ABCMeta): + @abc.abstractmethod + def __str__(self) -> str: + """ + Returns the str representation of this Specifier-like object. This + should be representative of the Specifier itself. + """ + + @abc.abstractmethod + def __hash__(self) -> int: + """ + Returns a hash value for this Specifier-like object. + """ + + @abc.abstractmethod + def __eq__(self, other: object) -> bool: + """ + Returns a boolean representing whether or not the two Specifier-like + objects are equal. + + :param other: The other object to check against. + """ + + @property + @abc.abstractmethod + def prereleases(self) -> bool | None: + """Whether or not pre-releases as a whole are allowed. + + This can be set to either ``True`` or ``False`` to explicitly enable or disable + prereleases or it can be set to ``None`` (the default) to use default semantics. + """ + + @prereleases.setter + def prereleases(self, value: bool) -> None: + """Setter for :attr:`prereleases`. + + :param value: The value to set. 
+ """ + + @abc.abstractmethod + def contains(self, item: str, prereleases: bool | None = None) -> bool: + """ + Determines if the given item is contained within this specifier. + """ + + @abc.abstractmethod + def filter( + self, iterable: Iterable[UnparsedVersionVar], prereleases: bool | None = None + ) -> Iterator[UnparsedVersionVar]: + """ + Takes an iterable of items and filters them so that only items which + are contained within this specifier are allowed in it. + """ + + +class Specifier(BaseSpecifier): + """This class abstracts handling of version specifiers. + + .. tip:: + + It is generally not required to instantiate this manually. You should instead + prefer to work with :class:`SpecifierSet` instead, which can parse + comma-separated version specifiers (which is what package metadata contains). + """ + + _operator_regex_str = r""" + (?P(~=|==|!=|<=|>=|<|>|===)) + """ + _version_regex_str = r""" + (?P + (?: + # The identity operators allow for an escape hatch that will + # do an exact string match of the version you wish to install. + # This will not be parsed by PEP 440 and we cannot determine + # any semantic meaning from it. This operator is discouraged + # but included entirely as an escape hatch. + (?<====) # Only match for the identity operator + \s* + [^\s;)]* # The arbitrary version can be just about anything, + # we match everything except for whitespace, a + # semi-colon for marker support, and a closing paren + # since versions can be enclosed in them. + ) + | + (?: + # The (non)equality operators allow for wild card and local + # versions to be specified so we have to define these two + # operators separately to enable that. + (?<===|!=) # Only match for equals and not equals + + \s* + v? + (?:[0-9]+!)? # epoch + [0-9]+(?:\.[0-9]+)* # release + + # You cannot use a wild card and a pre-release, post-release, a dev or + # local version together so group them with a | and make them optional. 
+ (?: + \.\* # Wild card syntax of .* + | + (?: # pre release + [-_\.]? + (alpha|beta|preview|pre|a|b|c|rc) + [-_\.]? + [0-9]* + )? + (?: # post release + (?:-[0-9]+)|(?:[-_\.]?(post|rev|r)[-_\.]?[0-9]*) + )? + (?:[-_\.]?dev[-_\.]?[0-9]*)? # dev release + (?:\+[a-z0-9]+(?:[-_\.][a-z0-9]+)*)? # local + )? + ) + | + (?: + # The compatible operator requires at least two digits in the + # release segment. + (?<=~=) # Only match for the compatible operator + + \s* + v? + (?:[0-9]+!)? # epoch + [0-9]+(?:\.[0-9]+)+ # release (We have a + instead of a *) + (?: # pre release + [-_\.]? + (alpha|beta|preview|pre|a|b|c|rc) + [-_\.]? + [0-9]* + )? + (?: # post release + (?:-[0-9]+)|(?:[-_\.]?(post|rev|r)[-_\.]?[0-9]*) + )? + (?:[-_\.]?dev[-_\.]?[0-9]*)? # dev release + ) + | + (?: + # All other operators only allow a sub set of what the + # (non)equality operators do. Specifically they do not allow + # local versions to be specified nor do they allow the prefix + # matching wild cards. + (?=": "greater_than_equal", + "<": "less_than", + ">": "greater_than", + "===": "arbitrary", + } + + def __init__(self, spec: str = "", prereleases: bool | None = None) -> None: + """Initialize a Specifier instance. + + :param spec: + The string representation of a specifier which will be parsed and + normalized before use. + :param prereleases: + This tells the specifier if it should accept prerelease versions if + applicable or not. The default of ``None`` will autodetect it from the + given specifiers. + :raises InvalidSpecifier: + If the given specifier is invalid (i.e. bad syntax). 
+ """ + match = self._regex.search(spec) + if not match: + raise InvalidSpecifier(f"Invalid specifier: {spec!r}") + + self._spec: tuple[str, str] = ( + match.group("operator").strip(), + match.group("version").strip(), + ) + + # Store whether or not this Specifier should accept prereleases + self._prereleases = prereleases + + # https://github.com/python/mypy/pull/13475#pullrequestreview-1079784515 + @property # type: ignore[override] + def prereleases(self) -> bool: + # If there is an explicit prereleases set for this, then we'll just + # blindly use that. + if self._prereleases is not None: + return self._prereleases + + # Look at all of our specifiers and determine if they are inclusive + # operators, and if they are if they are including an explicit + # prerelease. + operator, version = self._spec + if operator in ["==", ">=", "<=", "~=", "===", ">", "<"]: + # The == specifier can include a trailing .*, if it does we + # want to remove before parsing. + if operator == "==" and version.endswith(".*"): + version = version[:-2] + + # Parse the version, and if it is a pre-release than this + # specifier allows pre-releases. + if Version(version).is_prerelease: + return True + + return False + + @prereleases.setter + def prereleases(self, value: bool) -> None: + self._prereleases = value + + @property + def operator(self) -> str: + """The operator of this specifier. + + >>> Specifier("==1.2.3").operator + '==' + """ + return self._spec[0] + + @property + def version(self) -> str: + """The version of this specifier. + + >>> Specifier("==1.2.3").version + '1.2.3' + """ + return self._spec[1] + + def __repr__(self) -> str: + """A representation of the Specifier that shows all internal state. 
+ + >>> Specifier('>=1.0.0') + =1.0.0')> + >>> Specifier('>=1.0.0', prereleases=False) + =1.0.0', prereleases=False)> + >>> Specifier('>=1.0.0', prereleases=True) + =1.0.0', prereleases=True)> + """ + pre = ( + f", prereleases={self.prereleases!r}" + if self._prereleases is not None + else "" + ) + + return f"<{self.__class__.__name__}({str(self)!r}{pre})>" + + def __str__(self) -> str: + """A string representation of the Specifier that can be round-tripped. + + >>> str(Specifier('>=1.0.0')) + '>=1.0.0' + >>> str(Specifier('>=1.0.0', prereleases=False)) + '>=1.0.0' + """ + return "{}{}".format(*self._spec) + + @property + def _canonical_spec(self) -> tuple[str, str]: + canonical_version = canonicalize_version( + self._spec[1], + strip_trailing_zero=(self._spec[0] != "~="), + ) + return self._spec[0], canonical_version + + def __hash__(self) -> int: + return hash(self._canonical_spec) + + def __eq__(self, other: object) -> bool: + """Whether or not the two Specifier-like objects are equal. + + :param other: The other object to check against. + + The value of :attr:`prereleases` is ignored. + + >>> Specifier("==1.2.3") == Specifier("== 1.2.3.0") + True + >>> (Specifier("==1.2.3", prereleases=False) == + ... 
Specifier("==1.2.3", prereleases=True)) + True + >>> Specifier("==1.2.3") == "==1.2.3" + True + >>> Specifier("==1.2.3") == Specifier("==1.2.4") + False + >>> Specifier("==1.2.3") == Specifier("~=1.2.3") + False + """ + if isinstance(other, str): + try: + other = self.__class__(str(other)) + except InvalidSpecifier: + return NotImplemented + elif not isinstance(other, self.__class__): + return NotImplemented + + return self._canonical_spec == other._canonical_spec + + def _get_operator(self, op: str) -> CallableOperator: + operator_callable: CallableOperator = getattr( + self, f"_compare_{self._operators[op]}" + ) + return operator_callable + + def _compare_compatible(self, prospective: Version, spec: str) -> bool: + # Compatible releases have an equivalent combination of >= and ==. That + # is that ~=2.2 is equivalent to >=2.2,==2.*. This allows us to + # implement this in terms of the other specifiers instead of + # implementing it ourselves. The only thing we need to do is construct + # the other specifiers. + + # We want everything but the last item in the version, but we want to + # ignore suffix segments. + prefix = _version_join( + list(itertools.takewhile(_is_not_suffix, _version_split(spec)))[:-1] + ) + + # Add the prefix notation to the end of our string + prefix += ".*" + + return self._get_operator(">=")(prospective, spec) and self._get_operator("==")( + prospective, prefix + ) + + def _compare_equal(self, prospective: Version, spec: str) -> bool: + # We need special logic to handle prefix matching + if spec.endswith(".*"): + # In the case of prefix matching we want to ignore local segment. 
+ normalized_prospective = canonicalize_version( + prospective.public, strip_trailing_zero=False + ) + # Get the normalized version string ignoring the trailing .* + normalized_spec = canonicalize_version(spec[:-2], strip_trailing_zero=False) + # Split the spec out by bangs and dots, and pretend that there is + # an implicit dot in between a release segment and a pre-release segment. + split_spec = _version_split(normalized_spec) + + # Split the prospective version out by bangs and dots, and pretend + # that there is an implicit dot in between a release segment and + # a pre-release segment. + split_prospective = _version_split(normalized_prospective) + + # 0-pad the prospective version before shortening it to get the correct + # shortened version. + padded_prospective, _ = _pad_version(split_prospective, split_spec) + + # Shorten the prospective version to be the same length as the spec + # so that we can determine if the specifier is a prefix of the + # prospective version or not. + shortened_prospective = padded_prospective[: len(split_spec)] + + return shortened_prospective == split_spec + else: + # Convert our spec string into a Version + spec_version = Version(spec) + + # If the specifier does not have a local segment, then we want to + # act as if the prospective version also does not have a local + # segment. + if not spec_version.local: + prospective = Version(prospective.public) + + return prospective == spec_version + + def _compare_not_equal(self, prospective: Version, spec: str) -> bool: + return not self._compare_equal(prospective, spec) + + def _compare_less_than_equal(self, prospective: Version, spec: str) -> bool: + # NB: Local version identifiers are NOT permitted in the version + # specifier, so local version labels can be universally removed from + # the prospective version. 
+ return Version(prospective.public) <= Version(spec) + + def _compare_greater_than_equal(self, prospective: Version, spec: str) -> bool: + # NB: Local version identifiers are NOT permitted in the version + # specifier, so local version labels can be universally removed from + # the prospective version. + return Version(prospective.public) >= Version(spec) + + def _compare_less_than(self, prospective: Version, spec_str: str) -> bool: + # Convert our spec to a Version instance, since we'll want to work with + # it as a version. + spec = Version(spec_str) + + # Check to see if the prospective version is less than the spec + # version. If it's not we can short circuit and just return False now + # instead of doing extra unneeded work. + if not prospective < spec: + return False + + # This special case is here so that, unless the specifier itself + # includes is a pre-release version, that we do not accept pre-release + # versions for the version mentioned in the specifier (e.g. <3.1 should + # not match 3.1.dev0, but should match 3.0.dev0). + if not spec.is_prerelease and prospective.is_prerelease: + if Version(prospective.base_version) == Version(spec.base_version): + return False + + # If we've gotten to here, it means that prospective version is both + # less than the spec version *and* it's not a pre-release of the same + # version in the spec. + return True + + def _compare_greater_than(self, prospective: Version, spec_str: str) -> bool: + # Convert our spec to a Version instance, since we'll want to work with + # it as a version. + spec = Version(spec_str) + + # Check to see if the prospective version is greater than the spec + # version. If it's not we can short circuit and just return False now + # instead of doing extra unneeded work. 
+ if not prospective > spec: + return False + + # This special case is here so that, unless the specifier itself + # includes is a post-release version, that we do not accept + # post-release versions for the version mentioned in the specifier + # (e.g. >3.1 should not match 3.0.post0, but should match 3.2.post0). + if not spec.is_postrelease and prospective.is_postrelease: + if Version(prospective.base_version) == Version(spec.base_version): + return False + + # Ensure that we do not allow a local version of the version mentioned + # in the specifier, which is technically greater than, to match. + if prospective.local is not None: + if Version(prospective.base_version) == Version(spec.base_version): + return False + + # If we've gotten to here, it means that prospective version is both + # greater than the spec version *and* it's not a pre-release of the + # same version in the spec. + return True + + def _compare_arbitrary(self, prospective: Version, spec: str) -> bool: + return str(prospective).lower() == str(spec).lower() + + def __contains__(self, item: str | Version) -> bool: + """Return whether or not the item is contained in this specifier. + + :param item: The item to check for. + + This is used for the ``in`` operator and behaves the same as + :meth:`contains` with no ``prereleases`` argument passed. + + >>> "1.2.3" in Specifier(">=1.2.3") + True + >>> Version("1.2.3") in Specifier(">=1.2.3") + True + >>> "1.0.0" in Specifier(">=1.2.3") + False + >>> "1.3.0a1" in Specifier(">=1.2.3") + False + >>> "1.3.0a1" in Specifier(">=1.2.3", prereleases=True) + True + """ + return self.contains(item) + + def contains(self, item: UnparsedVersion, prereleases: bool | None = None) -> bool: + """Return whether or not the item is contained in this specifier. + + :param item: + The item to check for, which can be a version string or a + :class:`Version` instance. + :param prereleases: + Whether or not to match prereleases with this Specifier. 
def filter(
    self, iterable: Iterable[UnparsedVersionVar], prereleases: bool | None = None
) -> Iterator[UnparsedVersionVar]:
    """Lazily yield the items of ``iterable`` that satisfy this specifier.

    :param iterable:
        Version strings and/or :class:`Version` instances to test. Items are
        yielded unchanged (not as parsed versions).
    :param prereleases:
        ``True`` yields matching prereleases in place, ``False`` never
        matches them. With ``None`` (the default) a matching prerelease is
        yielded in place only if :attr:`prereleases` is truthy; otherwise it
        is held back and emitted at the end, and only when nothing else was
        yielded.

    This implements the :pep:`440` rule that a prerelease SHOULD be accepted
    if no other version matches the specifier.

    >>> list(Specifier(">=1.2.3").filter(["1.2", "1.3", "1.5a1"]))
    ['1.3']
    >>> list(Specifier(">=1.2.3").filter(["1.2", "1.5a1"]))
    ['1.5a1']
    >>> list(Specifier(">=1.2.3").filter(["1.3", "1.5a1"], prereleases=True))
    ['1.3', '1.5a1']
    """
    # Without an explicit override, match prereleases too so that they can be
    # kept as a fallback; whether they are emitted is decided below.
    match_prereleases = True if prereleases is None else prereleases

    emitted_any = False
    held_back = []

    for candidate in iterable:
        parsed = _coerce_version(candidate)
        if not self.contains(parsed, prereleases=match_prereleases):
            continue

        if not parsed.is_prerelease or prereleases or self.prereleases:
            # A final release, or prereleases were wanted from the start.
            emitted_any = True
            yield candidate
        else:
            # A matching prerelease nobody asked for: keep it in reserve.
            held_back.append(candidate)

    # Fall back to the reserved prereleases only when nothing else matched.
    if not emitted_any:
        yield from held_back
The logic does + not attempt to retain the original version string, so joining the + components back with :func:`_version_join` may not produce the original + version string. + """ + result: list[str] = [] + + epoch, _, rest = version.rpartition("!") + result.append(epoch or "0") + + for item in rest.split("."): + match = _prefix_regex.search(item) + if match: + result.extend(match.groups()) + else: + result.append(item) + return result + + +def _version_join(components: list[str]) -> str: + """Join split version components into a version string. + + This function assumes the input came from :func:`_version_split`, where the + first component must be the epoch (either empty or numeric), and all other + components numeric. + """ + epoch, *rest = components + return f"{epoch}!{'.'.join(rest)}" + + +def _is_not_suffix(segment: str) -> bool: + return not any( + segment.startswith(prefix) for prefix in ("dev", "a", "b", "rc", "post") + ) + + +def _pad_version(left: list[str], right: list[str]) -> tuple[list[str], list[str]]: + left_split, right_split = [], [] + + # Get the release segment of our versions + left_split.append(list(itertools.takewhile(lambda x: x.isdigit(), left))) + right_split.append(list(itertools.takewhile(lambda x: x.isdigit(), right))) + + # Get the rest of our versions + left_split.append(left[len(left_split[0]) :]) + right_split.append(right[len(right_split[0]) :]) + + # Insert our padding + left_split.insert(1, ["0"] * max(0, len(right_split[0]) - len(left_split[0]))) + right_split.insert(1, ["0"] * max(0, len(left_split[0]) - len(right_split[0]))) + + return ( + list(itertools.chain.from_iterable(left_split)), + list(itertools.chain.from_iterable(right_split)), + ) + + +class SpecifierSet(BaseSpecifier): + """This class abstracts handling of a set of version specifiers. + + It can be passed a single specifier (``>=3.0``), a comma-separated list of + specifiers (``>=3.0,!=3.1``), or no specifier at all. 
+ """ + + def __init__( + self, + specifiers: str | Iterable[Specifier] = "", + prereleases: bool | None = None, + ) -> None: + """Initialize a SpecifierSet instance. + + :param specifiers: + The string representation of a specifier or a comma-separated list of + specifiers which will be parsed and normalized before use. + May also be an iterable of ``Specifier`` instances, which will be used + as is. + :param prereleases: + This tells the SpecifierSet if it should accept prerelease versions if + applicable or not. The default of ``None`` will autodetect it from the + given specifiers. + + :raises InvalidSpecifier: + If the given ``specifiers`` are not parseable than this exception will be + raised. + """ + + if isinstance(specifiers, str): + # Split on `,` to break each individual specifier into its own item, and + # strip each item to remove leading/trailing whitespace. + split_specifiers = [s.strip() for s in specifiers.split(",") if s.strip()] + + # Make each individual specifier a Specifier and save in a frozen set + # for later. + self._specs = frozenset(map(Specifier, split_specifiers)) + else: + # Save the supplied specifiers in a frozen set. + self._specs = frozenset(specifiers) + + # Store our prereleases value so we can use it later to determine if + # we accept prereleases or not. + self._prereleases = prereleases + + @property + def prereleases(self) -> bool | None: + # If we have been given an explicit prerelease modifier, then we'll + # pass that through here. + if self._prereleases is not None: + return self._prereleases + + # If we don't have any specifiers, and we don't have a forced value, + # then we'll just return None since we don't know if this should have + # pre-releases or not. + if not self._specs: + return None + + # Otherwise we'll see if any of the given specifiers accept + # prereleases, if any of them do we'll return True, otherwise False. 
+ return any(s.prereleases for s in self._specs) + + @prereleases.setter + def prereleases(self, value: bool) -> None: + self._prereleases = value + + def __repr__(self) -> str: + """A representation of the specifier set that shows all internal state. + + Note that the ordering of the individual specifiers within the set may not + match the input string. + + >>> SpecifierSet('>=1.0.0,!=2.0.0') + =1.0.0')> + >>> SpecifierSet('>=1.0.0,!=2.0.0', prereleases=False) + =1.0.0', prereleases=False)> + >>> SpecifierSet('>=1.0.0,!=2.0.0', prereleases=True) + =1.0.0', prereleases=True)> + """ + pre = ( + f", prereleases={self.prereleases!r}" + if self._prereleases is not None + else "" + ) + + return f"" + + def __str__(self) -> str: + """A string representation of the specifier set that can be round-tripped. + + Note that the ordering of the individual specifiers within the set may not + match the input string. + + >>> str(SpecifierSet(">=1.0.0,!=1.0.1")) + '!=1.0.1,>=1.0.0' + >>> str(SpecifierSet(">=1.0.0,!=1.0.1", prereleases=False)) + '!=1.0.1,>=1.0.0' + """ + return ",".join(sorted(str(s) for s in self._specs)) + + def __hash__(self) -> int: + return hash(self._specs) + + def __and__(self, other: SpecifierSet | str) -> SpecifierSet: + """Return a SpecifierSet which is a combination of the two sets. + + :param other: The other object to combine with. 
+ + >>> SpecifierSet(">=1.0.0,!=1.0.1") & '<=2.0.0,!=2.0.1' + =1.0.0')> + >>> SpecifierSet(">=1.0.0,!=1.0.1") & SpecifierSet('<=2.0.0,!=2.0.1') + =1.0.0')> + """ + if isinstance(other, str): + other = SpecifierSet(other) + elif not isinstance(other, SpecifierSet): + return NotImplemented + + specifier = SpecifierSet() + specifier._specs = frozenset(self._specs | other._specs) + + if self._prereleases is None and other._prereleases is not None: + specifier._prereleases = other._prereleases + elif self._prereleases is not None and other._prereleases is None: + specifier._prereleases = self._prereleases + elif self._prereleases == other._prereleases: + specifier._prereleases = self._prereleases + else: + raise ValueError( + "Cannot combine SpecifierSets with True and False prerelease " + "overrides." + ) + + return specifier + + def __eq__(self, other: object) -> bool: + """Whether or not the two SpecifierSet-like objects are equal. + + :param other: The other object to check against. + + The value of :attr:`prereleases` is ignored. + + >>> SpecifierSet(">=1.0.0,!=1.0.1") == SpecifierSet(">=1.0.0,!=1.0.1") + True + >>> (SpecifierSet(">=1.0.0,!=1.0.1", prereleases=False) == + ... SpecifierSet(">=1.0.0,!=1.0.1", prereleases=True)) + True + >>> SpecifierSet(">=1.0.0,!=1.0.1") == ">=1.0.0,!=1.0.1" + True + >>> SpecifierSet(">=1.0.0,!=1.0.1") == SpecifierSet(">=1.0.0") + False + >>> SpecifierSet(">=1.0.0,!=1.0.1") == SpecifierSet(">=1.0.0,!=1.0.2") + False + """ + if isinstance(other, (str, Specifier)): + other = SpecifierSet(str(other)) + elif not isinstance(other, SpecifierSet): + return NotImplemented + + return self._specs == other._specs + + def __len__(self) -> int: + """Returns the number of specifiers in this specifier set.""" + return len(self._specs) + + def __iter__(self) -> Iterator[Specifier]: + """ + Returns an iterator over all the underlying :class:`Specifier` instances + in this specifier set. 
def contains(
    self,
    item: UnparsedVersion,
    prereleases: bool | None = None,
    installed: bool | None = None,
) -> bool:
    """Report whether ``item`` satisfies every specifier in this SpecifierSet.

    :param item:
        The version to test, given either as a version string or as a
        :class:`Version` instance.
    :param prereleases:
        Whether prerelease versions may match. ``None`` (the default) defers
        to :attr:`prereleases`.
    :param installed:
        When truthy and ``item`` is a prerelease that is allowed to match,
        the individual specifiers are checked against the item's base
        version instead of the prerelease itself.

    >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.2.3")
    True
    >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.0.1")
    False
    >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.3.0a1")
    False
    >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.3.0a1", prereleases=True)
    True
    """
    # Work with a Version object regardless of what the caller handed in.
    candidate = item if isinstance(item, Version) else Version(item)

    # A per-call value wins; otherwise use what the set itself decided.
    allow_prereleases = self.prereleases if prereleases is None else prereleases

    if candidate.is_prerelease:
        # A prerelease can be rejected right away when none are allowed.
        if not allow_prereleases:
            return False
        if installed:
            candidate = Version(candidate.base_version)

    # Every specifier must accept the candidate. An empty set therefore
    # accepts everything, which is an explicit design decision.
    for spec in self._specs:
        if not spec.contains(candidate, prereleases=allow_prereleases):
            return False
    return True
+ + This method is smarter than just ``filter(SpecifierSet(...).contains, [...])`` + because it implements the rule from :pep:`440` that a prerelease item + SHOULD be accepted if no other versions match the given specifier. + + >>> list(SpecifierSet(">=1.2.3").filter(["1.2", "1.3", "1.5a1"])) + ['1.3'] + >>> list(SpecifierSet(">=1.2.3").filter(["1.2", "1.3", Version("1.4")])) + ['1.3', ] + >>> list(SpecifierSet(">=1.2.3").filter(["1.2", "1.5a1"])) + [] + >>> list(SpecifierSet(">=1.2.3").filter(["1.3", "1.5a1"], prereleases=True)) + ['1.3', '1.5a1'] + >>> list(SpecifierSet(">=1.2.3", prereleases=True).filter(["1.3", "1.5a1"])) + ['1.3', '1.5a1'] + + An "empty" SpecifierSet will filter items based on the presence of prerelease + versions in the set. + + >>> list(SpecifierSet("").filter(["1.3", "1.5a1"])) + ['1.3'] + >>> list(SpecifierSet("").filter(["1.5a1"])) + ['1.5a1'] + >>> list(SpecifierSet("", prereleases=True).filter(["1.3", "1.5a1"])) + ['1.3', '1.5a1'] + >>> list(SpecifierSet("").filter(["1.3", "1.5a1"], prereleases=True)) + ['1.3', '1.5a1'] + """ + # Determine if we're forcing a prerelease or not, if we're not forcing + # one for this particular filter call, then we'll use whatever the + # SpecifierSet thinks for whether or not we should support prereleases. + if prereleases is None: + prereleases = self.prereleases + + # If we have any specifiers, then we want to wrap our iterable in the + # filter method for each one, this will act as a logical AND amongst + # each specifier. + if self._specs: + for spec in self._specs: + iterable = spec.filter(iterable, prereleases=bool(prereleases)) + return iter(iterable) + # If we do not have any specifiers, then we need to have a rough filter + # which will filter out any pre-releases, unless there are no final + # releases. 
class Tag:
    """
    The (interpreter, ABI, platform) triple that tags a wheel.

    All three components are lower-cased when the tag is created. Instances
    are treated as immutable, so they are hashable, and they support
    equality comparison with other ``Tag`` instances.
    """

    __slots__ = ["_abi", "_hash", "_interpreter", "_platform"]

    def __init__(self, interpreter: str, abi: str, platform: str) -> None:
        normalized = (interpreter.lower(), abi.lower(), platform.lower())
        self._interpreter, self._abi, self._platform = normalized
        # Tags kept in sets get hashed over and over (for example while
        # matching many candidate links against a set of supported tags), so
        # the hash is computed once here instead of on every __hash__ call.
        self._hash = hash(normalized)

    @property
    def interpreter(self) -> str:
        """The lower-cased interpreter component."""
        return self._interpreter

    @property
    def abi(self) -> str:
        """The lower-cased ABI component."""
        return self._abi

    @property
    def platform(self) -> str:
        """The lower-cased platform component."""
        return self._platform

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Tag):
            # Differing hashes settle the question cheaply.
            if self._hash != other._hash:
                return False
            mine = (self._platform, self._abi, self._interpreter)
            theirs = (other._platform, other._abi, other._interpreter)
            return mine == theirs
        return NotImplemented

    def __hash__(self) -> int:
        return self._hash

    def __str__(self) -> str:
        return f"{self._interpreter}-{self._abi}-{self._platform}"

    def __repr__(self) -> str:
        return f"<{self} @ {id(self)}>"
+ """ + tags = set() + interpreters, abis, platforms = tag.split("-") + for interpreter in interpreters.split("."): + for abi in abis.split("."): + for platform_ in platforms.split("."): + tags.add(Tag(interpreter, abi, platform_)) + return frozenset(tags) + + +def _get_config_var(name: str, warn: bool = False) -> int | str | None: + value: int | str | None = sysconfig.get_config_var(name) + if value is None and warn: + logger.debug( + "Config variable '%s' is unset, Python ABI tag may be incorrect", name + ) + return value + + +def _normalize_string(string: str) -> str: + return string.replace(".", "_").replace("-", "_").replace(" ", "_") + + +def _is_threaded_cpython(abis: list[str]) -> bool: + """ + Determine if the ABI corresponds to a threaded (`--disable-gil`) build. + + The threaded builds are indicated by a "t" in the abiflags. + """ + if len(abis) == 0: + return False + # expect e.g., cp313 + m = re.match(r"cp\d+(.*)", abis[0]) + if not m: + return False + abiflags = m.group(1) + return "t" in abiflags + + +def _abi3_applies(python_version: PythonVersion, threading: bool) -> bool: + """ + Determine if the Python version supports abi3. + + PEP 384 was first implemented in Python 3.2. The threaded (`--disable-gil`) + builds do not support abi3. + """ + return len(python_version) > 1 and tuple(python_version) >= (3, 2) and not threading + + +def _cpython_abis(py_version: PythonVersion, warn: bool = False) -> list[str]: + py_version = tuple(py_version) # To allow for version comparison. + abis = [] + version = _version_nodot(py_version[:2]) + threading = debug = pymalloc = ucs4 = "" + with_debug = _get_config_var("Py_DEBUG", warn) + has_refcount = hasattr(sys, "gettotalrefcount") + # Windows doesn't set Py_DEBUG, so checking for support of debug-compiled + # extension modules is the best option. 
+ # https://github.com/pypa/pip/issues/3383#issuecomment-173267692 + has_ext = "_d.pyd" in EXTENSION_SUFFIXES + if with_debug or (with_debug is None and (has_refcount or has_ext)): + debug = "d" + if py_version >= (3, 13) and _get_config_var("Py_GIL_DISABLED", warn): + threading = "t" + if py_version < (3, 8): + with_pymalloc = _get_config_var("WITH_PYMALLOC", warn) + if with_pymalloc or with_pymalloc is None: + pymalloc = "m" + if py_version < (3, 3): + unicode_size = _get_config_var("Py_UNICODE_SIZE", warn) + if unicode_size == 4 or ( + unicode_size is None and sys.maxunicode == 0x10FFFF + ): + ucs4 = "u" + elif debug: + # Debug builds can also load "normal" extension modules. + # We can also assume no UCS-4 or pymalloc requirement. + abis.append(f"cp{version}{threading}") + abis.insert(0, f"cp{version}{threading}{debug}{pymalloc}{ucs4}") + return abis + + +def cpython_tags( + python_version: PythonVersion | None = None, + abis: Iterable[str] | None = None, + platforms: Iterable[str] | None = None, + *, + warn: bool = False, +) -> Iterator[Tag]: + """ + Yields the tags for a CPython interpreter. + + The tags consist of: + - cp-- + - cp-abi3- + - cp-none- + - cp-abi3- # Older Python versions down to 3.2. + + If python_version only specifies a major version then user-provided ABIs and + the 'none' ABItag will be used. + + If 'abi3' or 'none' are specified in 'abis' then they will be yielded at + their normal position and not at the beginning. + """ + if not python_version: + python_version = sys.version_info[:2] + + interpreter = f"cp{_version_nodot(python_version[:2])}" + + if abis is None: + if len(python_version) > 1: + abis = _cpython_abis(python_version, warn) + else: + abis = [] + abis = list(abis) + # 'abi3' and 'none' are explicitly handled later. 
def _generic_abi() -> list[str]:
    """
    Return the ABI tag based on EXT_SUFFIX.

    :returns: A one-element list holding the normalized ABI tag; the result
        of ``_cpython_abis()`` for the running interpreter version when the
        suffix has fewer than three dot-separated parts; or an empty list
        when the ABI part of the suffix is empty.
    :raises SystemError: If ``EXT_SUFFIX`` is not a string that starts with
        a ".". This includes the empty string.
    """
    # The following are examples of `EXT_SUFFIX`.
    # We want to keep the parts which are related to the ABI and remove the
    # parts which are related to the platform:
    # - linux:   '.cpython-310-x86_64-linux-gnu.so' => cp310
    # - mac:     '.cpython-310-darwin.so'           => cp310
    # - win:     '.cp310-win_amd64.pyd'             => cp310
    # - win:     '.pyd'                             => cp37 (uses _cpython_abis())
    # - pypy:    '.pypy38-pp73-x86_64-linux-gnu.so' => pypy38_pp73
    # - graalpy: '.graalpy-38-native-x86_64-darwin.dylib'
    #                                               => graalpy_38_native

    ext_suffix = _get_config_var("EXT_SUFFIX", warn=True)
    # startswith() instead of indexing [0]: an empty string must be reported
    # as an invalid suffix, not escape as an IndexError.
    if not isinstance(ext_suffix, str) or not ext_suffix.startswith("."):
        raise SystemError("invalid sysconfig.get_config_var('EXT_SUFFIX')")
    parts = ext_suffix.split(".")
    if len(parts) < 3:
        # CPython3.7 and earlier uses ".pyd" on Windows.
        return _cpython_abis(sys.version_info[:2])
    soabi = parts[1]
    if soabi.startswith("cpython"):
        # non-windows
        abi = "cp" + soabi.split("-")[1]
    elif soabi.startswith("cp"):
        # windows
        abi = soabi.split("-")[0]
    elif soabi.startswith("pypy"):
        abi = "-".join(soabi.split("-")[:2])
    elif soabi.startswith("graalpy"):
        abi = "-".join(soabi.split("-")[:3])
    elif soabi:
        # pyston, ironpython, others?
        abi = soabi
    else:
        return []
    return [_normalize_string(abi)]
+ + The tags consist of: + - py*-none- + - -none-any # ... if `interpreter` is provided. + - py*-none-any + """ + if not python_version: + python_version = sys.version_info[:2] + platforms = list(platforms or platform_tags()) + for version in _py_interpreter_range(python_version): + for platform_ in platforms: + yield Tag(version, "none", platform_) + if interpreter: + yield Tag(interpreter, "none", "any") + for version in _py_interpreter_range(python_version): + yield Tag(version, "none", "any") + + +def _mac_arch(arch: str, is_32bit: bool = _32_BIT_INTERPRETER) -> str: + if not is_32bit: + return arch + + if arch.startswith("ppc"): + return "ppc" + + return "i386" + + +def _mac_binary_formats(version: AppleVersion, cpu_arch: str) -> list[str]: + formats = [cpu_arch] + if cpu_arch == "x86_64": + if version < (10, 4): + return [] + formats.extend(["intel", "fat64", "fat32"]) + + elif cpu_arch == "i386": + if version < (10, 4): + return [] + formats.extend(["intel", "fat32", "fat"]) + + elif cpu_arch == "ppc64": + # TODO: Need to care about 32-bit PPC for ppc64 through 10.2? + if version > (10, 5) or version < (10, 4): + return [] + formats.append("fat64") + + elif cpu_arch == "ppc": + if version > (10, 6): + return [] + formats.extend(["fat32", "fat"]) + + if cpu_arch in {"arm64", "x86_64"}: + formats.append("universal2") + + if cpu_arch in {"x86_64", "i386", "ppc64", "ppc", "intel"}: + formats.append("universal") + + return formats + + +def mac_platforms( + version: AppleVersion | None = None, arch: str | None = None +) -> Iterator[str]: + """ + Yields the platform tags for a macOS system. + + The `version` parameter is a two-item tuple specifying the macOS version to + generate platform tags for. The `arch` parameter is the CPU architecture to + generate platform tags for. Both parameters default to the appropriate value + for the current system. 
+ """ + version_str, _, cpu_arch = platform.mac_ver() + if version is None: + version = cast("AppleVersion", tuple(map(int, version_str.split(".")[:2]))) + if version == (10, 16): + # When built against an older macOS SDK, Python will report macOS 10.16 + # instead of the real version. + version_str = subprocess.run( + [ + sys.executable, + "-sS", + "-c", + "import platform; print(platform.mac_ver()[0])", + ], + check=True, + env={"SYSTEM_VERSION_COMPAT": "0"}, + stdout=subprocess.PIPE, + text=True, + ).stdout + version = cast("AppleVersion", tuple(map(int, version_str.split(".")[:2]))) + else: + version = version + if arch is None: + arch = _mac_arch(cpu_arch) + else: + arch = arch + + if (10, 0) <= version and version < (11, 0): + # Prior to Mac OS 11, each yearly release of Mac OS bumped the + # "minor" version number. The major version was always 10. + major_version = 10 + for minor_version in range(version[1], -1, -1): + compat_version = major_version, minor_version + binary_formats = _mac_binary_formats(compat_version, arch) + for binary_format in binary_formats: + yield f"macosx_{major_version}_{minor_version}_{binary_format}" + + if version >= (11, 0): + # Starting with Mac OS 11, each yearly release bumps the major version + # number. The minor versions are now the midyear updates. + minor_version = 0 + for major_version in range(version[0], 10, -1): + compat_version = major_version, minor_version + binary_formats = _mac_binary_formats(compat_version, arch) + for binary_format in binary_formats: + yield f"macosx_{major_version}_{minor_version}_{binary_format}" + + if version >= (11, 0): + # Mac OS 11 on x86_64 is compatible with binaries from previous releases. + # Arm64 support was introduced in 11.0, so no Arm binaries from previous + # releases exist. + # + # However, the "universal2" binary format can have a + # macOS version earlier than 11.0 when the x86_64 part of the binary supports + # that version of macOS. 
def ios_platforms(
    version: AppleVersion | None = None, multiarch: str | None = None
) -> Iterator[str]:
    """
    Yields the platform tags for an iOS system.

    :param version: A two-item tuple specifying the iOS version to generate
        platform tags for. Defaults to the current iOS version.
    :param multiarch: The CPU architecture+ABI to generate platform tags for -
        (the value used by `sys.implementation._multiarch` e.g.,
        `arm64_iphoneos` or `x86_64_iphonesimulator`). Defaults to the current
        multiarch value. Any "-" in the value is replaced by "_".
    """
    if version is None:
        # if iOS is the current platform, ios_ver *must* be defined. However,
        # it won't exist for CPython versions before 3.13, which causes a mypy
        # error.
        _system, release, _model, _is_simulator = platform.ios_ver()  # type: ignore[attr-defined, unused-ignore]
        version = cast(
            "AppleVersion", tuple(int(part) for part in release.split(".")[:2])
        )

    if multiarch is None:
        multiarch = sys.implementation._multiarch
    multiarch = multiarch.replace("-", "_")

    ios_platform_template = "ios_{major}_{minor}_{multiarch}"

    # Candidates run from the requested major.minor down to 12.0, in
    # descending order. 12.0 is the first iOS version known to have enough
    # features to support CPython, so an older major yields nothing at all.
    if version[0] < 12:
        return

    def _candidates():
        # The exact X.Y version that was requested comes first ...
        yield version[0], version[1]
        # ... then every earlier minor of the same major, down to X.0 ...
        for minor in range(version[1] - 1, -1, -1):
            yield version[0], minor
        # ... then minors 9 down to 0 of each earlier major. The highest
        # minor ever released is 8 (14.8 and 15.8); trying a few candidates
        # that can never match is harmless and avoids keeping an explicit
        # list of known iOS versions in the code.
        for major in range(version[0] - 1, 11, -1):
            for minor in range(9, -1, -1):
                yield major, minor

    for major, minor in _candidates():
        yield ios_platform_template.format(
            major=major, minor=minor, multiarch=multiarch
        )
+ """ + if platform.system() == "Darwin": + return mac_platforms() + elif platform.system() == "iOS": + return ios_platforms() + elif platform.system() == "Linux": + return _linux_platforms() + else: + return _generic_platforms() + + +def interpreter_name() -> str: + """ + Returns the name of the running interpreter. + + Some implementations have a reserved, two-letter abbreviation which will + be returned when appropriate. + """ + name = sys.implementation.name + return INTERPRETER_SHORT_NAMES.get(name) or name + + +def interpreter_version(*, warn: bool = False) -> str: + """ + Returns the version of the running interpreter. + """ + version = _get_config_var("py_version_nodot", warn=warn) + if version: + version = str(version) + else: + version = _version_nodot(sys.version_info[:2]) + return version + + +def _version_nodot(version: PythonVersion) -> str: + return "".join(map(str, version)) + + +def sys_tags(*, warn: bool = False) -> Iterator[Tag]: + """ + Returns the sequence of tag triples for the running interpreter. + + The order of the sequence corresponds to priority order for the + interpreter, from most to least important. 
+ """ + + interp_name = interpreter_name() + if interp_name == "cp": + yield from cpython_tags(warn=warn) + else: + yield from generic_tags() + + if interp_name == "pp": + interp = "pp3" + elif interp_name == "cp": + interp = "cp" + interpreter_version(warn=warn) + else: + interp = None + yield from compatible_tags(interpreter=interp) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/version.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/version.py new file mode 100644 index 0000000000000000000000000000000000000000..c9bbda20e463b8d9389ecd65f74af33810a02bdd --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging/version.py @@ -0,0 +1,582 @@ +# This file is dual licensed under the terms of the Apache License, Version +# 2.0, and the BSD License. See the LICENSE file in the root of this repository +# for complete details. +""" +.. testsetup:: + + from packaging.version import parse, Version +""" + +from __future__ import annotations + +import itertools +import re +from typing import Any, Callable, NamedTuple, SupportsInt, Tuple, Union + +from ._structures import Infinity, InfinityType, NegativeInfinity, NegativeInfinityType + +__all__ = ["VERSION_PATTERN", "InvalidVersion", "Version", "parse"] + +LocalType = Tuple[Union[int, str], ...] + +CmpPrePostDevType = Union[InfinityType, NegativeInfinityType, Tuple[str, int]] +CmpLocalType = Union[ + NegativeInfinityType, + Tuple[Union[Tuple[int, str], Tuple[NegativeInfinityType, Union[int, str]]], ...], +] +CmpKey = Tuple[ + int, + Tuple[int, ...], + CmpPrePostDevType, + CmpPrePostDevType, + CmpPrePostDevType, + CmpLocalType, +] +VersionComparisonMethod = Callable[[CmpKey, CmpKey], bool] + + +class _Version(NamedTuple): + epoch: int + release: tuple[int, ...] 
+ dev: tuple[str, int] | None + pre: tuple[str, int] | None + post: tuple[str, int] | None + local: LocalType | None + + +def parse(version: str) -> Version: + """Parse the given version string. + + >>> parse('1.0.dev1') + + + :param version: The version string to parse. + :raises InvalidVersion: When the version string is not a valid version. + """ + return Version(version) + + +class InvalidVersion(ValueError): + """Raised when a version string is not a valid version. + + >>> Version("invalid") + Traceback (most recent call last): + ... + packaging.version.InvalidVersion: Invalid version: 'invalid' + """ + + +class _BaseVersion: + _key: tuple[Any, ...] + + def __hash__(self) -> int: + return hash(self._key) + + # Please keep the duplicated `isinstance` check + # in the six comparisons hereunder + # unless you find a way to avoid adding overhead function calls. + def __lt__(self, other: _BaseVersion) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key < other._key + + def __le__(self, other: _BaseVersion) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key <= other._key + + def __eq__(self, other: object) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key == other._key + + def __ge__(self, other: _BaseVersion) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key >= other._key + + def __gt__(self, other: _BaseVersion) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key > other._key + + def __ne__(self, other: object) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key != other._key + + +# Deliberately not anchored to the start and end of the string, to make it +# easier for 3rd party code to reuse +_VERSION_PATTERN = r""" + v? + (?: + (?:(?P[0-9]+)!)? 
# epoch + (?P[0-9]+(?:\.[0-9]+)*) # release segment + (?P
                                          # pre-release
+            [-_\.]?
+            (?Palpha|a|beta|b|preview|pre|c|rc)
+            [-_\.]?
+            (?P[0-9]+)?
+        )?
+        (?P                                         # post release
+            (?:-(?P[0-9]+))
+            |
+            (?:
+                [-_\.]?
+                (?Ppost|rev|r)
+                [-_\.]?
+                (?P[0-9]+)?
+            )
+        )?
+        (?P                                          # dev release
+            [-_\.]?
+            (?Pdev)
+            [-_\.]?
+            (?P[0-9]+)?
+        )?
+    )
+    (?:\+(?P[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
+"""
+
+VERSION_PATTERN = _VERSION_PATTERN
+"""
+A string containing the regular expression used to match a valid version.
+
+The pattern is not anchored at either end, and is intended for embedding in larger
+expressions (for example, matching a version number as part of a file name). The
+regular expression should be compiled with the ``re.VERBOSE`` and ``re.IGNORECASE``
+flags set.
+
+:meta hide-value:
+"""
+
+
+class Version(_BaseVersion):
+    """This class abstracts handling of a project's versions.
+
+    A :class:`Version` instance is comparison aware and can be compared and
+    sorted using the standard Python interfaces.
+
+    >>> v1 = Version("1.0a5")
+    >>> v2 = Version("1.0")
+    >>> v1
+    
+    >>> v2
+    
+    >>> v1 < v2
+    True
+    >>> v1 == v2
+    False
+    >>> v1 > v2
+    False
+    >>> v1 >= v2
+    False
+    >>> v1 <= v2
+    True
+    """
+
+    _regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
+    _key: CmpKey
+
+    def __init__(self, version: str) -> None:
+        """Initialize a Version object.
+
+        :param version:
+            The string representation of a version which will be parsed and normalized
+            before use.
+        :raises InvalidVersion:
+            If the ``version`` does not conform to PEP 440 in any way then this
+            exception will be raised.
+        """
+
+        # Validate the version and parse it into pieces
+        match = self._regex.search(version)
+        if not match:
+            raise InvalidVersion(f"Invalid version: {version!r}")
+
+        # Store the parsed out pieces of the version
+        self._version = _Version(
+            epoch=int(match.group("epoch")) if match.group("epoch") else 0,
+            release=tuple(int(i) for i in match.group("release").split(".")),
+            pre=_parse_letter_version(match.group("pre_l"), match.group("pre_n")),
+            post=_parse_letter_version(
+                match.group("post_l"), match.group("post_n1") or match.group("post_n2")
+            ),
+            dev=_parse_letter_version(match.group("dev_l"), match.group("dev_n")),
+            local=_parse_local_version(match.group("local")),
+        )
+
+        # Generate a key which will be used for sorting
+        self._key = _cmpkey(
+            self._version.epoch,
+            self._version.release,
+            self._version.pre,
+            self._version.post,
+            self._version.dev,
+            self._version.local,
+        )
+
+    def __repr__(self) -> str:
+        """A representation of the Version that shows all internal state.
+
+        >>> Version('1.0.0')
+        
+        """
+        return f""
+
+    def __str__(self) -> str:
+        """A string representation of the version that can be round-tripped.
+
+        >>> str(Version("1.0a5"))
+        '1.0a5'
+        """
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        # Pre-release
+        if self.pre is not None:
+            parts.append("".join(str(x) for x in self.pre))
+
+        # Post-release
+        if self.post is not None:
+            parts.append(f".post{self.post}")
+
+        # Development release
+        if self.dev is not None:
+            parts.append(f".dev{self.dev}")
+
+        # Local version segment
+        if self.local is not None:
+            parts.append(f"+{self.local}")
+
+        return "".join(parts)
+
+    @property
+    def epoch(self) -> int:
+        """The epoch of the version.
+
+        >>> Version("2.0.0").epoch
+        0
+        >>> Version("1!2.0.0").epoch
+        1
+        """
+        return self._version.epoch
+
+    @property
+    def release(self) -> tuple[int, ...]:
+        """The components of the "release" segment of the version.
+
+        >>> Version("1.2.3").release
+        (1, 2, 3)
+        >>> Version("2.0.0").release
+        (2, 0, 0)
+        >>> Version("1!2.0.0.post0").release
+        (2, 0, 0)
+
+        Includes trailing zeroes but not the epoch or any pre-release / development /
+        post-release suffixes.
+        """
+        return self._version.release
+
+    @property
+    def pre(self) -> tuple[str, int] | None:
+        """The pre-release segment of the version.
+
+        >>> print(Version("1.2.3").pre)
+        None
+        >>> Version("1.2.3a1").pre
+        ('a', 1)
+        >>> Version("1.2.3b1").pre
+        ('b', 1)
+        >>> Version("1.2.3rc1").pre
+        ('rc', 1)
+        """
+        return self._version.pre
+
+    @property
+    def post(self) -> int | None:
+        """The post-release number of the version.
+
+        >>> print(Version("1.2.3").post)
+        None
+        >>> Version("1.2.3.post1").post
+        1
+        """
+        return self._version.post[1] if self._version.post else None
+
+    @property
+    def dev(self) -> int | None:
+        """The development number of the version.
+
+        >>> print(Version("1.2.3").dev)
+        None
+        >>> Version("1.2.3.dev1").dev
+        1
+        """
+        return self._version.dev[1] if self._version.dev else None
+
+    @property
+    def local(self) -> str | None:
+        """The local version segment of the version.
+
+        >>> print(Version("1.2.3").local)
+        None
+        >>> Version("1.2.3+abc").local
+        'abc'
+        """
+        if self._version.local:
+            return ".".join(str(x) for x in self._version.local)
+        else:
+            return None
+
+    @property
+    def public(self) -> str:
+        """The public portion of the version.
+
+        >>> Version("1.2.3").public
+        '1.2.3'
+        >>> Version("1.2.3+abc").public
+        '1.2.3'
+        >>> Version("1!1.2.3dev1+abc").public
+        '1!1.2.3.dev1'
+        """
+        return str(self).split("+", 1)[0]
+
+    @property
+    def base_version(self) -> str:
+        """The "base version" of the version.
+
+        >>> Version("1.2.3").base_version
+        '1.2.3'
+        >>> Version("1.2.3+abc").base_version
+        '1.2.3'
+        >>> Version("1!1.2.3dev1+abc").base_version
+        '1!1.2.3'
+
+        The "base version" is the public version of the project without any pre or post
+        release markers.
+        """
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        return "".join(parts)
+
+    @property
+    def is_prerelease(self) -> bool:
+        """Whether this version is a pre-release.
+
+        >>> Version("1.2.3").is_prerelease
+        False
+        >>> Version("1.2.3a1").is_prerelease
+        True
+        >>> Version("1.2.3b1").is_prerelease
+        True
+        >>> Version("1.2.3rc1").is_prerelease
+        True
+        >>> Version("1.2.3dev1").is_prerelease
+        True
+        """
+        return self.dev is not None or self.pre is not None
+
+    @property
+    def is_postrelease(self) -> bool:
+        """Whether this version is a post-release.
+
+        >>> Version("1.2.3").is_postrelease
+        False
+        >>> Version("1.2.3.post1").is_postrelease
+        True
+        """
+        return self.post is not None
+
+    @property
+    def is_devrelease(self) -> bool:
+        """Whether this version is a development release.
+
+        >>> Version("1.2.3").is_devrelease
+        False
+        >>> Version("1.2.3.dev1").is_devrelease
+        True
+        """
+        return self.dev is not None
+
+    @property
+    def major(self) -> int:
+        """The first item of :attr:`release` or ``0`` if unavailable.
+
+        >>> Version("1.2.3").major
+        1
+        """
+        return self.release[0] if len(self.release) >= 1 else 0
+
+    @property
+    def minor(self) -> int:
+        """The second item of :attr:`release` or ``0`` if unavailable.
+
+        >>> Version("1.2.3").minor
+        2
+        >>> Version("1").minor
+        0
+        """
+        return self.release[1] if len(self.release) >= 2 else 0
+
+    @property
+    def micro(self) -> int:
+        """The third item of :attr:`release` or ``0`` if unavailable.
+
+        >>> Version("1.2.3").micro
+        3
+        >>> Version("1").micro
+        0
+        """
+        return self.release[2] if len(self.release) >= 3 else 0
+
+
+class _TrimmedRelease(Version):
+    @property
+    def release(self) -> tuple[int, ...]:
+        """
+        Release segment without any trailing zeros.
+
+        >>> _TrimmedRelease('1.0.0').release
+        (1,)
+        >>> _TrimmedRelease('0.0').release
+        (0,)
+        """
+        rel = super().release
+        nonzeros = (index for index, val in enumerate(rel) if val)
+        last_nonzero = max(nonzeros, default=0)
+        return rel[: last_nonzero + 1]
+
+
+def _parse_letter_version(
+    letter: str | None, number: str | bytes | SupportsInt | None
+) -> tuple[str, int] | None:
+    if letter:
+        # We consider there to be an implicit 0 in a pre-release if there is
+        # not a numeral associated with it.
+        if number is None:
+            number = 0
+
+        # We normalize any letters to their lower case form
+        letter = letter.lower()
+
+        # We consider some words to be alternate spellings of other words and
+        # in those cases we want to normalize the spellings to our preferred
+        # spelling.
+        if letter == "alpha":
+            letter = "a"
+        elif letter == "beta":
+            letter = "b"
+        elif letter in ["c", "pre", "preview"]:
+            letter = "rc"
+        elif letter in ["rev", "r"]:
+            letter = "post"
+
+        return letter, int(number)
+
+    assert not letter
+    if number:
+        # We assume if we are given a number, but we are not given a letter
+        # then this is using the implicit post release syntax (e.g. 1.0-1)
+        letter = "post"
+
+        return letter, int(number)
+
+    return None
+
+
+_local_version_separators = re.compile(r"[\._-]")
+
+
+def _parse_local_version(local: str | None) -> LocalType | None:
+    """
+    Takes a string like abc.1.twelve and turns it into ("abc", 1, "twelve").
+    """
+    if local is not None:
+        return tuple(
+            part.lower() if not part.isdigit() else int(part)
+            for part in _local_version_separators.split(local)
+        )
+    return None
+
+
+def _cmpkey(
+    epoch: int,
+    release: tuple[int, ...],
+    pre: tuple[str, int] | None,
+    post: tuple[str, int] | None,
+    dev: tuple[str, int] | None,
+    local: LocalType | None,
+) -> CmpKey:
+    # When we compare a release version, we want to compare it with all of the
+    # trailing zeros removed. So we'll use a reverse the list, drop all the now
+    # leading zeros until we come to something non zero, then take the rest
+    # re-reverse it back into the correct order and make it a tuple and use
+    # that for our sorting key.
+    _release = tuple(
+        reversed(list(itertools.dropwhile(lambda x: x == 0, reversed(release))))
+    )
+
+    # We need to "trick" the sorting algorithm to put 1.0.dev0 before 1.0a0.
+    # We'll do this by abusing the pre segment, but we _only_ want to do this
+    # if there is not a pre or a post segment. If we have one of those then
+    # the normal sorting rules will handle this case correctly.
+    if pre is None and post is None and dev is not None:
+        _pre: CmpPrePostDevType = NegativeInfinity
+    # Versions without a pre-release (except as noted above) should sort after
+    # those with one.
+    elif pre is None:
+        _pre = Infinity
+    else:
+        _pre = pre
+
+    # Versions without a post segment should sort before those with one.
+    if post is None:
+        _post: CmpPrePostDevType = NegativeInfinity
+
+    else:
+        _post = post
+
+    # Versions without a development segment should sort after those with one.
+    if dev is None:
+        _dev: CmpPrePostDevType = Infinity
+
+    else:
+        _dev = dev
+
+    if local is None:
+        # Versions without a local segment should sort before those with one.
+        _local: CmpLocalType = NegativeInfinity
+    else:
+        # Versions with a local segment need that segment parsed to implement
+        # the sorting rules in PEP440.
+        # - Alpha numeric segments sort before numeric segments
+        # - Alpha numeric segments sort lexicographically
+        # - Numeric segments sort numerically
+        # - Shorter versions sort before longer versions when the prefixes
+        #   match exactly
+        _local = tuple(
+            (i, "") if isinstance(i, int) else (NegativeInfinity, i) for i in local
+        )
+
+    return epoch, _release, _pre, _post, _dev, _local
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b34b0fcbd409601aab6b579189824a230dabf25c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__init__.py
@@ -0,0 +1,28 @@
+# SPDX-FileCopyrightText: 2015 Eric Larson
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""CacheControl import Interface.
+
+Make it easy to import from cachecontrol without long namespaces.
+"""
+__author__ = "Eric Larson"
+__email__ = "eric@ionrock.org"
+__version__ = "0.14.0"
+
+from pip._vendor.cachecontrol.adapter import CacheControlAdapter
+from pip._vendor.cachecontrol.controller import CacheController
+from pip._vendor.cachecontrol.wrapper import CacheControl
+
+__all__ = [
+    "__author__",
+    "__email__",
+    "__version__",
+    "CacheControlAdapter",
+    "CacheController",
+    "CacheControl",
+]
+
+import logging
+
+logging.getLogger(__name__).addHandler(logging.NullHandler())
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e682aec58bd50f95bec399fa8485bba460be45e
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/_cmd.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/_cmd.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96fb8f4ddb22cd34ea48c2139b54fe0e5f2e5d65
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/_cmd.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/adapter.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/adapter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32767184aa03a9fe6c49d453230e6905edb4b5a4
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/adapter.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/controller.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/controller.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5c5192ee4ee7d4b6d871f48b958a625a06b4270
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/controller.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/heuristics.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/heuristics.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05a187db3536803ea8dd6edbdf7e97925b8cce20
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/heuristics.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/wrapper.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/wrapper.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7eff36a9ed3f700f34844bd6bb0b139c98c5cc43
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/__pycache__/wrapper.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/heuristics.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/heuristics.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6e5634e38559efcb0c61e266f73ff7e8d0b1ad9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/heuristics.py
@@ -0,0 +1,154 @@
+# SPDX-FileCopyrightText: 2015 Eric Larson
+#
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import calendar
+import time
+from datetime import datetime, timedelta, timezone
+from email.utils import formatdate, parsedate, parsedate_tz
+from typing import TYPE_CHECKING, Any, Mapping
+
+if TYPE_CHECKING:
+    from pip._vendor.urllib3 import HTTPResponse
+
+TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT"
+
+
+def expire_after(delta: timedelta, date: datetime | None = None) -> datetime:
+    date = date or datetime.now(timezone.utc)
+    return date + delta
+
+
+def datetime_to_header(dt: datetime) -> str:
+    return formatdate(calendar.timegm(dt.timetuple()))
+
+
+class BaseHeuristic:
+    def warning(self, response: HTTPResponse) -> str | None:
+        """
+        Return a valid 1xx warning header value describing the cache
+        adjustments.
+
+        The response is provided too allow warnings like 113
+        http://tools.ietf.org/html/rfc7234#section-5.5.4 where we need
+        to explicitly say response is over 24 hours old.
+        """
+        return '110 - "Response is Stale"'
+
+    def update_headers(self, response: HTTPResponse) -> dict[str, str]:
+        """Update the response headers with any new headers.
+
+        NOTE: This SHOULD always include some Warning header to
+              signify that the response was cached by the client, not
+              by way of the provided headers.
+        """
+        return {}
+
+    def apply(self, response: HTTPResponse) -> HTTPResponse:
+        updated_headers = self.update_headers(response)
+
+        if updated_headers:
+            response.headers.update(updated_headers)
+            warning_header_value = self.warning(response)
+            if warning_header_value is not None:
+                response.headers.update({"Warning": warning_header_value})
+
+        return response
+
+
+class OneDayCache(BaseHeuristic):
+    """
+    Cache the response by providing an expires 1 day in the
+    future.
+    """
+
+    def update_headers(self, response: HTTPResponse) -> dict[str, str]:
+        headers = {}
+
+        if "expires" not in response.headers:
+            date = parsedate(response.headers["date"])
+            expires = expire_after(timedelta(days=1), date=datetime(*date[:6], tzinfo=timezone.utc))  # type: ignore[index,misc]
+            headers["expires"] = datetime_to_header(expires)
+            headers["cache-control"] = "public"
+        return headers
+
+
+class ExpiresAfter(BaseHeuristic):
+    """
+    Cache **all** requests for a defined time period.
+    """
+
+    def __init__(self, **kw: Any) -> None:
+        self.delta = timedelta(**kw)
+
+    def update_headers(self, response: HTTPResponse) -> dict[str, str]:
+        expires = expire_after(self.delta)
+        return {"expires": datetime_to_header(expires), "cache-control": "public"}
+
+    def warning(self, response: HTTPResponse) -> str | None:
+        tmpl = "110 - Automatically cached for %s. Response might be stale"
+        return tmpl % self.delta
+
+
+class LastModified(BaseHeuristic):
+    """
+    If there is no Expires header already, fall back on Last-Modified
+    using the heuristic from
+    http://tools.ietf.org/html/rfc7234#section-4.2.2
+    to calculate a reasonable value.
+
+    Firefox also does something like this per
+    https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching_FAQ
+    http://lxr.mozilla.org/mozilla-release/source/netwerk/protocol/http/nsHttpResponseHead.cpp#397
+    Unlike mozilla we limit this to 24-hr.
+    """
+
+    cacheable_by_default_statuses = {
+        200,
+        203,
+        204,
+        206,
+        300,
+        301,
+        404,
+        405,
+        410,
+        414,
+        501,
+    }
+
+    def update_headers(self, resp: HTTPResponse) -> dict[str, str]:
+        headers: Mapping[str, str] = resp.headers
+
+        if "expires" in headers:
+            return {}
+
+        if "cache-control" in headers and headers["cache-control"] != "public":
+            return {}
+
+        if resp.status not in self.cacheable_by_default_statuses:
+            return {}
+
+        if "date" not in headers or "last-modified" not in headers:
+            return {}
+
+        time_tuple = parsedate_tz(headers["date"])
+        assert time_tuple is not None
+        date = calendar.timegm(time_tuple[:6])
+        last_modified = parsedate(headers["last-modified"])
+        if last_modified is None:
+            return {}
+
+        now = time.time()
+        current_age = max(0, now - date)
+        delta = date - calendar.timegm(last_modified)
+        freshness_lifetime = max(0, min(delta / 10, 24 * 3600))
+        if freshness_lifetime <= current_age:
+            return {}
+
+        expires = date + freshness_lifetime
+        return {"expires": time.strftime(TIME_FMT, time.gmtime(expires))}
+
+    def warning(self, resp: HTTPResponse) -> str | None:
+        return None
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_inspect.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_inspect.py
new file mode 100644
index 0000000000000000000000000000000000000000..30446ceb3f0235721e435f5fbd53f2e306f078cd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_inspect.py
@@ -0,0 +1,270 @@
+from __future__ import absolute_import
+
+import inspect
+from inspect import cleandoc, getdoc, getfile, isclass, ismodule, signature
+from typing import Any, Collection, Iterable, Optional, Tuple, Type, Union
+
+from .console import Group, RenderableType
+from .control import escape_control_codes
+from .highlighter import ReprHighlighter
+from .jupyter import JupyterMixin
+from .panel import Panel
+from .pretty import Pretty
+from .table import Table
+from .text import Text, TextType
+
+
+def _first_paragraph(doc: str) -> str:
+    """Return the part of ``doc`` that precedes its first blank line."""
+    head = doc.split("\n\n", 1)[0]
+    return head
+
+
+class Inspect(JupyterMixin):
+    """A renderable to inspect any Python Object.
+
+    Args:
+        obj (Any): An object to inspect.
+        title (str, optional): Title to display over inspect result, or None use type. Defaults to None.
+        help (bool, optional): Show full help text rather than just first paragraph. Defaults to False.
+        methods (bool, optional): Enable inspection of callables. Defaults to False.
+        docs (bool, optional): Also render doc strings. Defaults to True.
+        private (bool, optional): Show private attributes (beginning with underscore). Defaults to False.
+        dunder (bool, optional): Show attributes starting with double underscore. Defaults to False.
+        sort (bool, optional): Sort attributes alphabetically. Defaults to True.
+        all (bool, optional): Show all attributes; when True, ``methods``, ``private`` and
+            ``dunder`` are all switched on. Defaults to True.
+        value (bool, optional): Pretty print value of object. Defaults to True.
+    """
+
+    def __init__(
+        self,
+        obj: Any,
+        *,
+        title: Optional[TextType] = None,
+        help: bool = False,
+        methods: bool = False,
+        docs: bool = True,
+        private: bool = False,
+        dunder: bool = False,
+        sort: bool = True,
+        all: bool = True,
+        value: bool = True,
+    ) -> None:
+        self.highlighter = ReprHighlighter()
+        self.obj = obj
+        self.title = title or self._make_title(obj)
+        # ``all`` overrides the three individual visibility switches.
+        if all:
+            methods = private = dunder = True
+        self.help = help
+        self.methods = methods
+        # Asking for full help implies rendering doc strings.
+        self.docs = docs or help
+        # Dunder names also start with "_", so showing them requires ``private``.
+        self.private = private or dunder
+        self.dunder = dunder
+        self.sort = sort
+        self.value = value
+
+    def _make_title(self, obj: Any) -> Text:
+        """Make a default title.
+
+        Classes, callables and modules are titled with ``str(obj)``; any other
+        object is titled with ``str(type(obj))``. The result is highlighted.
+        """
+        title_str = (
+            str(obj)
+            if (isclass(obj) or callable(obj) or ismodule(obj))
+            else str(type(obj))
+        )
+        title_text = self.highlighter(title_str)
+        return title_text
+
+    def __rich__(self) -> Panel:
+        """Wrap everything produced by ``_render`` in a fitted, titled panel."""
+        return Panel.fit(
+            Group(*self._render()),
+            title=self.title,
+            border_style="scope.border",
+            padding=(0, 1),
+        )
+
+    def _get_signature(self, name: str, obj: Any) -> Optional[Text]:
+        """Get a signature for a callable.
+
+        Args:
+            name (str): Name to display; when empty, ``obj.__qualname__`` is used if present.
+            obj (Any): The callable to describe.
+
+        Returns:
+            Optional[Text]: "<prefix> <name><signature>" styled text, or None when
+                ``inspect.signature`` raises TypeError for ``obj``.
+        """
+        try:
+            _signature = str(signature(obj)) + ":"
+        except ValueError:
+            # No signature could be determined: show a placeholder instead.
+            _signature = "(...)"
+        except TypeError:
+            return None
+
+        source_filename: Optional[str] = None
+        try:
+            source_filename = getfile(obj)
+        except (OSError, TypeError):
+            # OSError is raised if obj has no source file, e.g. when defined in REPL.
+            pass
+
+        callable_name = Text(name, style="inspect.callable")
+        if source_filename:
+            callable_name.stylize(f"link file://{source_filename}")
+        signature_text = self.highlighter(_signature)
+
+        qualname = name or getattr(obj, "__qualname__", name)
+
+        # If obj is a module, there may be classes (which are callable) to display
+        if inspect.isclass(obj):
+            prefix = "class"
+        elif inspect.iscoroutinefunction(obj):
+            prefix = "async def"
+        else:
+            prefix = "def"
+
+        # The style name is derived from the prefix, e.g. "inspect.async_def".
+        qual_signature = Text.assemble(
+            (f"{prefix} ", f"inspect.{prefix.replace(' ', '_')}"),
+            (qualname, "inspect.callable"),
+            signature_text,
+        )
+
+        return qual_signature
+
+    def _render(self) -> Iterable[RenderableType]:
+        """Render object.
+
+        Yields, in order: the object's own signature (if callable), its doc
+        string (if ``self.docs``), a pretty-printed value panel (if
+        ``self.value`` and the object is not a class, callable or module), and
+        finally either the attribute table or a note counting hidden attributes.
+        """
+
+        def sort_items(item: Tuple[str, Any]) -> Tuple[bool, str]:
+            # Non-callables sort before callables; names compare case-insensitively
+            # with leading/trailing underscores ignored.
+            key, (_error, value) = item
+            return (callable(value), key.strip("_").lower())
+
+        def safe_getattr(attr_name: str) -> Tuple[Any, Any]:
+            """Get attribute or any exception."""
+            try:
+                return (None, getattr(obj, attr_name))
+            except Exception as error:
+                return (error, None)
+
+        obj = self.obj
+        keys = dir(obj)
+        total_items = len(keys)
+        if not self.dunder:
+            keys = [key for key in keys if not key.startswith("__")]
+        if not self.private:
+            keys = [key for key in keys if not key.startswith("_")]
+        # Number of names removed by the two filters above.
+        not_shown_count = total_items - len(keys)
+        items = [(key, safe_getattr(key)) for key in keys]
+        if self.sort:
+            items.sort(key=sort_items)
+
+        items_table = Table.grid(padding=(0, 1), expand=False)
+        items_table.add_column(justify="right")
+        add_row = items_table.add_row
+        highlighter = self.highlighter
+
+        if callable(obj):
+            signature = self._get_signature("", obj)
+            if signature is not None:
+                yield signature
+                yield ""
+
+        if self.docs:
+            _doc = self._get_formatted_doc(obj)
+            if _doc is not None:
+                doc_text = Text(_doc, style="inspect.help")
+                doc_text = highlighter(doc_text)
+                yield doc_text
+                yield ""
+
+        if self.value and not (isclass(obj) or callable(obj) or ismodule(obj)):
+            yield Panel(
+                Pretty(obj, indent_guides=True, max_length=10, max_string=60),
+                border_style="inspect.value.border",
+            )
+            yield ""
+
+        for key, (error, value) in items:
+            key_text = Text.assemble(
+                (
+                    key,
+                    "inspect.attr.dunder" if key.startswith("__") else "inspect.attr",
+                ),
+                (" =", "inspect.equals"),
+            )
+            if error is not None:
+                # Reading the attribute raised: show the exception in place of a value.
+                warning = key_text.copy()
+                warning.stylize("inspect.error")
+                add_row(warning, highlighter(repr(error)))
+                continue
+
+            if callable(value):
+                if not self.methods:
+                    continue
+
+                _signature_text = self._get_signature(key, value)
+                if _signature_text is None:
+                    add_row(key_text, Pretty(value, highlighter=highlighter))
+                else:
+                    if self.docs:
+                        docs = self._get_formatted_doc(value)
+                        if docs is not None:
+                            # Multi-line docs start on their own line, one-liners follow inline.
+                            _signature_text.append("\n" if "\n" in docs else " ")
+                            doc = highlighter(docs)
+                            doc.stylize("inspect.doc")
+                            _signature_text.append(doc)
+
+                    add_row(key_text, _signature_text)
+            else:
+                add_row(key_text, Pretty(value, highlighter=highlighter))
+        if items_table.row_count:
+            yield items_table
+        elif not_shown_count:
+            yield Text.from_markup(
+                f"[b cyan]{not_shown_count}[/][i] attribute(s) not shown.[/i] "
+                f"Run [b][magenta]inspect[/]([not b]inspect[/])[/b] for options."
+            )
+
+    def _get_formatted_doc(self, object_: Any) -> Optional[str]:
+        """
+        Extract the docstring of an object, process it and return it.
+        The processing consists of cleaning up the docstring's indentation,
+        taking only its 1st paragraph if `self.help` is not True,
+        and escaping its control codes.
+
+        Args:
+            object_ (Any): the object to get the docstring from.
+
+        Returns:
+            Optional[str]: the processed docstring, or None if no docstring was found.
+        """
+        docs = getdoc(object_)
+        if docs is None:
+            return None
+        docs = cleandoc(docs).strip()
+        if not self.help:
+            docs = _first_paragraph(docs)
+        return escape_control_codes(docs)
+
+
+def get_object_types_mro(obj: Union[object, Type[Any]]) -> Tuple[type, ...]:
+    """Return the MRO of ``obj`` when it is a class, otherwise the MRO of ``type(obj)``."""
+    # The presence of ``__mro__`` is what identifies a class here: a
+    # ``type(obj) is type`` test would miss classes created by other
+    # metaclasses, such as abc.ABCMeta.
+    target = obj if hasattr(obj, "__mro__") else type(obj)
+    return getattr(target, "__mro__", ())
+
+
+def get_object_types_mro_as_strings(obj: object) -> Collection[str]:
+    """
+    Build the fully qualified name ("module.qualname") of every class in the MRO of
+    the object's class, or of the object itself if it's a class.
+
+    Examples:
+        `get_object_types_mro_as_strings(JSONDecoder)` will return `['json.decoder.JSONDecoder', 'builtins.object']`
+    """
+    qualified_names = []
+    for type_ in get_object_types_mro(obj):
+        module_name = getattr(type_, "__module__", "")
+        class_name = getattr(type_, "__qualname__", "")
+        qualified_names.append(f"{module_name}.{class_name}")
+    return qualified_names
+
+
+def is_object_one_of_types(
+    obj: object, fully_qualified_types_names: Collection[str]
+) -> bool:
+    """
+    Tell whether any class in the MRO of the given object's class (or of the object
+    itself, if it's a class) has a fully qualified name listed in
+    `fully_qualified_types_names`.
+    """
+    return any(
+        candidate in fully_qualified_types_names
+        for candidate in get_object_types_mro_as_strings(obj)
+    )
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_log_render.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_log_render.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc16c84437a8a34231c44d3f0a331459ddcb0f34
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_log_render.py
@@ -0,0 +1,94 @@
+from datetime import datetime
+from typing import Iterable, List, Optional, TYPE_CHECKING, Union, Callable
+
+
+from .text import Text, TextType
+
+if TYPE_CHECKING:
+    from .console import Console, ConsoleRenderable, RenderableType
+    from .table import Table
+
+# Type of a custom time formatter: receives the log datetime and returns the Text to show.
+FormatTimeCallable = Callable[[datetime], Text]
+
+
+class LogRender:
+    """Callable that lays out one log entry as a single-row grid table.
+
+    Depending on the options, the row contains (in this order) the timestamp,
+    the level, the message renderables and the source path with line number.
+    """
+
+    def __init__(
+        self,
+        show_time: bool = True,
+        show_level: bool = False,
+        show_path: bool = True,
+        time_format: Union[str, FormatTimeCallable] = "[%x %X]",
+        omit_repeated_times: bool = True,
+        level_width: Optional[int] = 8,
+    ) -> None:
+        """Remember the layout options.
+
+        Args:
+            show_time: Add a timestamp column.
+            show_level: Add a level column.
+            show_path: Add a path column (only used when a path is supplied).
+            time_format: strftime pattern, or a callable turning a datetime into Text.
+            omit_repeated_times: Blank out a timestamp equal to the previous one.
+            level_width: Width given to the level column.
+        """
+        self.show_time = show_time
+        self.show_level = show_level
+        self.show_path = show_path
+        self.time_format = time_format
+        self.omit_repeated_times = omit_repeated_times
+        self.level_width = level_width
+        # Last timestamp actually emitted, used to detect repeats.
+        self._last_time: Optional[Text] = None
+
+    def __call__(
+        self,
+        console: "Console",
+        renderables: Iterable["ConsoleRenderable"],
+        log_time: Optional[datetime] = None,
+        time_format: Optional[Union[str, FormatTimeCallable]] = None,
+        level: TextType = "",
+        path: Optional[str] = None,
+        line_no: Optional[int] = None,
+        link_path: Optional[str] = None,
+    ) -> "Table":
+        """Build the table for one log entry.
+
+        Args:
+            console: Supplies the current datetime when ``log_time`` is not given.
+            renderables: The message content.
+            log_time: Timestamp of the entry.
+            time_format: Per-call override of the configured time format.
+            level: Content of the level cell.
+            path: Text of the path cell; the cell is omitted when empty.
+            line_no: Line number appended to the path after a colon.
+            link_path: When given, path and line number become ``file://`` links.
+
+        Returns:
+            An expanding grid table holding a single row.
+        """
+        from .containers import Renderables
+        from .table import Table
+
+        include_path = bool(self.show_path and path)
+
+        grid = Table.grid(padding=(0, 1))
+        grid.expand = True
+        cells: List["RenderableType"] = []
+
+        if self.show_time:
+            grid.add_column(style="log.time")
+            moment = log_time or console.get_datetime()
+            formatter = time_format or self.time_format
+            if callable(formatter):
+                stamp = formatter(moment)
+            else:
+                stamp = Text(moment.strftime(formatter))
+            if stamp == self._last_time and self.omit_repeated_times:
+                # Same timestamp as the previous entry: keep the column width, hide the text.
+                cells.append(Text(" " * len(stamp)))
+            else:
+                cells.append(stamp)
+                self._last_time = stamp
+
+        if self.show_level:
+            grid.add_column(style="log.level", width=self.level_width)
+            cells.append(level)
+
+        grid.add_column(ratio=1, style="log.message", overflow="fold")
+        cells.append(Renderables(renderables))
+
+        if include_path:
+            grid.add_column(style="log.path")
+            location = Text()
+            location.append(
+                path, style=f"link file://{link_path}" if link_path else ""
+            )
+            if line_no:
+                location.append(":")
+                location.append(
+                    f"{line_no}",
+                    style=f"link file://{link_path}#{line_no}" if link_path else "",
+                )
+            cells.append(location)
+
+        grid.add_row(*cells)
+        return grid
+
+
+# Manual demo when the module is run as a script: print and log a right-justified
+# string carrying "[on blue]" markup.
+if __name__ == "__main__":  # pragma: no cover
+    from pip._vendor.rich.console import Console
+
+    c = Console()
+    c.print("[on blue]Hello", justify="right")
+    c.log("[on blue]hello", justify="right")
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_win32_console.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_win32_console.py
new file mode 100644
index 0000000000000000000000000000000000000000..81b1082905338a74b72b9de432ece50a456687bc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/_win32_console.py
@@ -0,0 +1,662 @@
+"""Light wrapper around the Win32 Console API - this module should only be imported on Windows
+
+The API that this module wraps is documented at https://docs.microsoft.com/en-us/windows/console/console-functions
+"""
+import ctypes
+import sys
+from typing import Any
+
+# DLL loader used for every kernel32 binding below. It is only created on
+# Windows; on any other platform importing this module fails immediately.
+windll: Any = None
+if sys.platform == "win32":
+    windll = ctypes.LibraryLoader(ctypes.WinDLL)
+else:
+    raise ImportError(f"{__name__} can only be imported on Windows")
+
+import time
+from ctypes import Structure, byref, wintypes
+from typing import IO, NamedTuple, Type, cast
+
+from pip._vendor.rich.color import ColorSystem
+from pip._vendor.rich.style import Style
+
+# Identifier passed to GetStdHandle to select standard output (its default argument).
+STDOUT = -11
+# NOTE(review): presumably the console-mode flag of the same name from the Win32
+# API; it is not referenced in the visible part of this module.
+ENABLE_VIRTUAL_TERMINAL_PROCESSING = 4
+
+# Short alias for the ctypes coordinate structure used in signatures below.
+COORD = wintypes._COORD
+
+
+class LegacyWindowsError(Exception):
+    """Raised when a call into the legacy Windows Console API reports failure."""
+
+    pass
+
+
+class WindowsCoordinates(NamedTuple):
+    """A zero-based (row, col) position in the console.
+
+    The fields are named so that callers never have to remember the argument
+    order of the underlying wintypes._COORD structure; ``from_param`` performs
+    the conversion, which lets this class stand in for wintypes._COORD in
+    arguments and argtypes.
+    """
+
+    row: int
+    col: int
+
+    @classmethod
+    def from_param(cls, value: "WindowsCoordinates") -> COORD:
+        """Build the wintypes _COORD structure for ``value``.
+
+        ctypes calls this classmethod itself when an instance is passed to a
+        foreign function whose argtypes list this class.
+
+        Args:
+            value (WindowsCoordinates): The coordinates to convert.
+
+        Returns:
+            wintypes._COORD: Structure created as ``COORD(col, row)``.
+        """
+        first, second = value.col, value.row
+        return COORD(first, second)
+
+
+# ctypes layout of the structure filled in by GetConsoleScreenBufferInfo.
+# This module reads dwSize (screen size), dwCursorPosition (cursor position)
+# and wAttributes (current text attributes) from it.
+class CONSOLE_SCREEN_BUFFER_INFO(Structure):
+    _fields_ = [
+        ("dwSize", COORD),
+        ("dwCursorPosition", COORD),
+        ("wAttributes", wintypes.WORD),
+        ("srWindow", wintypes.SMALL_RECT),
+        ("dwMaximumWindowSize", COORD),
+    ]
+
+
+# ctypes layout of the structure exchanged with GetConsoleCursorInfo and
+# SetConsoleCursorInfo: the cursor size and its visibility flag.
+class CONSOLE_CURSOR_INFO(ctypes.Structure):
+    _fields_ = [("dwSize", wintypes.DWORD), ("bVisible", wintypes.BOOL)]
+
+
+_GetStdHandle = windll.kernel32.GetStdHandle
+_GetStdHandle.argtypes = [wintypes.DWORD]
+_GetStdHandle.restype = wintypes.HANDLE
+
+
+def GetStdHandle(handle: int = STDOUT) -> wintypes.HANDLE:
+    """Fetch the handle of a standard device (standard input, standard output, or standard error).
+
+    Args:
+        handle (int): Integer identifier of the device. Defaults to -11 (stdout).
+
+    Returns:
+        wintypes.HANDLE: The handle returned by the Windows API.
+    """
+    raw_handle = _GetStdHandle(handle)
+    return cast(wintypes.HANDLE, raw_handle)
+
+
+_GetConsoleMode = windll.kernel32.GetConsoleMode
+_GetConsoleMode.restype = wintypes.BOOL
+_GetConsoleMode.argtypes = [wintypes.HANDLE, wintypes.LPDWORD]
+
+
+def GetConsoleMode(std_handle: wintypes.HANDLE) -> int:
+    """Read the current input mode of a console's input buffer, or the current
+    output mode of a console screen buffer.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+
+    Raises:
+        LegacyWindowsError: If the Windows console API reports failure.
+
+    Returns:
+        int: The console mode value, as documented at
+            https://docs.microsoft.com/en-us/windows/console/getconsolemode#parameters
+    """
+    mode = wintypes.DWORD()
+    if not _GetConsoleMode(std_handle, mode):
+        raise LegacyWindowsError("Unable to get legacy Windows Console Mode")
+    return mode.value
+
+
+_FillConsoleOutputCharacterW = windll.kernel32.FillConsoleOutputCharacterW
+_FillConsoleOutputCharacterW.argtypes = [
+    wintypes.HANDLE,
+    # The "W" entry point takes a wide character, not a single byte.
+    wintypes.WCHAR,
+    wintypes.DWORD,
+    cast(Type[COORD], WindowsCoordinates),
+    ctypes.POINTER(wintypes.DWORD),
+]
+_FillConsoleOutputCharacterW.restype = wintypes.BOOL
+
+
+def FillConsoleOutputCharacter(
+    std_handle: wintypes.HANDLE,
+    char: str,
+    length: int,
+    start: WindowsCoordinates,
+) -> int:
+    """Writes a character to the console screen buffer a specified number of times, beginning at the specified coordinates.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+        char (str): The character to write. Must be a string of length 1; it is
+            passed as a wide character, so it does not have to be ASCII.
+        length (int): The number of times to write the character.
+        start (WindowsCoordinates): The coordinates to start writing at.
+
+    Returns:
+        int: The number of characters written.
+    """
+    # Passed as WCHAR rather than an encoded byte, so a non-ASCII ``char`` is
+    # not rejected for encoding to more than one byte.
+    character = wintypes.WCHAR(char)
+    num_characters = wintypes.DWORD(length)
+    num_written = wintypes.DWORD(0)
+    _FillConsoleOutputCharacterW(
+        std_handle,
+        character,
+        num_characters,
+        start,
+        byref(num_written),
+    )
+    return num_written.value
+
+
+_FillConsoleOutputAttribute = windll.kernel32.FillConsoleOutputAttribute
+_FillConsoleOutputAttribute.argtypes = [
+    wintypes.HANDLE,
+    wintypes.WORD,
+    wintypes.DWORD,
+    cast(Type[COORD], WindowsCoordinates),
+    ctypes.POINTER(wintypes.DWORD),
+]
+_FillConsoleOutputAttribute.restype = wintypes.BOOL
+
+
+def FillConsoleOutputAttribute(
+    std_handle: wintypes.HANDLE,
+    attributes: int,
+    length: int,
+    start: WindowsCoordinates,
+) -> int:
+    """Apply character attributes to a run of character cells in a screen buffer,
+    starting at the given coordinates.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+        attributes (int): Integer value representing the foreground and background colours of the cells.
+        length (int): How many cells should receive the attributes.
+        start (WindowsCoordinates): The coordinates of the first cell to change.
+
+    Returns:
+        int: The number of cells whose attributes were actually set.
+    """
+    cells_changed = wintypes.DWORD(0)
+    cell_count = wintypes.DWORD(length)
+    attribute_word = wintypes.WORD(attributes)
+    _FillConsoleOutputAttribute(
+        std_handle, attribute_word, cell_count, start, byref(cells_changed)
+    )
+    return cells_changed.value
+
+
+_SetConsoleTextAttribute = windll.kernel32.SetConsoleTextAttribute
+_SetConsoleTextAttribute.argtypes = [wintypes.HANDLE, wintypes.WORD]
+_SetConsoleTextAttribute.restype = wintypes.BOOL
+
+
+def SetConsoleTextAttribute(
+    std_handle: wintypes.HANDLE, attributes: wintypes.WORD
+) -> bool:
+    """Choose the colour attributes used for all text written after this call.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+        attributes (int): Integer value representing the foreground and background colours.
+
+    Returns:
+        bool: True if the attribute was set successfully, otherwise False.
+    """
+    outcome = _SetConsoleTextAttribute(std_handle, attributes)
+    return bool(outcome)
+
+
+_GetConsoleScreenBufferInfo = windll.kernel32.GetConsoleScreenBufferInfo
+_GetConsoleScreenBufferInfo.argtypes = [
+    wintypes.HANDLE,
+    ctypes.POINTER(CONSOLE_SCREEN_BUFFER_INFO),
+]
+_GetConsoleScreenBufferInfo.restype = wintypes.BOOL
+
+
+def GetConsoleScreenBufferInfo(
+    std_handle: wintypes.HANDLE,
+) -> CONSOLE_SCREEN_BUFFER_INFO:
+    """Query information about the given console screen buffer.
+
+    The result of the underlying API call is not checked.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+
+    Returns:
+        CONSOLE_SCREEN_BUFFER_INFO: A ctypes struct carrying the screen size, cursor
+            position, colour attributes, and more.
+    """
+    info = CONSOLE_SCREEN_BUFFER_INFO()
+    _GetConsoleScreenBufferInfo(std_handle, byref(info))
+    return info
+
+
+_SetConsoleCursorPosition = windll.kernel32.SetConsoleCursorPosition
+_SetConsoleCursorPosition.argtypes = [
+    wintypes.HANDLE,
+    cast(Type[COORD], WindowsCoordinates),
+]
+_SetConsoleCursorPosition.restype = wintypes.BOOL
+
+
+def SetConsoleCursorPosition(
+    std_handle: wintypes.HANDLE, coords: WindowsCoordinates
+) -> bool:
+    """Move the console cursor to the given coordinates.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+        coords (WindowsCoordinates): The coordinates to move the cursor to.
+
+    Returns:
+        bool: True if the function succeeds, otherwise False.
+    """
+    moved = _SetConsoleCursorPosition(std_handle, coords)
+    return bool(moved)
+
+
+_GetConsoleCursorInfo = windll.kernel32.GetConsoleCursorInfo
+_GetConsoleCursorInfo.argtypes = [
+    wintypes.HANDLE,
+    ctypes.POINTER(CONSOLE_CURSOR_INFO),
+]
+_GetConsoleCursorInfo.restype = wintypes.BOOL
+
+
+def GetConsoleCursorInfo(
+    std_handle: wintypes.HANDLE, cursor_info: CONSOLE_CURSOR_INFO
+) -> bool:
+    """Read the cursor info (cursor visibility and width) into ``cursor_info``.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+        cursor_info (CONSOLE_CURSOR_INFO): CONSOLE_CURSOR_INFO ctype struct that receives information
+            about the console's cursor.
+
+    Returns:
+        bool: True if the function succeeds, otherwise False.
+    """
+    info_ref = byref(cursor_info)
+    return bool(_GetConsoleCursorInfo(std_handle, info_ref))
+
+
+_SetConsoleCursorInfo = windll.kernel32.SetConsoleCursorInfo
+_SetConsoleCursorInfo.argtypes = [
+    wintypes.HANDLE,
+    ctypes.POINTER(CONSOLE_CURSOR_INFO),
+]
+_SetConsoleCursorInfo.restype = wintypes.BOOL
+
+
+def SetConsoleCursorInfo(
+    std_handle: wintypes.HANDLE, cursor_info: CONSOLE_CURSOR_INFO
+) -> bool:
+    """Apply new cursor info, used to adjust cursor visibility and width.
+
+    Args:
+        std_handle (wintypes.HANDLE): A handle to the console input buffer or the console screen buffer.
+        cursor_info (CONSOLE_CURSOR_INFO): CONSOLE_CURSOR_INFO ctype struct containing the new cursor info.
+
+    Returns:
+        bool: True if the function succeeds, otherwise False.
+    """
+    info_ref = byref(cursor_info)
+    return bool(_SetConsoleCursorInfo(std_handle, info_ref))
+
+
+_SetConsoleTitle = windll.kernel32.SetConsoleTitleW
+_SetConsoleTitle.restype = wintypes.BOOL
+_SetConsoleTitle.argtypes = [wintypes.LPCWSTR]
+
+
+def SetConsoleTitle(title: str) -> bool:
+    """Change the title of the current console window.
+
+    Args:
+        title (str): The new title of the console window.
+
+    Returns:
+        bool: True if the function succeeds, otherwise False.
+    """
+    changed = _SetConsoleTitle(title)
+    return bool(changed)
+
+
+class LegacyWindowsTerm:
+    """This class allows interaction with the legacy Windows Console API. It should only be used in the context
+    of environments where virtual terminal processing is not available. However, if it is used in a Windows environment,
+    the entire API should work.
+
+    Args:
+        file (IO[str]): The file which the Windows Console API HANDLE is retrieved from, defaults to sys.stdout.
+    """
+
+    # Bit separating the bright ANSI colour numbers (8-15) from the normal ones
+    # (0-7); write_styled sets it for bold text and clears it for dim text.
+    BRIGHT_BIT = 8
+
+    # Indices are ANSI color numbers, values are the corresponding Windows Console API color numbers
+    ANSI_TO_WINDOWS = [
+        0,  # black                      The Windows colours are defined in wincon.h as follows:
+        4,  # red                         define FOREGROUND_BLUE            0x0001 -- 0000 0001
+        2,  # green                       define FOREGROUND_GREEN           0x0002 -- 0000 0010
+        6,  # yellow                      define FOREGROUND_RED             0x0004 -- 0000 0100
+        1,  # blue                        define FOREGROUND_INTENSITY       0x0008 -- 0000 1000
+        5,  # magenta                     define BACKGROUND_BLUE            0x0010 -- 0001 0000
+        3,  # cyan                        define BACKGROUND_GREEN           0x0020 -- 0010 0000
+        7,  # white                       define BACKGROUND_RED             0x0040 -- 0100 0000
+        8,  # bright black (grey)         define BACKGROUND_INTENSITY       0x0080 -- 1000 0000
+        12,  # bright red
+        10,  # bright green
+        14,  # bright yellow
+        9,  # bright blue
+        13,  # bright magenta
+        11,  # bright cyan
+        15,  # bright white
+    ]
+
+    def __init__(self, file: "IO[str]") -> None:
+        """Capture the stdout console handle, its current text attributes and ``file``.
+
+        Args:
+            file (IO[str]): Stream whose ``write`` and ``flush`` are used for text output.
+        """
+        self._handle = GetStdHandle(STDOUT)
+        attributes = GetConsoleScreenBufferInfo(self._handle).wAttributes
+        self._default_text = attributes
+
+        # Low three bits are the foreground colour, bits 4-6 the background colour.
+        foreground = attributes & 7
+        background = (attributes >> 4) & 7
+        self._default_fore = foreground
+        self._default_back = background
+        self._default_attrs = foreground | (background << 4)
+
+        self._file = file
+        self.write = file.write
+        self.flush = file.flush
+
+    @property
+    def cursor_position(self) -> WindowsCoordinates:
+        """The cursor's current position (0-based), read from the screen buffer info.
+
+        Returns:
+            WindowsCoordinates: The current cursor position.
+        """
+        buffer_info = GetConsoleScreenBufferInfo(self._handle)
+        position: COORD = buffer_info.dwCursorPosition
+        current_row = cast(int, position.Y)
+        current_col = cast(int, position.X)
+        return WindowsCoordinates(row=current_row, col=current_col)
+
+    @property
+    def screen_size(self) -> WindowsCoordinates:
+        """The current size of the console screen buffer, in character rows and columns.
+
+        Returns:
+            WindowsCoordinates: ``row`` holds the height and ``col`` the width of the screen.
+        """
+        buffer_info = GetConsoleScreenBufferInfo(self._handle)
+        size: COORD = buffer_info.dwSize
+        height = cast(int, size.Y)
+        width = cast(int, size.X)
+        return WindowsCoordinates(row=height, col=width)
+
+    def write_text(self, text: str) -> None:
+        """Write text directly to the terminal without any modification of styles
+
+        The underlying file is flushed after every write.
+
+        Args:
+            text (str): The text to write to the console
+        """
+        self.write(text)
+        self.flush()
+
+    def write_styled(self, text: str, style: Style) -> None:
+        """Write ``text`` using the colours of ``style``, then restore the default attributes.
+
+        Args:
+            text (str): The text to write
+            style (Style): The style of the text
+        """
+        foreground, background = style.color, style.bgcolor
+        if style.reverse:
+            foreground, background = background, foreground
+
+        if not foreground:
+            fore = self._default_fore
+        else:
+            ansi_fore = foreground.downgrade(ColorSystem.WINDOWS).number
+            if ansi_fore is None:
+                ansi_fore = 7  # Default to ANSI 7: White
+            if style.bold:
+                ansi_fore |= self.BRIGHT_BIT
+            if style.dim:
+                ansi_fore &= ~self.BRIGHT_BIT
+            fore = self.ANSI_TO_WINDOWS[ansi_fore]
+
+        if not background:
+            back = self._default_back
+        else:
+            ansi_back = background.downgrade(ColorSystem.WINDOWS).number
+            if ansi_back is None:
+                ansi_back = 0  # Default to ANSI 0: Black
+            back = self.ANSI_TO_WINDOWS[ansi_back]
+
+        assert fore is not None
+        assert back is not None
+
+        # Foreground occupies the low nibble, background the next one.
+        combined = ctypes.c_ushort(fore | (back << 4))
+        SetConsoleTextAttribute(self._handle, attributes=combined)
+        self.write_text(text)
+        SetConsoleTextAttribute(self._handle, attributes=self._default_text)
+
+    def move_cursor_to(self, new_position: WindowsCoordinates) -> None:
+        """Move the cursor to ``new_position``; positions with a negative component are ignored.
+
+        Args:
+            new_position (WindowsCoordinates): The WindowsCoordinates representing the new position of the cursor.
+        """
+        if new_position.col >= 0 and new_position.row >= 0:
+            SetConsoleCursorPosition(self._handle, coords=new_position)
+
+    def erase_line(self) -> None:
+        """Blank the whole line the cursor is on and reset its attributes to the defaults"""
+        width = self.screen_size.col
+        line_start = WindowsCoordinates(row=self.cursor_position.row, col=0)
+        FillConsoleOutputCharacter(self._handle, " ", length=width, start=line_start)
+        FillConsoleOutputAttribute(
+            self._handle, self._default_attrs, length=width, start=line_start
+        )
+
+    def erase_end_of_line(self) -> None:
+        """Blank the current line from the cursor position up to its end"""
+        here = self.cursor_position
+        remaining = self.screen_size.col - here.col
+        FillConsoleOutputCharacter(self._handle, " ", length=remaining, start=here)
+        FillConsoleOutputAttribute(
+            self._handle, self._default_attrs, length=remaining, start=here
+        )
+
+    def erase_start_of_line(self) -> None:
+        """Blank the current line from its first column up to the cursor position"""
+        here = self.cursor_position
+        line_start = WindowsCoordinates(here.row, 0)
+        cell_count = here.col
+        FillConsoleOutputCharacter(
+            self._handle, " ", length=cell_count, start=line_start
+        )
+        FillConsoleOutputAttribute(
+            self._handle, self._default_attrs, length=cell_count, start=line_start
+        )
+
+    def move_cursor_up(self) -> None:
+        """Move the cursor one row up, keeping its column"""
+        here = self.cursor_position
+        target = WindowsCoordinates(row=here.row - 1, col=here.col)
+        SetConsoleCursorPosition(self._handle, coords=target)
+
+    def move_cursor_down(self) -> None:
+        """Move the cursor one row down, keeping its column"""
+        here = self.cursor_position
+        target = WindowsCoordinates(row=here.row + 1, col=here.col)
+        SetConsoleCursorPosition(self._handle, coords=target)
+
+    def move_cursor_forward(self) -> None:
+        """Move the cursor forward a single cell. Wrap to the next line if required."""
+        row, col = self.cursor_position
+        if col == self.screen_size.col - 1:
+            row += 1
+            col = 0
+        else:
+            col += 1
+        SetConsoleCursorPosition(
+            self._handle, coords=WindowsCoordinates(row=row, col=col)
+        )
+
+    def move_cursor_to_column(self, column: int) -> None:
+        """Move cursor to the column specified by the zero-based column index, staying on the same row
+
+        Args:
+            column (int): The zero-based column index to move the cursor to.
+        """
+        row, _ = self.cursor_position
+        SetConsoleCursorPosition(self._handle, coords=WindowsCoordinates(row, column))
+
+    def move_cursor_backward(self) -> None:
+        """Move the cursor backward a single cell. Wrap to the previous line if required."""
+        row, col = self.cursor_position
+        if col == 0:
+            row -= 1
+            col = self.screen_size.col - 1
+        else:
+            col -= 1
+        SetConsoleCursorPosition(
+            self._handle, coords=WindowsCoordinates(row=row, col=col)
+        )
+
+    def hide_cursor(self) -> None:
+        """Hide the cursor"""
+        current_cursor_size = self._get_cursor_size()
+        invisible_cursor = CONSOLE_CURSOR_INFO(dwSize=current_cursor_size, bVisible=0)
+        SetConsoleCursorInfo(self._handle, cursor_info=invisible_cursor)
+
+    def show_cursor(self) -> None:
+        """Show the cursor"""
+        current_cursor_size = self._get_cursor_size()
+        visible_cursor = CONSOLE_CURSOR_INFO(dwSize=current_cursor_size, bVisible=1)
+        SetConsoleCursorInfo(self._handle, cursor_info=visible_cursor)
+
+    def set_title(self, title: str) -> None:
+        """Set the title of the terminal window
+
+        Args:
+            title (str): The new title of the console window
+        """
+        assert len(title) < 255, "Console title must be less than 255 characters"
+        SetConsoleTitle(title)
+
+    def _get_cursor_size(self) -> int:
+        """Get the percentage of the character cell that is filled by the cursor"""
+        cursor_info = CONSOLE_CURSOR_INFO()
+        GetConsoleCursorInfo(self._handle, cursor_info=cursor_info)
+        return int(cursor_info.dwSize)
+
+
+if __name__ == "__main__":
+    handle = GetStdHandle()
+
+    from pip._vendor.rich.console import Console
+
+    console = Console()
+
+    term = LegacyWindowsTerm(sys.stdout)
+    term.set_title("Win32 Console Examples")
+
+    style = Style(color="black", bgcolor="red")
+
+    heading = Style.parse("black on green")
+
+    # Check colour output
+    console.rule("Checking colour output")
+    console.print("[on red]on red!")
+    console.print("[blue]blue!")
+    console.print("[yellow]yellow!")
+    console.print("[bold yellow]bold yellow!")
+    console.print("[bright_yellow]bright_yellow!")
+    console.print("[dim bright_yellow]dim bright_yellow!")
+    console.print("[italic cyan]italic cyan!")
+    console.print("[bold white on blue]bold white on blue!")
+    console.print("[reverse bold white on blue]reverse bold white on blue!")
+    console.print("[bold black on cyan]bold black on cyan!")
+    console.print("[black on green]black on green!")
+    console.print("[blue on green]blue on green!")
+    console.print("[white on black]white on black!")
+    console.print("[black on white]black on white!")
+    console.print("[#1BB152 on #DA812D]#1BB152 on #DA812D!")
+
+    # Check cursor movement
+    console.rule("Checking cursor movement")
+    console.print()
+    term.move_cursor_backward()
+    term.move_cursor_backward()
+    term.write_text("went back and wrapped to prev line")
+    time.sleep(1)
+    term.move_cursor_up()
+    term.write_text("we go up")
+    time.sleep(1)
+    term.move_cursor_down()
+    term.write_text("and down")
+    time.sleep(1)
+    term.move_cursor_up()
+    term.move_cursor_backward()
+    term.move_cursor_backward()
+    term.write_text("we went up and back 2")
+    time.sleep(1)
+    term.move_cursor_down()
+    term.move_cursor_backward()
+    term.move_cursor_backward()
+    term.write_text("we went down and back 2")
+    time.sleep(1)
+
+    # Check erasing of lines
+    term.hide_cursor()
+    console.print()
+    console.rule("Checking line erasing")
+    console.print("\n...Deleting to the start of the line...")
+    term.write_text("The red arrow shows the cursor location, and direction of erase")
+    time.sleep(1)
+    term.move_cursor_to_column(16)
+    term.write_styled("<", Style.parse("black on red"))
+    term.move_cursor_backward()
+    time.sleep(1)
+    term.erase_start_of_line()
+    time.sleep(1)
+
+    console.print("\n\n...And to the end of the line...")
+    term.write_text("The red arrow shows the cursor location, and direction of erase")
+    time.sleep(1)
+
+    term.move_cursor_to_column(16)
+    term.write_styled(">", Style.parse("black on red"))
+    time.sleep(1)
+    term.erase_end_of_line()
+    time.sleep(1)
+
+    console.print("\n\n...Now the whole line will be erased...")
+    term.write_styled("I'm going to disappear!", style=Style.parse("black on cyan"))
+    time.sleep(1)
+    term.erase_line()
+
+    term.show_cursor()
+    print("\n")
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/abc.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/abc.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6e498efabfab0dcf31cd7731f8f821cc423bc4f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/abc.py
@@ -0,0 +1,33 @@
+from abc import ABC
+
+
+class RichRenderable(ABC):
+    """An abstract base class for Rich renderables.
+
+    Note that there is no need to extend this class, the intended use is to check if an
+    object supports the Rich renderable protocol. For example::
+
+        if isinstance(my_object, RichRenderable):
+            console.print(my_object)
+
+    """
+
+    @classmethod
+    def __subclasshook__(cls, other: type) -> bool:
+        """Check if this class supports the rich render protocol."""
+        return hasattr(other, "__rich_console__") or hasattr(other, "__rich__")
+
+
+if __name__ == "__main__":  # pragma: no cover
+    from pip._vendor.rich.text import Text
+
+    t = Text()
+    print(isinstance(Text, RichRenderable))
+    print(isinstance(t, RichRenderable))
+
+    class Foo:
+        pass
+
+    f = Foo()
+    print(isinstance(f, RichRenderable))
+    print(isinstance("", RichRenderable))
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/bar.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/bar.py
new file mode 100644
index 0000000000000000000000000000000000000000..022284b57881d8b133aced5b5a843e6447bb4e0b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/bar.py
@@ -0,0 +1,93 @@
+from typing import Optional, Union
+
+from .color import Color
+from .console import Console, ConsoleOptions, RenderResult
+from .jupyter import JupyterMixin
+from .measure import Measurement
+from .segment import Segment
+from .style import Style
+
+# There are left-aligned characters for 1/8 to 7/8, but
+# the right-aligned characters exist only for 1/8 and 4/8.
+BEGIN_BLOCK_ELEMENTS = ["█", "█", "█", "▐", "▐", "▐", "▕", "▕"]
+END_BLOCK_ELEMENTS = [" ", "▏", "▎", "▍", "▌", "▋", "▊", "▉"]
+FULL_BLOCK = "█"
+
+
+class Bar(JupyterMixin):
+    """Renders a solid block bar.
+
+    Args:
+        size (float): Value for the end of the bar.
+        begin (float): Begin point (between 0 and size, inclusive).
+        end (float): End point (between 0 and size, inclusive).
+        width (int, optional): Width of the bar, or ``None`` for maximum width. Defaults to None.
+        color (Union[Color, str], optional): Color of the bar. Defaults to "default".
+        bgcolor (Union[Color, str], optional): Color of bar background. Defaults to "default".
+    """
+
+    def __init__(
+        self,
+        size: float,
+        begin: float,
+        end: float,
+        *,
+        width: Optional[int] = None,
+        color: Union[Color, str] = "default",
+        bgcolor: Union[Color, str] = "default",
+    ):
+        self.size = size
+        self.begin = max(begin, 0)
+        self.end = min(end, size)
+        self.width = width
+        self.style = Style(color=color, bgcolor=bgcolor)
+
+    def __repr__(self) -> str:
+        return f"Bar({self.size}, {self.begin}, {self.end})"
+
+    def __rich_console__(
+        self, console: Console, options: ConsoleOptions
+    ) -> RenderResult:
+        width = min(
+            self.width if self.width is not None else options.max_width,
+            options.max_width,
+        )
+
+        if self.begin >= self.end:
+            yield Segment(" " * width, self.style)
+            yield Segment.line()
+            return
+
+        prefix_complete_eights = int(width * 8 * self.begin / self.size)
+        prefix_bar_count = prefix_complete_eights // 8
+        prefix_eights_count = prefix_complete_eights % 8
+
+        body_complete_eights = int(width * 8 * self.end / self.size)
+        body_bar_count = body_complete_eights // 8
+        body_eights_count = body_complete_eights % 8
+
+        # When start and end fall into the same cell, we ideally should render
+        # a symbol that's "center-aligned", but there is no good symbol in Unicode.
+        # In this case, we fall back to right-aligned block symbol for simplicity.
+
+        prefix = " " * prefix_bar_count
+        if prefix_eights_count:
+            prefix += BEGIN_BLOCK_ELEMENTS[prefix_eights_count]
+
+        body = FULL_BLOCK * body_bar_count
+        if body_eights_count:
+            body += END_BLOCK_ELEMENTS[body_eights_count]
+
+        suffix = " " * (width - len(body))
+
+        yield Segment(prefix + body[len(prefix) :] + suffix, self.style)
+        yield Segment.line()
+
+    def __rich_measure__(
+        self, console: Console, options: ConsoleOptions
+    ) -> Measurement:
+        return (
+            Measurement(self.width, self.width)
+            if self.width is not None
+            else Measurement(4, options.max_width)
+        )
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/errors.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bcbe53ef59373c608e62ea285536f8b22b47ecb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/errors.py
@@ -0,0 +1,34 @@
+class ConsoleError(Exception):
+    """An error in console operation."""
+
+
+class StyleError(Exception):
+    """An error in styles."""
+
+
+class StyleSyntaxError(ConsoleError):
+    """Style was badly formatted."""
+
+
+class MissingStyle(StyleError):
+    """No such style."""
+
+
+class StyleStackError(ConsoleError):
+    """Style stack is invalid."""
+
+
+class NotRenderableError(ConsoleError):
+    """Object is not renderable."""
+
+
+class MarkupError(ConsoleError):
+    """Markup was badly formatted."""
+
+
+class LiveError(ConsoleError):
+    """Error related to Live display."""
+
+
+class NoAltScreen(ConsoleError):
+    """Alt screen mode was required."""
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/filesize.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/filesize.py
new file mode 100644
index 0000000000000000000000000000000000000000..99f118e20103174993b865cfb43ac6b6e00296a4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/filesize.py
@@ -0,0 +1,89 @@
+# coding: utf-8
+"""Functions for reporting filesizes. Borrowed from https://github.com/PyFilesystem/pyfilesystem2
+
+The functions declared in this module should cover the different
+use cases needed to generate a string representation of a file size
+using several different units. There are many standards regarding file
+size units; this copy implements only the SI (powers of 1000) ``decimal``.
+
+See Also:
+    * `Wikipedia: Binary prefix <https://en.wikipedia.org/wiki/Binary_prefix>`_
+
+"""
+
+__all__ = ["decimal"]
+
+from typing import Iterable, List, Optional, Tuple
+
+
+def _to_str(
+    size: int,
+    suffixes: Iterable[str],
+    base: int,
+    *,
+    precision: Optional[int] = 1,
+    separator: Optional[str] = " ",
+) -> str:
+    if size == 1:
+        return "1 byte"
+    elif size < base:
+        return "{:,} bytes".format(size)
+
+    for i, suffix in enumerate(suffixes, 2):  # noqa: B007
+        unit = base**i
+        if size < unit:
+            break
+    return "{:,.{precision}f}{separator}{}".format(
+        (base * size / unit),
+        suffix,
+        precision=precision,
+        separator=separator,
+    )
+
+
+def pick_unit_and_suffix(size: int, suffixes: List[str], base: int) -> Tuple[int, str]:
+    """Pick a suffix and base for the given size."""
+    for i, suffix in enumerate(suffixes):
+        unit = base**i
+        if size < unit * base:
+            break
+    return unit, suffix
+
+
+def decimal(
+    size: int,
+    *,
+    precision: Optional[int] = 1,
+    separator: Optional[str] = " ",
+) -> str:
+    """Convert a filesize in to a string (powers of 1000, SI prefixes).
+
+    In this convention, ``1000 B = 1 kB``.
+
+    This is typically the format used to advertise the storage
+    capacity of USB flash drives and the like (*256 MB* meaning
+    actually a storage capacity of more than *256 000 000 B*),
+    or used by **Mac OS X** since v10.6 to report file sizes.
+
+    Arguments:
+        size (int): A file size.
+        precision (int): The number of decimal places to include (default = 1).
+        separator (str): The string to separate the value from the units (default = " ").
+
+    Returns:
+        `str`: A string containing an abbreviated file size and units.
+
+    Example:
+        >>> filesize.decimal(30000)
+        '30.0 kB'
+        >>> filesize.decimal(30000, precision=2, separator="")
+        '30.00kB'
+
+    """
+    return _to_str(
+        size,
+        ("kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"),
+        1000,
+        precision=precision,
+        separator=separator,
+    )
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/markup.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/markup.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6171878f823183ee8f77195b3e944be222006dc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/markup.py
@@ -0,0 +1,251 @@
+import re
+from ast import literal_eval
+from operator import attrgetter
+from typing import Callable, Iterable, List, Match, NamedTuple, Optional, Tuple, Union
+
+from ._emoji_replace import _emoji_replace
+from .emoji import EmojiVariant
+from .errors import MarkupError
+from .style import Style
+from .text import Span, Text
+
+RE_TAGS = re.compile(
+    r"""((\\*)\[([a-z#/@][^[]*?)])""",
+    re.VERBOSE,
+)
+
+RE_HANDLER = re.compile(r"^([\w.]*?)(\(.*?\))?$")
+
+
+class Tag(NamedTuple):
+    """A tag in console markup."""
+
+    name: str
+    """The tag name. e.g. 'bold'."""
+    parameters: Optional[str]
+    """Any additional parameters after the name."""
+
+    def __str__(self) -> str:
+        return (
+            self.name if self.parameters is None else f"{self.name} {self.parameters}"
+        )
+
+    @property
+    def markup(self) -> str:
+        """Get the string representation of this tag."""
+        return (
+            f"[{self.name}]"
+            if self.parameters is None
+            else f"[{self.name}={self.parameters}]"
+        )
+
+
+_ReStringMatch = Match[str]  # regex match object
+_ReSubCallable = Callable[[_ReStringMatch], str]  # Callable invoked by re.sub
+_EscapeSubMethod = Callable[[_ReSubCallable, str], str]  # Sub method of a compiled re
+
+
+def escape(
+    markup: str,
+    _escape: _EscapeSubMethod = re.compile(r"(\\*)(\[[a-z#/@][^[]*?])").sub,
+) -> str:
+    """Escapes text so that it won't be interpreted as markup.
+
+    Args:
+        markup (str): Content to be inserted in to markup.
+
+    Returns:
+        str: Markup with square brackets escaped.
+    """
+
+    def escape_backslashes(match: Match[str]) -> str:
+        """Called by re.sub replace matches."""
+        backslashes, text = match.groups()
+        return f"{backslashes}{backslashes}\\{text}"
+
+    markup = _escape(escape_backslashes, markup)
+    if markup.endswith("\\") and not markup.endswith("\\\\"):
+        return markup + "\\"
+
+    return markup
+
+
+def _parse(markup: str) -> Iterable[Tuple[int, Optional[str], Optional[Tag]]]:
+    """Parse markup in to an iterable of tuples of (position, text, tag).
+
+    Args:
+        markup (str): A string containing console markup
+
+    """
+    position = 0
+    _divmod = divmod
+    _Tag = Tag
+    for match in RE_TAGS.finditer(markup):
+        full_text, escapes, tag_text = match.groups()
+        start, end = match.span()
+        if start > position:
+            yield start, markup[position:start], None
+        if escapes:
+            backslashes, escaped = _divmod(len(escapes), 2)
+            if backslashes:
+                # Literal backslashes
+                yield start, "\\" * backslashes, None
+                start += backslashes * 2
+            if escaped:
+                # Escape of tag
+                yield start, full_text[len(escapes) :], None
+                position = end
+                continue
+        text, equals, parameters = tag_text.partition("=")
+        yield start, None, _Tag(text, parameters if equals else None)
+        position = end
+    if position < len(markup):
+        yield position, markup[position:], None
+
+
+def render(
+    markup: str,
+    style: Union[str, Style] = "",
+    emoji: bool = True,
+    emoji_variant: Optional[EmojiVariant] = None,
+) -> Text:
+    """Render console markup in to a Text instance.
+
+    Args:
+        markup (str): A string containing console markup.
+        style (Union[str, Style]): The style to use.
+        emoji (bool, optional): Also render emoji code. Defaults to True.
+        emoji_variant (str, optional): Optional emoji variant, either "text" or "emoji". Defaults to None.
+
+
+    Raises:
+        MarkupError: If there is a syntax error in the markup.
+
+    Returns:
+        Text: A text instance.
+    """
+    emoji_replace = _emoji_replace
+    if "[" not in markup:
+        return Text(
+            emoji_replace(markup, default_variant=emoji_variant) if emoji else markup,
+            style=style,
+        )
+    text = Text(style=style)
+    append = text.append
+    normalize = Style.normalize
+
+    style_stack: List[Tuple[int, Tag]] = []
+    pop = style_stack.pop
+
+    spans: List[Span] = []
+    append_span = spans.append
+
+    _Span = Span
+    _Tag = Tag
+
+    def pop_style(style_name: str) -> Tuple[int, Tag]:
+        """Pop tag matching given style name."""
+        for index, (_, tag) in enumerate(reversed(style_stack), 1):
+            if tag.name == style_name:
+                return pop(-index)
+        raise KeyError(style_name)
+
+    for position, plain_text, tag in _parse(markup):
+        if plain_text is not None:
+            # Handle open brace escapes, where the brace is not part of a tag.
+            plain_text = plain_text.replace("\\[", "[")
+            append(emoji_replace(plain_text) if emoji else plain_text)
+        elif tag is not None:
+            if tag.name.startswith("/"):  # Closing tag
+                style_name = tag.name[1:].strip()
+
+                if style_name:  # explicit close
+                    style_name = normalize(style_name)
+                    try:
+                        start, open_tag = pop_style(style_name)
+                    except KeyError:
+                        raise MarkupError(
+                            f"closing tag '{tag.markup}' at position {position} doesn't match any open tag"
+                        ) from None
+                else:  # implicit close
+                    try:
+                        start, open_tag = pop()
+                    except IndexError:
+                        raise MarkupError(
+                            f"closing tag '[/]' at position {position} has nothing to close"
+                        ) from None
+
+                if open_tag.name.startswith("@"):
+                    if open_tag.parameters:
+                        handler_name = ""
+                        parameters = open_tag.parameters.strip()
+                        handler_match = RE_HANDLER.match(parameters)
+                        if handler_match is not None:
+                            handler_name, match_parameters = handler_match.groups()
+                            parameters = (
+                                "()" if match_parameters is None else match_parameters
+                            )
+
+                        try:
+                            meta_params = literal_eval(parameters)
+                        except SyntaxError as error:
+                            raise MarkupError(
+                                f"error parsing {parameters!r} in {open_tag.parameters!r}; {error.msg}"
+                            )
+                        except Exception as error:
+                            raise MarkupError(
+                                f"error parsing {open_tag.parameters!r}; {error}"
+                            ) from None
+
+                        if handler_name:
+                            meta_params = (
+                                handler_name,
+                                meta_params
+                                if isinstance(meta_params, tuple)
+                                else (meta_params,),
+                            )
+
+                    else:
+                        meta_params = ()
+
+                    append_span(
+                        _Span(
+                            start, len(text), Style(meta={open_tag.name: meta_params})
+                        )
+                    )
+                else:
+                    append_span(_Span(start, len(text), str(open_tag)))
+
+            else:  # Opening tag
+                normalized_tag = _Tag(normalize(tag.name), tag.parameters)
+                style_stack.append((len(text), normalized_tag))
+
+    text_length = len(text)
+    while style_stack:
+        start, tag = style_stack.pop()
+        style = str(tag)
+        if style:
+            append_span(_Span(start, text_length, style))
+
+    text.spans = sorted(spans[::-1], key=attrgetter("start"))
+    return text
+
+
+if __name__ == "__main__":  # pragma: no cover
+    MARKUP = [
+        "[red]Hello World[/red]",
+        "[magenta]Hello [b]World[/b]",
+        "[bold]Bold[italic] bold and italic [/bold]italic[/italic]",
+        "Click [link=https://www.willmcgugan.com]here[/link] to visit my Blog",
+        ":warning-emoji: [bold red blink] DANGER![/]",
+    ]
+
+    from pip._vendor.rich import print
+    from pip._vendor.rich.table import Table
+
+    grid = Table("Markup", "Result", padding=(0, 1))
+
+    for markup in MARKUP:
+        grid.add_row(Text(markup), markup)
+
+    print(grid)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/panel.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/panel.py
new file mode 100644
index 0000000000000000000000000000000000000000..95f4c84cf0b21d1bb518a40b22f39716a189f2fa
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/panel.py
@@ -0,0 +1,312 @@
+from typing import TYPE_CHECKING, Optional
+
+from .align import AlignMethod
+from .box import ROUNDED, Box
+from .cells import cell_len
+from .jupyter import JupyterMixin
+from .measure import Measurement, measure_renderables
+from .padding import Padding, PaddingDimensions
+from .segment import Segment
+from .style import Style, StyleType
+from .text import Text, TextType
+
+if TYPE_CHECKING:
+    from .console import Console, ConsoleOptions, RenderableType, RenderResult
+
+
+class Panel(JupyterMixin):
+    """A console renderable that draws a border around its contents.
+
+    Example:
+        >>> console.print(Panel("Hello, World!"))
+
+    Args:
+        renderable (RenderableType): A console renderable object.
+        box (Box, optional): A Box instance that defines the look of the border (see :ref:`appendix_box`).
+            Defaults to box.ROUNDED.
+        safe_box (bool, optional): Disable box characters that don't display on windows legacy terminal with *raster* fonts. Defaults to None, which uses the console's ``safe_box`` setting.
+        expand (bool, optional): If True the panel will stretch to fill the console
+            width, otherwise it will be sized to fit the contents. Defaults to True.
+        style (str, optional): The style of the panel (border and contents). Defaults to "none".
+        border_style (str, optional): The style of the border. Defaults to "none".
+        width (Optional[int], optional): Optional width of panel. Defaults to None to auto-detect.
+        height (Optional[int], optional): Optional height of panel. Defaults to None to auto-detect.
+        padding (Optional[PaddingDimensions]): Optional padding around renderable. Defaults to (0, 1).
+        highlight (bool, optional): Enable automatic highlighting of panel title (if str). Defaults to False.
+    """
+
+    def __init__(
+        self,
+        renderable: "RenderableType",
+        box: Box = ROUNDED,
+        *,
+        title: Optional[TextType] = None,
+        title_align: AlignMethod = "center",
+        subtitle: Optional[TextType] = None,
+        subtitle_align: AlignMethod = "center",
+        safe_box: Optional[bool] = None,
+        expand: bool = True,
+        style: StyleType = "none",
+        border_style: StyleType = "none",
+        width: Optional[int] = None,
+        height: Optional[int] = None,
+        padding: PaddingDimensions = (0, 1),
+        highlight: bool = False,
+    ) -> None:
+        self.renderable = renderable
+        self.box = box
+        self.title = title
+        self.title_align: AlignMethod = title_align
+        self.subtitle = subtitle
+        self.subtitle_align = subtitle_align
+        self.safe_box = safe_box
+        self.expand = expand
+        self.style = style
+        self.border_style = border_style
+        self.width = width
+        self.height = height
+        self.padding = padding
+        self.highlight = highlight
+
+    @classmethod
+    def fit(
+        cls,
+        renderable: "RenderableType",
+        box: Box = ROUNDED,
+        *,
+        title: Optional[TextType] = None,
+        title_align: AlignMethod = "center",
+        subtitle: Optional[TextType] = None,
+        subtitle_align: AlignMethod = "center",
+        safe_box: Optional[bool] = None,
+        style: StyleType = "none",
+        border_style: StyleType = "none",
+        width: Optional[int] = None,
+        height: Optional[int] = None,
+        padding: PaddingDimensions = (0, 1),
+        highlight: bool = False,
+    ) -> "Panel":
+        """An alternative constructor that sets expand=False."""
+        return cls(
+            renderable,
+            box,
+            title=title,
+            title_align=title_align,
+            subtitle=subtitle,
+            subtitle_align=subtitle_align,
+            safe_box=safe_box,
+            style=style,
+            border_style=border_style,
+            width=width,
+            height=height,
+            padding=padding,
+            highlight=highlight,
+            expand=False,
+        )
+
+    @property
+    def _title(self) -> Optional[Text]:
+        if self.title:
+            title_text = (
+                Text.from_markup(self.title)
+                if isinstance(self.title, str)
+                else self.title.copy()
+            )
+            title_text.end = ""
+            title_text.plain = title_text.plain.replace("\n", " ")
+            title_text.no_wrap = True
+            title_text.expand_tabs()
+            title_text.pad(1)
+            return title_text
+        return None
+
+    @property
+    def _subtitle(self) -> Optional[Text]:
+        if self.subtitle:
+            subtitle_text = (
+                Text.from_markup(self.subtitle)
+                if isinstance(self.subtitle, str)
+                else self.subtitle.copy()
+            )
+            subtitle_text.end = ""
+            subtitle_text.plain = subtitle_text.plain.replace("\n", " ")
+            subtitle_text.no_wrap = True
+            subtitle_text.expand_tabs()
+            subtitle_text.pad(1)
+            return subtitle_text
+        return None
+
+    def __rich_console__(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> "RenderResult":
+        _padding = Padding.unpack(self.padding)
+        renderable = (
+            Padding(self.renderable, _padding) if any(_padding) else self.renderable
+        )
+        style = console.get_style(self.style)
+        border_style = style + console.get_style(self.border_style)
+        width = (
+            options.max_width
+            if self.width is None
+            else min(options.max_width, self.width)
+        )
+
+        safe_box: bool = console.safe_box if self.safe_box is None else self.safe_box
+        box = self.box.substitute(options, safe=safe_box)
+
+        def align_text(
+            text: Text, width: int, align: str, character: str, style: Style
+        ) -> Text:
+            """Gets new aligned text.
+
+            Args:
+                text (Text): Title or subtitle text.
+                width (int): Desired width.
+                align (str): Alignment.
+                character (str): Character for alignment.
+                style (Style): Border style
+
+            Returns:
+                Text: New text instance
+            """
+            text = text.copy()
+            text.truncate(width)
+            excess_space = width - cell_len(text.plain)
+            if excess_space:
+                if align == "left":
+                    return Text.assemble(
+                        text,
+                        (character * excess_space, style),
+                        no_wrap=True,
+                        end="",
+                    )
+                elif align == "center":
+                    left = excess_space // 2
+                    return Text.assemble(
+                        (character * left, style),
+                        text,
+                        (character * (excess_space - left), style),
+                        no_wrap=True,
+                        end="",
+                    )
+                else:
+                    return Text.assemble(
+                        (character * excess_space, style),
+                        text,
+                        no_wrap=True,
+                        end="",
+                    )
+            return text
+
+        title_text = self._title
+        if title_text is not None:
+            title_text.stylize_before(border_style)
+
+        child_width = (
+            width - 2
+            if self.expand
+            else console.measure(
+                renderable, options=options.update_width(width - 2)
+            ).maximum
+        )
+        child_height = self.height or options.height or None
+        if child_height:
+            child_height -= 2
+        if title_text is not None:
+            child_width = min(
+                options.max_width - 2, max(child_width, title_text.cell_len + 2)
+            )
+
+        width = child_width + 2
+        child_options = options.update(
+            width=child_width, height=child_height, highlight=self.highlight
+        )
+        lines = console.render_lines(renderable, child_options, style=style)
+
+        line_start = Segment(box.mid_left, border_style)
+        line_end = Segment(f"{box.mid_right}", border_style)
+        new_line = Segment.line()
+        if title_text is None or width <= 4:
+            yield Segment(box.get_top([width - 2]), border_style)
+        else:
+            title_text = align_text(
+                title_text,
+                width - 4,
+                self.title_align,
+                box.top,
+                border_style,
+            )
+            yield Segment(box.top_left + box.top, border_style)
+            yield from console.render(title_text, child_options.update_width(width - 4))
+            yield Segment(box.top + box.top_right, border_style)
+
+        yield new_line
+        for line in lines:
+            yield line_start
+            yield from line
+            yield line_end
+            yield new_line
+
+        subtitle_text = self._subtitle
+        if subtitle_text is not None:
+            subtitle_text.stylize_before(border_style)
+
+        if subtitle_text is None or width <= 4:
+            yield Segment(box.get_bottom([width - 2]), border_style)
+        else:
+            subtitle_text = align_text(
+                subtitle_text,
+                width - 4,
+                self.subtitle_align,
+                box.bottom,
+                border_style,
+            )
+            yield Segment(box.bottom_left + box.bottom, border_style)
+            yield from console.render(
+                subtitle_text, child_options.update_width(width - 4)
+            )
+            yield Segment(box.bottom + box.bottom_right, border_style)
+
+        yield new_line
+
+    def __rich_measure__(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> "Measurement":
+        _title = self._title
+        _, right, _, left = Padding.unpack(self.padding)  # keep only the horizontal padding
+        padding = left + right
+        renderables = [self.renderable, _title] if _title else [self.renderable]
+        # An explicit self.width wins; otherwise the width comes from measure_renderables(...).maximum.
+        if self.width is None:
+            width = (
+                measure_renderables(
+                    console,
+                    options.update_width(options.max_width - padding - 2),  # width left after padding and 2 extra cells
+                    renderables,
+                ).maximum
+                + padding
+                + 2  # presumably the left and right border cells -- TODO confirm
+            )
+        else:
+            width = self.width
+        return Measurement(width, width)  # minimum and maximum are the same value
+
+
+if __name__ == "__main__":  # pragma: no cover
+    from .console import Console
+
+    c = Console()  # console used to print the sample panel below
+
+    from .box import DOUBLE, ROUNDED  # NOTE(review): ROUNDED is not used in this block
+    from .padding import Padding  # NOTE(review): Padding is not used in this block
+
+    p = Panel(
+        "Hello, World!",
+        title="rich.Panel",
+        style="white on blue",
+        box=DOUBLE,
+        padding=1,
+    )
+
+    c.print()  # print() with no arguments, ahead of the panel
+    c.print(p)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/table.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/table.py
new file mode 100644
index 0000000000000000000000000000000000000000..43c718ebf5906c7aaca9fb14ac34dd80ae8bc01e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/table.py
@@ -0,0 +1,1000 @@
+from dataclasses import dataclass, field, replace
+from typing import (
+    TYPE_CHECKING,
+    Dict,
+    Iterable,
+    List,
+    NamedTuple,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+)
+
+from . import box, errors
+from ._loop import loop_first_last, loop_last
+from ._pick import pick_bool
+from ._ratio import ratio_distribute, ratio_reduce
+from .align import VerticalAlignMethod
+from .jupyter import JupyterMixin
+from .measure import Measurement
+from .padding import Padding, PaddingDimensions
+from .protocol import is_renderable
+from .segment import Segment
+from .style import Style, StyleType
+from .text import Text, TextType
+
+if TYPE_CHECKING:
+    from .console import (
+        Console,
+        ConsoleOptions,
+        JustifyMethod,
+        OverflowMethod,
+        RenderableType,
+        RenderResult,
+    )
+
+
+@dataclass
+class Column:
+    """Defines a column within a ~Table.
+
+    Args:
+        header (RenderableType, optional): Renderable for the header (typically a string).
+            Defaults to "".
+        footer (RenderableType, optional): Renderable for the footer (typically a string).
+            Defaults to "".
+        header_style (StyleType, optional): The style of the header. Defaults to "".
+        footer_style (StyleType, optional): The style of the footer. Defaults to "".
+        style (StyleType, optional): The style of the column. Defaults to "".
+        justify (JustifyMethod, optional): How to justify text within the column
+            ("left", "center", "right", or "full"). Defaults to "left".
+        vertical (VerticalAlignMethod, optional): How to vertically align content
+            ("top", "middle", or "bottom"). Defaults to "top".
+        overflow (OverflowMethod, optional): Overflow method. Defaults to "ellipsis".
+        width (Optional[int], optional): Width of the column, or ``None`` to auto calculate width.
+            Defaults to None.
+        min_width (Optional[int], optional): Minimum width of column, or ``None`` for no minimum.
+            Defaults to None.
+        max_width (Optional[int], optional): Maximum width of column, or ``None`` for no maximum.
+            Defaults to None.
+        ratio (Optional[int], optional): Ratio to use when calculating column width, or ``None``
+            to adapt to column contents. Defaults to None.
+        no_wrap (bool, optional): Prevent wrapping of text within the column. Defaults to False.
+        _index (int, optional): Index of column. Defaults to 0.
+        _cells (List[RenderableType], optional): Cells of the column, not including the header.
+            Defaults to an empty list. ``copy()`` returns a Column whose ``_cells`` is a new
+            empty list.
+    """
+
+    header: "RenderableType" = ""
+    """RenderableType: Renderable for the header (typically a string)"""
+
+    footer: "RenderableType" = ""
+    """RenderableType: Renderable for the footer (typically a string)"""
+
+    header_style: StyleType = ""
+    """StyleType: The style of the header."""
+
+    footer_style: StyleType = ""
+    """StyleType: The style of the footer."""
+
+    style: StyleType = ""
+    """StyleType: The style of the column."""
+
+    justify: "JustifyMethod" = "left"
+    """str: How to justify text within the column ("left", "center", "right", or "full")"""
+
+    vertical: "VerticalAlignMethod" = "top"
+    """str: How to vertically align content ("top", "middle", or "bottom")"""
+
+    overflow: "OverflowMethod" = "ellipsis"
+    """str: Overflow method."""
+
+    width: Optional[int] = None
+    """Optional[int]: Width of the column, or ``None`` (default) to auto calculate width."""
+
+    min_width: Optional[int] = None
+    """Optional[int]: Minimum width of column, or ``None`` for no minimum. Defaults to None."""
+
+    max_width: Optional[int] = None
+    """Optional[int]: Maximum width of column, or ``None`` for no maximum. Defaults to None."""
+
+    ratio: Optional[int] = None
+    """Optional[int]: Ratio to use when calculating column width, or ``None`` (default) to adapt to column contents."""
+
+    no_wrap: bool = False
+    """bool: Prevent wrapping of text within the column. Defaults to ``False``."""
+
+    _index: int = 0
+    """Index of column."""
+
+    _cells: List["RenderableType"] = field(default_factory=list)  # each instance gets its own list
+
+    def copy(self) -> "Column":
+        """Return a copy of this Column with every field kept except ``_cells``, which starts empty."""
+        return replace(self, _cells=[])
+
+    @property
+    def cells(self) -> Iterable["RenderableType"]:
+        """Get all cells in the column, not including header."""
+        yield from self._cells
+
+    @property
+    def flexible(self) -> bool:
+        """Check if this column is flexible, i.e. whether a ``ratio`` has been set."""
+        return self.ratio is not None
+
+
+@dataclass
+class Row:
+    """Information regarding a row: an optional style and an end-of-section flag."""
+
+    style: Optional[StyleType] = None  # None means no row-specific style
+    """Style to apply to row."""
+
+    end_section: bool = False  # Table.add_section() sets this on the last row
+    """Indicated end of section, which will force a line beneath the row."""
+
+
+class _Cell(NamedTuple):
+    """A single cell in a table, as yielded by ``Table._get_cells``."""
+
+    style: StyleType
+    """Style to apply to cell."""
+    renderable: "RenderableType"
+    """Cell renderable."""
+    vertical: VerticalAlignMethod
+    """Cell vertical alignment."""
+
+
+class Table(JupyterMixin):
+    """A console renderable to draw a table.
+
+    Args:
+        *headers (Union[Column, str]): Column headers, either as a string, or :class:`~rich.table.Column` instance.
+        title (Union[str, Text], optional): The title of the table rendered at the top. Defaults to None.
+        caption (Union[str, Text], optional): The table caption rendered below. Defaults to None.
+        width (int, optional): The width in characters of the table, or ``None`` to automatically fit. Defaults to None.
+        min_width (Optional[int], optional): The minimum width of the table, or ``None`` for no minimum. Defaults to None.
+        box (box.Box, optional): One of the constants in box.py used to draw the edges (see :ref:`appendix_box`), or ``None`` for no box lines. Defaults to box.HEAVY_HEAD.
+        safe_box (Optional[bool], optional): Disable box characters that don't display on windows legacy terminal with *raster* fonts. Defaults to True.
+        padding (PaddingDimensions, optional): Padding for cells (top, right, bottom, left). Defaults to (0, 1).
+        collapse_padding (bool, optional): Enable collapsing of padding around cells. Defaults to False.
+        pad_edge (bool, optional): Enable padding of edge cells. Defaults to True.
+        expand (bool, optional): Expand the table to fit the available space if ``True``, otherwise the table width will be auto-calculated. Defaults to False.
+        show_header (bool, optional): Show a header row. Defaults to True.
+        show_footer (bool, optional): Show a footer row. Defaults to False.
+        show_edge (bool, optional): Draw a box around the outside of the table. Defaults to True.
+        show_lines (bool, optional): Draw lines between every row. Defaults to False.
+        leading (int, optional): Number of blank lines between rows (precludes ``show_lines``). Defaults to 0.
+        style (Union[str, Style], optional): Default style for the table. Defaults to "none".
+        row_styles (Iterable[Union[str, Style]], optional): Optional list of row styles, if more than one style is given then the styles will alternate. Defaults to None.
+        header_style (Union[str, Style], optional): Style of the header. Defaults to "table.header".
+        footer_style (Union[str, Style], optional): Style of the footer. Defaults to "table.footer".
+        border_style (Union[str, Style], optional): Style of the border. Defaults to None.
+        title_style (Union[str, Style], optional): Style of the title. Defaults to None.
+        caption_style (Union[str, Style], optional): Style of the caption. Defaults to None.
+        title_justify (str, optional): Justify method for title. Defaults to "center".
+        caption_justify (str, optional): Justify method for caption. Defaults to "center".
+        highlight (bool, optional): Highlight cell contents (if str). Defaults to False.
+    """
+
+    columns: List[Column]
+    rows: List[Row]
+
+    def __init__(
+        self,
+        *headers: Union[Column, str],
+        title: Optional[TextType] = None,
+        caption: Optional[TextType] = None,
+        width: Optional[int] = None,
+        min_width: Optional[int] = None,
+        box: Optional[box.Box] = box.HEAVY_HEAD,
+        safe_box: Optional[bool] = None,
+        padding: PaddingDimensions = (0, 1),
+        collapse_padding: bool = False,
+        pad_edge: bool = True,
+        expand: bool = False,
+        show_header: bool = True,
+        show_footer: bool = False,
+        show_edge: bool = True,
+        show_lines: bool = False,
+        leading: int = 0,
+        style: StyleType = "none",
+        row_styles: Optional[Iterable[StyleType]] = None,
+        header_style: Optional[StyleType] = "table.header",
+        footer_style: Optional[StyleType] = "table.footer",
+        border_style: Optional[StyleType] = None,
+        title_style: Optional[StyleType] = None,
+        caption_style: Optional[StyleType] = None,
+        title_justify: "JustifyMethod" = "center",
+        caption_justify: "JustifyMethod" = "center",
+        highlight: bool = False,
+    ) -> None:
+        self.columns: List[Column] = []
+        self.rows: List[Row] = []
+        self.title = title
+        self.caption = caption
+        self.width = width
+        self.min_width = min_width
+        self.box = box
+        self.safe_box = safe_box
+        self._padding = Padding.unpack(padding)  # kept as a (top, right, bottom, left) tuple
+        self.pad_edge = pad_edge
+        self._expand = expand  # read back through the ``expand`` property
+        self.show_header = show_header
+        self.show_footer = show_footer
+        self.show_edge = show_edge
+        self.show_lines = show_lines
+        self.leading = leading
+        self.collapse_padding = collapse_padding
+        self.style = style
+        self.header_style = header_style or ""  # None is stored as ""
+        self.footer_style = footer_style or ""  # None is stored as ""
+        self.border_style = border_style
+        self.title_style = title_style
+        self.caption_style = caption_style
+        self.title_justify: "JustifyMethod" = title_justify
+        self.caption_justify: "JustifyMethod" = caption_justify
+        self.highlight = highlight
+        self.row_styles: Sequence[StyleType] = list(row_styles or [])  # copied into a list; None becomes []
+        append_column = self.columns.append
+        for header in headers:
+            if isinstance(header, str):
+                self.add_column(header=header)  # a plain string becomes a new Column
+            else:
+                header._index = len(self.columns)  # adopt the given Column at the next position
+                append_column(header)
+
+    @classmethod
+    def grid(
+        cls,
+        *headers: Union[Column, str],
+        padding: PaddingDimensions = 0,
+        collapse_padding: bool = True,
+        pad_edge: bool = False,
+        expand: bool = False,
+    ) -> "Table":
+        """Get a table with no lines, headers, or footer.
+
+        Args:
+            *headers (Union[Column, str]): Column headers, either as a string, or :class:`~rich.table.Column` instance.
+            padding (PaddingDimensions, optional): Padding around cells. Defaults to 0.
+            collapse_padding (bool, optional): Enable collapsing of padding around cells. Defaults to True.
+            pad_edge (bool, optional): Enable padding around edges of table. Defaults to False.
+            expand (bool, optional): Expand the table to fit the available space if ``True``, otherwise the table width will be auto-calculated. Defaults to False.
+
+        Returns:
+            Table: A table instance.
+        """
+        return cls(
+            *headers,
+            box=None,  # no box lines at all
+            padding=padding,
+            collapse_padding=collapse_padding,
+            show_header=False,
+            show_footer=False,
+            show_edge=False,
+            pad_edge=pad_edge,
+            expand=expand,
+        )
+
+    @property
+    def expand(self) -> bool:
+        """bool: True if expand was requested or ``self.width`` is set (a non-None width implies expand)."""
+        return self._expand or self.width is not None
+
+    @expand.setter
+    def expand(self, expand: bool) -> None:
+        """Set the expand flag; the getter still returns True while ``self.width`` is not None."""
+        self._expand = expand
+
+    @property
+    def _extra_width(self) -> int:
+        """Get extra width to add to cell content; 0 when the table has no box."""
+        width = 0
+        if self.box and self.show_edge:
+            width += 2  # presumably the left and right edge characters -- TODO confirm
+        if self.box:
+            width += len(self.columns) - 1  # presumably one divider between adjacent columns -- TODO confirm
+        return width
+
+    @property
+    def row_count(self) -> int:
+        """Get the current number of rows, i.e. the length of ``self.rows``."""
+        return len(self.rows)
+
+    def get_row_style(self, console: "Console", index: int) -> StyleType:
+        """Get the style of row ``index``: the cycled ``row_styles`` entry (if any) plus the row's own style."""
+        style = Style.null()
+        if self.row_styles:
+            style += console.get_style(self.row_styles[index % len(self.row_styles)])  # row_styles repeat cyclically
+        row_style = self.rows[index].style
+        if row_style is not None:
+            style += console.get_style(row_style)  # the row's own style is added last
+        return style
+
+    def __rich_measure__(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> Measurement:
+        max_width = options.max_width
+        if self.width is not None:
+            max_width = self.width  # an explicit table width replaces the available width
+        if max_width < 0:
+            return Measurement(0, 0)
+
+        extra_width = self._extra_width
+        max_width = sum(  # from here on max_width is the total of the calculated column widths
+            self._calculate_column_widths(
+                console, options.update_width(max_width - extra_width)
+            )
+        )
+        _measure_column = self._measure_column
+
+        measurements = [
+            _measure_column(console, options.update_width(max_width), column)
+            for column in self.columns
+        ]
+        minimum_width = (
+            sum(measurement.minimum for measurement in measurements) + extra_width
+        )
+        maximum_width = (
+            sum(measurement.maximum for measurement in measurements) + extra_width
+            if (self.width is None)
+            else self.width  # a fixed table width is reported as the maximum
+        )
+        measurement = Measurement(minimum_width, maximum_width)
+        measurement = measurement.clamp(self.min_width)  # min_width may be None
+        return measurement
+
+    @property
+    def padding(self) -> Tuple[int, int, int, int]:
+        """Get cell padding as the unpacked 4-tuple stored in ``self._padding``."""
+        return self._padding
+
+    @padding.setter
+    def padding(self, padding: PaddingDimensions) -> "Table":
+        """Set cell padding; the value is normalised with ``Padding.unpack`` before it is stored."""
+        self._padding = Padding.unpack(padding)
+        return self  # NOTE(review): Python discards the value returned by a property setter
+
+    def add_column(
+        self,
+        header: "RenderableType" = "",
+        footer: "RenderableType" = "",
+        *,
+        header_style: Optional[StyleType] = None,
+        footer_style: Optional[StyleType] = None,
+        style: Optional[StyleType] = None,
+        justify: "JustifyMethod" = "left",
+        vertical: "VerticalAlignMethod" = "top",
+        overflow: "OverflowMethod" = "ellipsis",
+        width: Optional[int] = None,
+        min_width: Optional[int] = None,
+        max_width: Optional[int] = None,
+        ratio: Optional[int] = None,
+        no_wrap: bool = False,
+    ) -> None:
+        """Add a column to the table.
+
+        Args:
+            header (RenderableType, optional): Text or renderable for the header.
+                Defaults to "".
+            footer (RenderableType, optional): Text or renderable for the footer.
+                Defaults to "".
+            header_style (Union[str, Style], optional): Style for the header, or None for default. Defaults to None.
+            footer_style (Union[str, Style], optional): Style for the footer, or None for default. Defaults to None.
+            style (Union[str, Style], optional): Style for the column cells, or None for default. Defaults to None.
+            justify (JustifyMethod, optional): Alignment for cells. Defaults to "left".
+            vertical (VerticalAlignMethod, optional): Vertical alignment, one of "top", "middle", or "bottom". Defaults to "top".
+            overflow (OverflowMethod): Overflow method: "crop", "fold", "ellipsis". Defaults to "ellipsis".
+            width (int, optional): Desired width of column in characters, or None to fit to contents. Defaults to None.
+            min_width (Optional[int], optional): Minimum width of column, or ``None`` for no minimum. Defaults to None.
+            max_width (Optional[int], optional): Maximum width of column, or ``None`` for no maximum. Defaults to None.
+            ratio (int, optional): Flexible ratio for the column (requires ``Table.expand`` or ``Table.width``). Defaults to None.
+            no_wrap (bool, optional): Set to ``True`` to disable wrapping of this column. Defaults to False.
+        """
+
+        column = Column(
+            _index=len(self.columns),  # position the new column takes once appended
+            header=header,
+            footer=footer,
+            header_style=header_style or "",  # None is stored as ""
+            footer_style=footer_style or "",  # None is stored as ""
+            style=style or "",  # None is stored as ""
+            justify=justify,
+            vertical=vertical,
+            overflow=overflow,
+            width=width,
+            min_width=min_width,
+            max_width=max_width,
+            ratio=ratio,
+            no_wrap=no_wrap,
+        )
+        self.columns.append(column)
+
+    def add_row(
+        self,
+        *renderables: Optional["RenderableType"],
+        style: Optional[StyleType] = None,
+        end_section: bool = False,
+    ) -> None:
+        """Add a row of renderables.
+
+        Args:
+            *renderables (None or renderable): Each cell in a row must be a renderable object (including str),
+                or ``None`` for a blank cell.
+            style (StyleType, optional): An optional style to apply to the entire row. Defaults to None.
+            end_section (bool, optional): End a section and draw a line. Defaults to False.
+
+        Raises:
+            errors.NotRenderableError: If you add something that can't be rendered.
+        """
+
+        def add_cell(column: Column, renderable: "RenderableType") -> None:
+            column._cells.append(renderable)
+
+        cell_renderables: List[Optional["RenderableType"]] = list(renderables)
+
+        columns = self.columns  # same list object as self.columns, so it grows with the appends below
+        if len(cell_renderables) < len(columns):  # fewer cells than columns: pad the row with None
+            cell_renderables = [
+                *cell_renderables,
+                *[None] * (len(columns) - len(cell_renderables)),
+            ]
+        for index, renderable in enumerate(cell_renderables):
+            if index == len(columns):  # more cells than columns: create a column on the fly
+                column = Column(_index=index)
+                for _ in self.rows:  # back-fill the new column for every row added earlier
+                    add_cell(column, Text(""))
+                self.columns.append(column)
+            else:
+                column = columns[index]
+            if renderable is None:
+                add_cell(column, "")  # None is stored as an empty string cell
+            elif is_renderable(renderable):
+                add_cell(column, renderable)
+            else:
+                raise errors.NotRenderableError(
+                    f"unable to render {type(renderable).__name__}; a string or other renderable object is required"
+                )
+        self.rows.append(Row(style=style, end_section=end_section))  # row metadata; the cells live on the columns
+
+    def add_section(self) -> None:
+        """Add a new section (draw a line after current row)."""
+        # Nothing happens while the table has no rows.
+        if self.rows:
+            self.rows[-1].end_section = True  # flag the most recently added row
+
+    def __rich_console__(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> "RenderResult":
+        if not self.columns:  # no columns: emit a single newline and stop
+            yield Segment("\n")
+            return
+
+        max_width = options.max_width
+        if self.width is not None:
+            max_width = self.width  # an explicit table width replaces the available width
+
+        extra_width = self._extra_width
+        widths = self._calculate_column_widths(
+            console, options.update_width(max_width - extra_width)
+        )
+        table_width = sum(widths) + extra_width  # column widths plus the extra (box) width
+
+        render_options = options.update(
+            width=table_width, highlight=self.highlight, height=None
+        )
+
+        def render_annotation(
+            text: TextType, style: StyleType, justify: "JustifyMethod" = "center"
+        ) -> "RenderResult":  # used for both the title and the caption
+            render_text = (
+                console.render_str(text, style=style, highlight=False)
+                if isinstance(text, str)  # only str input is converted; anything else is rendered as given
+                else text
+            )
+            return console.render(
+                render_text, options=render_options.update(justify=justify)
+            )
+
+        if self.title:
+            yield from render_annotation(
+                self.title,
+                style=Style.pick_first(self.title_style, "table.title"),
+                justify=self.title_justify,
+            )
+        yield from self._render(console, render_options, widths)  # table body, between title and caption
+        if self.caption:
+            yield from render_annotation(
+                self.caption,
+                style=Style.pick_first(self.caption_style, "table.caption"),
+                justify=self.caption_justify,
+            )
+
+    def _calculate_column_widths(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> List[int]:
+        """Calculate the widths of each column, including padding, not including borders."""
+        max_width = options.max_width
+        columns = self.columns
+        width_ranges = [
+            self._measure_column(console, options, column) for column in columns
+        ]
+        widths = [_range.maximum or 1 for _range in width_ranges]  # start at each maximum; 0 is replaced by 1
+        get_padding_width = self._get_padding_width
+        extra_width = self._extra_width
+        if self.expand:
+            ratios = [col.ratio or 0 for col in columns if col.flexible]
+            if any(ratios):  # at least one flexible column has a non-zero ratio
+                fixed_widths = [
+                    0 if column.flexible else _range.maximum  # flexible columns reserve no fixed width
+                    for _range, column in zip(width_ranges, columns)
+                ]
+                flex_minimum = [
+                    (column.width or 1) + get_padding_width(column._index)
+                    for column in columns
+                    if column.flexible
+                ]
+                flexible_width = max_width - sum(fixed_widths)  # what remains for the flexible columns
+                flex_widths = ratio_distribute(flexible_width, ratios, flex_minimum)
+                iter_flex_widths = iter(flex_widths)
+                for index, column in enumerate(columns):
+                    if column.flexible:
+                        widths[index] = fixed_widths[index] + next(iter_flex_widths)
+        table_width = sum(widths)
+
+        if table_width > max_width:  # too wide: shrink the columns
+            widths = self._collapse_widths(
+                widths,
+                [(column.width is None and not column.no_wrap) for column in columns],
+                max_width,
+            )
+            table_width = sum(widths)
+            # last resort, reduce columns evenly
+            if table_width > max_width:
+                excess_width = table_width - max_width
+                widths = ratio_reduce(excess_width, [1] * len(widths), widths, widths)
+                table_width = sum(widths)
+
+            width_ranges = [
+                self._measure_column(console, options.update_width(width), column)
+                for width, column in zip(widths, columns)
+            ]
+            widths = [_range.maximum or 0 for _range in width_ranges]  # NOTE(review): table_width is not recomputed after this re-measure
+
+        if (table_width < max_width and self.expand) or (
+            self.min_width is not None and table_width < (self.min_width - extra_width)
+        ):  # too narrow: grow towards max_width, or towards min_width when one is set
+            _max_width = (
+                max_width
+                if self.min_width is None
+                else min(self.min_width - extra_width, max_width)
+            )
+            pad_widths = ratio_distribute(_max_width - table_width, widths)
+            widths = [_width + pad for _width, pad in zip(widths, pad_widths)]
+
+        return widths
+
+    @classmethod
+    def _collapse_widths(
+        cls, widths: List[int], wrapable: List[bool], max_width: int
+    ) -> List[int]:
+        """Reduce widths so that the total is under max_width.
+
+        Args:
+            widths (List[int]): List of widths.
+            wrapable (List[bool]): List of booleans that indicate if a column may shrink.
+            max_width (int): Maximum width to reduce to.
+
+        Returns:
+            List[int]: A new list of widths.
+        """
+        total_width = sum(widths)
+        excess_width = total_width - max_width
+        if any(wrapable):  # widths are returned unchanged when no column may shrink
+            while total_width and excess_width > 0:
+                max_column = max(  # widest column that is allowed to shrink
+                    width for width, allow_wrap in zip(widths, wrapable) if allow_wrap
+                )
+                second_max_column = max(  # widest shrinkable width other than max_column, else 0
+                    width if allow_wrap and width != max_column else 0
+                    for width, allow_wrap in zip(widths, wrapable)
+                )
+                column_difference = max_column - second_max_column
+                ratios = [
+                    (1 if (width == max_column and allow_wrap) else 0)  # only columns tied for widest are reduced
+                    for width, allow_wrap in zip(widths, wrapable)
+                ]
+                if not any(ratios) or not column_difference:
+                    break
+                max_reduce = [min(excess_width, column_difference)] * len(widths)  # cap: the excess or the gap to the runner-up
+                widths = ratio_reduce(excess_width, ratios, max_reduce, widths)
+
+                total_width = sum(widths)
+                excess_width = total_width - max_width
+        return widths
+
+    def _get_cells(
+        self, console: "Console", column_index: int, column: Column
+    ) -> Iterable[_Cell]:
+        """Get all the cells with padding and optional header."""
+
+        collapse_padding = self.collapse_padding
+        pad_edge = self.pad_edge
+        padding = self.padding
+        any_padding = any(padding)  # False only when all four padding values are 0
+
+        first_column = column_index == 0
+        last_column = column_index == len(self.columns) - 1
+
+        _padding_cache: Dict[Tuple[bool, bool], Tuple[int, int, int, int]] = {}
+
+        def get_padding(first_row: bool, last_row: bool) -> Tuple[int, int, int, int]:
+            cached = _padding_cache.get((first_row, last_row))  # one cache entry per (first_row, last_row) pair
+            if cached:
+                return cached
+            top, right, bottom, left = padding
+
+            if collapse_padding:
+                if not first_column:
+                    left = max(0, left - right)
+                if not last_row:
+                    bottom = max(0, top - bottom)  # NOTE(review): top - bottom, unlike left - right above -- confirm intended
+
+            if not pad_edge:  # drop the padding on whichever sides touch the table edge
+                if first_column:
+                    left = 0
+                if last_column:
+                    right = 0
+                if first_row:
+                    top = 0
+                if last_row:
+                    bottom = 0
+            _padding = (top, right, bottom, left)
+            _padding_cache[(first_row, last_row)] = _padding
+            return _padding
+
+        raw_cells: List[Tuple[StyleType, "RenderableType"]] = []
+        _append = raw_cells.append
+        get_style = console.get_style
+        if self.show_header:  # the header, when shown, is the first raw cell
+            header_style = get_style(self.header_style or "") + get_style(
+                column.header_style
+            )
+            _append((header_style, column.header))
+        cell_style = get_style(column.style or "")
+        for cell in column.cells:
+            _append((cell_style, cell))
+        if self.show_footer:  # the footer, when shown, is the last raw cell
+            footer_style = get_style(self.footer_style or "") + get_style(
+                column.footer_style
+            )
+            _append((footer_style, column.footer))
+
+        if any_padding:  # wrap each renderable in Padding only when some padding is non-zero
+            _Padding = Padding
+            for first, last, (style, renderable) in loop_first_last(raw_cells):
+                yield _Cell(
+                    style,
+                    _Padding(renderable, get_padding(first, last)),
+                    getattr(renderable, "vertical", None) or column.vertical,  # the renderable's own "vertical", if truthy, wins
+                )
+        else:
+            for style, renderable in raw_cells:
+                yield _Cell(
+                    style,
+                    renderable,
+                    getattr(renderable, "vertical", None) or column.vertical,  # the renderable's own "vertical", if truthy, wins
+                )
+
+    def _get_padding_width(self, column_index: int) -> int:
+        """Return the horizontal padding (left + right) applied to a column.
+
+        With ``collapse_padding`` enabled, every column after the first has its
+        left padding reduced by the right padding, never going below zero.
+        """
+        _top, right, _bottom, left = self.padding
+        if self.collapse_padding and column_index > 0:
+            left = max(0, left - right)
+        return left + right
+
+    def _measure_column(
+        self,
+        console: "Console",
+        options: "ConsoleOptions",
+        column: Column,
+    ) -> Measurement:
+        """Work out the minimum and maximum width the column may occupy."""
+        limit = options.max_width
+        if limit < 1:
+            return Measurement(0, 0)
+
+        extra = self._get_padding_width(column._index)
+
+        if column.width is not None:
+            # Fixed width column: minimum and maximum coincide
+            fixed = column.width + extra
+            return Measurement(fixed, fixed).with_maximum(limit)
+
+        # Flexible column: measure every cell produced for this column
+        lows: List[int] = []
+        highs: List[int] = []
+        for cell in self._get_cells(console, column._index, column):
+            low, high = Measurement.get(console, options, cell.renderable)
+            lows.append(low)
+            highs.append(high)
+
+        # With no cells at all, fall back to 1 .. max_width
+        widest_min = max(lows) if lows else 1
+        widest_max = max(highs) if highs else limit
+        # The column's own min/max widths exclude padding, so add it back
+        lower_bound = None if column.min_width is None else column.min_width + extra
+        upper_bound = None if column.max_width is None else column.max_width + extra
+        return (
+            Measurement(widest_min, widest_max)
+            .with_maximum(limit)
+            .clamp(lower_bound, upper_bound)
+        )
+
+    def _render(
+        self, console: "Console", options: "ConsoleOptions", widths: List[int]
+    ) -> "RenderResult":
+        """Yield the segments that draw the table for the given column widths.
+
+        Args:
+            console: Console used to resolve styles and render cell contents.
+            options: Base console options; updated per cell with the column's settings.
+            widths: One width per column, used for cells and for box rows.
+        """
+        table_style = console.get_style(self.style or "")
+
+        border_style = table_style + console.get_style(self.border_style or "")
+        # One lazy cell iterator per column; zip() below transposes them into rows.
+        _column_cells = (
+            self._get_cells(console, column_index, column)
+            for column_index, column in enumerate(self.columns)
+        )
+        row_cells: List[Tuple[_Cell, ...]] = list(zip(*_column_cells))
+        _box = (
+            self.box.substitute(
+                options, safe=pick_bool(self.safe_box, console.safe_box)
+            )
+            if self.box
+            else None
+        )
+        # Without a header row, switch to the box's plain-headed variant.
+        _box = _box.get_plain_headed_box() if _box and not self.show_header else _box
+
+        new_line = Segment.line()
+
+        columns = self.columns
+        show_header = self.show_header
+        show_footer = self.show_footer
+        show_edge = self.show_edge
+        show_lines = self.show_lines
+        leading = self.leading
+
+        _Segment = Segment
+        if _box:
+            # (left edge, right edge, column divider) triples, in the order head, foot, mid.
+            box_segments = [
+                (
+                    _Segment(_box.head_left, border_style),
+                    _Segment(_box.head_right, border_style),
+                    _Segment(_box.head_vertical, border_style),
+                ),
+                (
+                    _Segment(_box.foot_left, border_style),
+                    _Segment(_box.foot_right, border_style),
+                    _Segment(_box.foot_vertical, border_style),
+                ),
+                (
+                    _Segment(_box.mid_left, border_style),
+                    _Segment(_box.mid_right, border_style),
+                    _Segment(_box.mid_vertical, border_style),
+                ),
+            ]
+            if show_edge:
+                yield _Segment(_box.get_top(widths), border_style)
+                yield new_line
+        else:
+            box_segments = []
+
+        get_row_style = self.get_row_style
+        get_style = console.get_style
+
+        for index, (first, last, row_cell) in enumerate(loop_first_last(row_cells)):
+            header_row = first and show_header
+            footer_row = last and show_footer
+            # show_header is used as 0/1 here to skip the header when indexing self.rows.
+            row = (
+                self.rows[index - show_header]
+                if (not header_row and not footer_row)
+                else None
+            )
+            max_height = 1
+            cells: List[List[List[Segment]]] = []
+            if header_row or footer_row:
+                row_style = Style.null()
+            else:
+                row_style = get_style(
+                    get_row_style(console, index - 1 if show_header else index)
+                )
+            # Render every cell of the row to lines and track the tallest one.
+            for width, cell, column in zip(widths, row_cell, columns):
+                render_options = options.update(
+                    width=width,
+                    justify=column.justify,
+                    no_wrap=column.no_wrap,
+                    overflow=column.overflow,
+                    height=None,
+                )
+                lines = console.render_lines(
+                    cell.renderable,
+                    render_options,
+                    style=get_style(cell.style) + row_style,
+                )
+                max_height = max(max_height, len(lines))
+                cells.append(lines)
+
+            row_height = max(len(cell) for cell in cells)
+
+            def align_cell(
+                cell: List[List[Segment]],
+                vertical: "VerticalAlignMethod",
+                width: int,
+                style: Style,
+            ) -> List[List[Segment]]:
+                # Header cells are always bottom-aligned and footer cells top-aligned,
+                # whatever the cell or column requested.
+                if header_row:
+                    vertical = "bottom"
+                elif footer_row:
+                    vertical = "top"
+
+                if vertical == "top":
+                    return _Segment.align_top(cell, width, row_height, style)
+                elif vertical == "middle":
+                    return _Segment.align_middle(cell, width, row_height, style)
+                return _Segment.align_bottom(cell, width, row_height, style)
+
+            # Align each cell vertically, then force it to width x max_height.
+            cells[:] = [
+                _Segment.set_shape(
+                    align_cell(
+                        cell,
+                        _cell.vertical,
+                        width,
+                        get_style(_cell.style) + row_style,
+                    ),
+                    width,
+                    max_height,
+                )
+                for width, _cell, cell, column in zip(widths, row_cell, cells, columns)
+            ]
+
+            if _box:
+                # The footer separator is drawn just before the footer row itself.
+                if last and show_footer:
+                    yield _Segment(
+                        _box.get_row(widths, "foot", edge=show_edge), border_style
+                    )
+                    yield new_line
+                # First row -> entry 0, last row -> entry 2, any other row -> entry 1.
+                # NOTE(review): given the head/foot/mid order above, middle rows use the
+                # "foot" segments and the last row uses "mid" - confirm this is intended.
+                left, right, _divider = box_segments[0 if first else (2 if last else 1)]
+
+                # If the column divider is whitespace also style it with the row background
+                divider = (
+                    _divider
+                    if _divider.text.strip()
+                    else _Segment(
+                        _divider.text, row_style.background_style + _divider.style
+                    )
+                )
+                for line_no in range(max_height):
+                    if show_edge:
+                        yield left
+                    for last_cell, rendered_cell in loop_last(cells):
+                        yield from rendered_cell[line_no]
+                        if not last_cell:
+                            yield divider
+                    if show_edge:
+                        yield right
+                    yield new_line
+            else:
+                # No box: emit the cells of each line back to back.
+                for line_no in range(max_height):
+                    for rendered_cell in cells:
+                        yield from rendered_cell[line_no]
+                    yield new_line
+            # The header separator is drawn right after the header row.
+            if _box and first and show_header:
+                yield _Segment(
+                    _box.get_row(widths, "head", edge=show_edge), border_style
+                )
+                yield new_line
+            end_section = row and row.end_section
+            if _box and (show_lines or leading or end_section):
+                # Never draw a separator after the last row, after the header row,
+                # or (when a footer is shown) after the row preceding the footer.
+                if (
+                    not last
+                    and not (show_footer and index >= len(row_cells) - 2)
+                    and not (show_header and header_row)
+                ):
+                    if leading:
+                        yield _Segment(
+                            _box.get_row(widths, "mid", edge=show_edge) * leading,
+                            border_style,
+                        )
+                    else:
+                        yield _Segment(
+                            _box.get_row(widths, "row", edge=show_edge), border_style
+                        )
+                    yield new_line
+
+        if _box and show_edge:
+            yield _Segment(_box.get_bottom(widths), border_style)
+            yield new_line
+
+
+if __name__ == "__main__":  # pragma: no cover
+    # Demo: build one example table and print it several times with different settings.
+    from pip._vendor.rich.console import Console
+    from pip._vendor.rich.highlighter import ReprHighlighter
+    from pip._vendor.rich.table import Table as Table
+
+    from ._timer import timer
+
+    with timer("Table render"):
+        table = Table(
+            title="Star Wars Movies",
+            caption="Rich example table",
+            caption_justify="right",
+        )
+
+        table.add_column(
+            "Released", header_style="bright_cyan", style="cyan", no_wrap=True
+        )
+        table.add_column("Title", style="magenta")
+        table.add_column("Box Office", justify="right", style="green")
+
+        table.add_row(
+            "Dec 20, 2019",
+            "Star Wars: The Rise of Skywalker",
+            "$952,110,690",
+        )
+        table.add_row("May 25, 2018", "Solo: A Star Wars Story", "$393,151,347")
+        # This row carries its own style and ends a section.
+        table.add_row(
+            "Dec 15, 2017",
+            "Star Wars Ep. V111: The Last Jedi",
+            "$1,332,539,889",
+            style="on black",
+            end_section=True,
+        )
+        table.add_row(
+            "Dec 16, 2016",
+            "Rogue One: A Star Wars Story",
+            "$1,332,439,889",
+        )
+
+        def header(text: str) -> None:
+            # Print a highlighted rule with a blank line above and below.
+            # `console` and `highlight` are bound below, before the first call.
+            console.print()
+            console.rule(highlight(text))
+            console.print()
+
+        console = Console()
+        highlight = ReprHighlighter()
+        header("Example Table")
+        console.print(table, justify="center")
+
+        table.expand = True
+        header("expand=True")
+        console.print(table)
+
+        table.width = 50
+        header("width=50")
+
+        console.print(table, justify="center")
+
+        # Reset width/expand before demonstrating row styles.
+        table.width = None
+        table.expand = False
+        table.row_styles = ["dim", "none"]
+        header("row_styles=['dim', 'none']")
+
+        console.print(table, justify="center")
+
+        table.width = None
+        table.expand = False
+        table.row_styles = ["dim", "none"]
+        table.leading = 1
+        header("leading=1, row_styles=['dim', 'none']")
+        console.print(table, justify="center")
+
+        # Turn leading back off so only show_lines is in effect.
+        table.width = None
+        table.expand = False
+        table.row_styles = ["dim", "none"]
+        table.show_lines = True
+        table.leading = 0
+        header("show_lines=True, row_styles=['dim', 'none']")
+        console.print(table, justify="center")
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/theme.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/theme.py
new file mode 100644
index 0000000000000000000000000000000000000000..471dfb2f9271c073f0713ca98f8db2f89c975071
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/theme.py
@@ -0,0 +1,115 @@
+import configparser
+from typing import Dict, List, IO, Mapping, Optional
+
+from .default_styles import DEFAULT_STYLES
+from .style import Style, StyleType
+
+
+class Theme:
+    """Named styles that a :class:`~rich.console.Console` can look up.
+
+    Args:
+        styles (Dict[str, Style], optional): Mapping of style names to styles or style definitions. Defaults to None, which adds no styles.
+        inherit (bool, optional): Start from a copy of the default styles. Defaults to True.
+    """
+
+    styles: Dict[str, Style]
+
+    def __init__(
+        self, styles: Optional[Mapping[str, StyleType]] = None, inherit: bool = True
+    ):
+        self.styles = DEFAULT_STYLES.copy() if inherit else {}
+        if styles is None:
+            return
+        # Anything that is not already a Style is handed to Style.parse.
+        parsed = {
+            name: (style if isinstance(style, Style) else Style.parse(style))
+            for name, style in styles.items()
+        }
+        self.styles.update(parsed)
+
+    @property
+    def config(self) -> str:
+        """Config-file text for this theme: a ``[styles]`` header and sorted ``name = style`` lines."""
+        entries = [f"{name} = {style}" for name, style in sorted(self.styles.items())]
+        return "[styles]\n" + "\n".join(entries)
+
+    @classmethod
+    def from_file(
+        cls, config_file: IO[str], source: Optional[str] = None, inherit: bool = True
+    ) -> "Theme":
+        """Build a theme from an open text-mode config file.
+
+        Args:
+            config_file (IO[str]): An open conf file.
+            source (str, optional): The filename of the open file. Defaults to None.
+            inherit (bool, optional): Inherit default styles. Defaults to True.
+
+        Returns:
+            Theme: A new theme holding the parsed entries of the ``styles`` section.
+        """
+        parser = configparser.ConfigParser()
+        parser.read_file(config_file, source=source)
+        parsed = {name: Style.parse(value) for name, value in parser.items("styles")}
+        return Theme(parsed, inherit=inherit)
+
+    @classmethod
+    def read(
+        cls, path: str, inherit: bool = True, encoding: Optional[str] = None
+    ) -> "Theme":
+        """Open the config file at ``path`` and build a theme from it.
+
+        Args:
+            path (str): Path to a config file readable by Python configparser module.
+            inherit (bool, optional): Inherit default styles. Defaults to True.
+            encoding (str, optional): Encoding of the config file. Defaults to None.
+
+        Returns:
+            Theme: A new theme instance.
+        """
+        with open(path, "rt", encoding=encoding) as handle:
+            return cls.from_file(handle, source=path, inherit=inherit)
+
+
+class ThemeStackError(Exception):
+    """Root of the exceptions raised for theme stack problems."""
+
+
+class ThemeStack:
+    """A stack of style mappings, with ``get`` bound to the top-most one.
+
+    Args:
+        theme (Theme): The theme whose styles form the base of the stack.
+    """
+
+    def __init__(self, theme: Theme) -> None:
+        base = theme.styles
+        self._entries: List[Dict[str, Style]] = [base]
+        self.get = base.get
+
+    def push_theme(self, theme: Theme, inherit: bool = True) -> None:
+        """Add a theme's styles as the new top of the stack.
+
+        Args:
+            theme (Theme): A Theme instance.
+            inherit (boolean, optional): Layer the theme over the styles currently on top.
+        """
+        layer: Dict[str, Style]
+        if inherit:
+            # Start from the current top; the pushed theme's entries win on conflict.
+            layer = dict(self._entries[-1])
+            layer.update(theme.styles)
+        else:
+            layer = theme.styles.copy()
+        self._entries.append(layer)
+        self.get = layer.get
+
+    def pop_theme(self) -> None:
+        """Discard the top-most theme; the base theme cannot be removed."""
+        if len(self._entries) == 1:
+            raise ThemeStackError("Unable to pop base theme")
+        self._entries.pop()
+        top = self._entries[-1]
+        self.get = top.get
+
+
+if __name__ == "__main__":  # pragma: no cover
+    # Print the config-file form of a theme built with the default arguments.
+    print(Theme().config)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e468bf8cebda1f507a30ff65584c13f6ba59d389
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__init__.py
@@ -0,0 +1,36 @@
+"""Verify certificates using native system trust stores"""
+
+import sys as _sys
+
+# Refuse to import on interpreters older than 3.10.
+if _sys.version_info < (3, 10):
+    raise ImportError("truststore requires Python 3.10 or later")
+
+# Detect Python runtimes which don't implement SSLObject.get_unverified_chain() API
+# This API only became public in Python 3.13 but was available in CPython and PyPy since 3.10.
+if _sys.version_info < (3, 13):
+    try:
+        import ssl as _ssl
+    except ImportError:
+        raise ImportError("truststore requires the 'ssl' module")
+    else:
+        # Build a throwaway SSL object over a single MemoryBIO (used for both
+        # directions) purely so its attributes can be inspected.
+        _sslmem = _ssl.MemoryBIO()
+        _sslobj = _ssl.create_default_context().wrap_bio(
+            _sslmem,
+            _sslmem,
+        )
+        try:
+            # Follow the private `_sslobj` attribute until an object exposing
+            # get_unverified_chain is found; running out of `_sslobj` attributes
+            # raises AttributeError, meaning the API is missing.
+            while not hasattr(_sslobj, "get_unverified_chain"):
+                _sslobj = _sslobj._sslobj  # type: ignore[attr-defined]
+        except AttributeError:
+            raise ImportError(
+                "truststore requires peer certificate chain APIs to be available"
+            ) from None
+
+        # Remove the probe's temporaries from the module namespace.
+        del _ssl, _sslobj, _sslmem  # noqa: F821
+
+# Imported only after the runtime checks above have passed.
+from ._api import SSLContext, extract_from_ssl, inject_into_ssl  # noqa: E402
+
+# Drop the submodule and helper names so only the public API remains visible.
+del _api, _sys  # type: ignore[name-defined] # noqa: F821
+
+__all__ = ["SSLContext", "inject_into_ssl", "extract_from_ssl"]
+__version__ = "0.10.0"
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b57d409718ab5277e8bfb470cec40461727c906
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_guards.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_guards.py
new file mode 100644
index 0000000000000000000000000000000000000000..d25a759988d4b3e81308aaf122df32d333f2a964
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_guards.py
@@ -0,0 +1,879 @@
+from __future__ import annotations
+
+import contextlib
+
+import dataclasses
+import enum
+import functools
+import logging
+import threading
+import traceback
+import unittest.mock
+import weakref
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    NamedTuple,
+    Optional,
+    Set,
+    Tuple,
+    TYPE_CHECKING,
+    TypeVar,
+)
+
+import torch
+from torch.utils import _pytree as pytree
+from torch.utils._traceback import CapturedTraceback
+from torch.utils.weak import WeakTensorKeyDictionary
+
+log = logging.getLogger(__name__)
+
+
+if TYPE_CHECKING:
+    # Import the following modules during type checking to enable code intelligence features,
+    # such as auto-completion in tools like pylance, even when these modules are not explicitly
+    # imported in user code.
+
+    import sympy
+
+
+"""
+torch._guards is the definitional source of truth for general purpose guard structures.
+
+An important thing to keep in mind here is the preservation of layering. There should be no dynamo notions,
+and no guard installation notions here.
+"""
+
+
+class CompileId(NamedTuple):
+    # Rendered by str() as "<frame_id>/<frame_compile_id>".
+    frame_id: int
+    # Per-frame counter of how many times this frame has been compiled.  A
+    # global id would also work, but a per-frame count gives a better
+    # intuitive sense of how many recompiles have occurred so far.
+    frame_compile_id: int
+    # TODO: consider also tracking the recompilation count
+
+    def __str__(self):
+        frame, compile_count = self
+        return f"{frame}/{compile_count}"
+
+
+class TraceId(NamedTuple):
+    compile_id: CompileId
+    # Starts at 0 and is incremented each time analysis is restarted
+    attempt: int
+
+    def __str__(self):
+        compile_id, attempt = self
+        # The first attempt prints as the bare compile id; later ones get a suffix.
+        return str(compile_id) if attempt == 0 else f"{compile_id}_{attempt}"
+
+
+class GuardSource(enum.Enum):
+    # Kinds of guard source; the predicates below group related members.
+    LOCAL = 0
+    GLOBAL = 1
+    LOCAL_NN_MODULE = 2
+    GLOBAL_NN_MODULE = 3
+    CONSTANT = 4
+    RANDOM_VALUE = 5
+    SHAPE_ENV = 6
+    LOCAL_FSDP_MODULE = 7
+    GLOBAL_FSDP_MODULE = 8
+    BACKWARD_STATE = 9
+    EPHEMERAL = 10
+    SYNTHETIC_LOCAL = 11
+
+    def is_fsdp_module(self) -> bool:
+        """True for GLOBAL_FSDP_MODULE and LOCAL_FSDP_MODULE."""
+        return (
+            self is GuardSource.GLOBAL_FSDP_MODULE
+            or self is GuardSource.LOCAL_FSDP_MODULE
+        )
+
+    def is_nn_module(self) -> bool:
+        """True for the two NN-module members and for any FSDP-module member."""
+        if self is GuardSource.GLOBAL_NN_MODULE or self is GuardSource.LOCAL_NN_MODULE:
+            return True
+        return self.is_fsdp_module()
+
+    def is_local(self):
+        """True for LOCAL, LOCAL_NN_MODULE and LOCAL_FSDP_MODULE."""
+        return (
+            self is GuardSource.LOCAL
+            or self is GuardSource.LOCAL_NN_MODULE
+            or self is GuardSource.LOCAL_FSDP_MODULE
+        )
+
+
+"""
+Base class for a "GuardBuilder" role.
+
+The GuardBuilderBase role is to represent a scope within which to build a guard. The name is a little
+confusing, as it's not a builder, but it is kept for the sake of avoiding a lot of renames and keeping the
+original reference to torchdynamo's GuardBuilder.
+
+Note: create_fn is invoked with a GuardBuilderBase and a Guard. A GuardBuilder is chosen based
+on GuardSource's select function.
+
+There is value in keeping this GuardBuilderBase empty to keep layering clean.
+"""
+
+
+class GuardBuilderBase:
+    """Intentionally empty base for the "GuardBuilder" role."""
+
+
+class ShapeGuard(NamedTuple):
+    # Pairs a sympy expression with a captured traceback.
+    # NOTE(review): presumably the traceback records where the guard was
+    # created - confirm against the code that constructs ShapeGuard.
+    expr: sympy.Expr
+    stack: CapturedTraceback
+
+
+@dataclasses.dataclass
+class Guard:
+    # originating_source is the source that called the make_guard method to
+    # construct this guard object. The property name specifies what exactly it
+    # is the guard is guarding on.  The meaning of the name is dependent on the
+    # create_fn; you must look at the use-site inside create_fn to know what
+    # name means.
+    #
+    # That being said, although you might think this is just a "name", name is
+    # usually an arbitrary Python expression that will be evaluated with all
+    # globals (and locals, if you create a LOCAL guard) to extract the Python
+    # object that we want to perform guard tests on.  This evaluation
+    # typically happens in GuardBuilder.eval.  In these cases, name is
+    # typically produced by originating_source.name() (not to be confused with
+    # GuardSource - the property source).
+    #
+    # Occasionally, name is not a valid Python expression; sometimes
+    # it is meaningless.  Example create_fns that are like this include
+    # GRAD_MODE and SHAPE_ENV.
+    originating_source: Source
+    create_fn: Callable[[GuardBuilderBase, Guard], None]
+
+    # Export only. These values are written to at time of guard check_fn creation.
+    guard_types: Optional[List[str]] = None
+    code_list: Optional[List[str]] = None
+    obj_weakref: Optional[object] = None
+    guarded_class_weakref: Optional[type] = None
+
+    stack: Optional[CapturedTraceback] = None
+    user_stack: Optional[traceback.StackSummary] = None
+    # Cache for __hash__; filled in on first use.
+    _hash: Optional[int] = None
+
+    def __hash__(self):
+        """Hash on (name, source, identity of create_fn), computed once and cached."""
+        if self._hash is None:
+            self._hash = hash((self.name, self.source, id(self.create_fn)))
+        return self._hash
+
+    def sort_key(self):
+        """Return the tuple used to order guards.
+
+        Orders by source value (-1 when the source is falsy), then name length,
+        then name, then the first line number of the underlying create function.
+        """
+        return (
+            self.source.value if self.source else -1,
+            len(self.name),
+            self.name,
+            self.inner_create_fn().__code__.co_firstlineno,
+        )
+
+    def __lt__(self, other):
+        return self.sort_key() < other.sort_key()
+
+    def inner_create_fn(self):
+        """Return create_fn, unwrapping one level of functools.partial."""
+        if isinstance(self.create_fn, functools.partial):
+            return self.create_fn.func
+        else:
+            return self.create_fn
+
+    @property
+    def name(self) -> str:
+        return self.originating_source.name()
+
+    @property
+    def source(self) -> GuardSource:
+        return self.originating_source.guard_source()
+
+    @staticmethod
+    def weakref_to_str(obj_weakref):
+        """
+        This is a workaround of a Python weakref bug.
+
+        `obj_weakref` is instance returned by `weakref.ref`,
+        `str(obj_weakref)` is buggy if the original obj overrides __getattr__, e.g:
+
+            class MyConfig(dict):
+                def __getattr__(self, x):
+                    return self[x]
+
+            obj = MyConfig(offset=5)
+            obj_weakref = weakref.ref(obj)
+            str(obj_weakref)  # raise error: KeyError: '__name__'
+
+        Returns a description built from the referent's class name (looked up
+        on the class, so an overridden __getattr__ is never consulted) for a
+        live reference, a "dead" description for a cleared reference, and
+        `str(obj_weakref)` for anything that is not a `weakref.ref`.
+        """
+        if isinstance(obj_weakref, weakref.ReferenceType):
+            obj = obj_weakref()
+            if obj is not None:
+                return f"<weakref at {hex(id(obj_weakref))}; to '{obj.__class__.__name__}' at {hex(id(obj))}>"
+            else:
+                return f"<weakref at {hex(id(obj_weakref))}; dead>"
+        else:
+            return str(obj_weakref)
+
+    def __repr__(self):
+        s = f"""
+        {self.source.name.lower() if self.source else ""} {repr(self.name)} {self.inner_create_fn().__name__}
+        {{
+            'guard_types': {self.guard_types},
+            'code': {self.code_list},
+            'obj_weakref': {self.weakref_to_str(self.obj_weakref)}
+            'guarded_class': {self.guarded_class_weakref}
+        }}
+        """
+        return s
+
+    def __str__(self):
+        output = f"Name: {repr(self.name)}\n"
+        source = self.source.name.lower() if self.source else ""
+        output += f"    Source: {source}\n"
+        output += f"    Create Function: {self.inner_create_fn().__name__}\n"
+        output += f"    Guard Types: {self.guard_types}\n"
+        output += f"    Code List: {self.code_list}\n"
+        output += f"    Object Weakref: {self.weakref_to_str(self.obj_weakref)}\n"
+        output += f"    Guarded Class Weakref: {self.guarded_class_weakref}\n"
+        return output
+
+    def create(self, builder: GuardBuilderBase):
+        """Call create_fn(builder, self); on failure log this guard and re-raise."""
+        try:
+            return self.create_fn(builder, self)
+        except Exception:
+            log.error("Error while creating guard:\n%s", str(self).rstrip())
+            if self.stack:
+                # Only the last four formatted stack entries are logged.
+                log.error("Created at:\n%s", "".join(self.stack.format()[-4:]).rstrip())
+            raise
+
+    def is_nn_module(self):
+        return self.source.is_nn_module()
+
+    def is_fsdp_module(self):
+        return self.source.is_fsdp_module()
+
+    def is_local(self):
+        return self.source.is_local()
+
+    def set_export_info(self, guard_type, guarded_class, code_list, obj_weakref):
+        """Record export information on this guard.
+
+        Appends `guard_type` and `code_list` to what is already recorded, and
+        stores `guarded_class` and `obj_weakref`, asserting that each matches
+        any value recorded by an earlier call.
+        """
+        if not self.guard_types:
+            self.guard_types = list()
+
+        self.guard_types.append(guard_type)
+
+        assert self.guarded_class_weakref in (
+            guarded_class,
+            None,
+        ), "Guarded class id must be identical, or None"
+        self.guarded_class_weakref = guarded_class
+
+        if not self.code_list:
+            self.code_list = code_list
+        else:
+            self.code_list.extend(code_list)
+
+        assert self.obj_weakref in (
+            obj_weakref,
+            None,
+        ), "Guarded object must be identical, or None"
+        self.obj_weakref = obj_weakref
+
+
+T = TypeVar("T")
+
+"""
+Parent structure for guard env expressions.
+A GuardEnvExpr can have any subtype.
+Note: All subtypes must be handled exhaustively in
+torch._dynamo.guards._parse_guard_env_guards to avoid a RuntimeError.
+"""
+
+
+@dataclasses.dataclass
+class GuardEnvExpr:
+    """Field-less parent dataclass for guard env expressions."""
+
+
+"""
+A class representing a pair of duplicate inputs.
+input_source_a and input_source_b are the sources of the two inputs we have deduped.
+"""
+
+
+@dataclasses.dataclass
+class DuplicateInputs(GuardEnvExpr):
+    # The two sources recorded as a duplicate pair.
+    input_source_a: Source
+    input_source_b: Source
+
+    def __post_init__(self):
+        # The two sources must differ: a source is never paired with itself.
+        assert self.input_source_a != self.input_source_b
+
+
+"""
+Checkpointable is an interface for driving state snapshotting, left purposely vague for now.
+
+copy_graphstate() -> T, a somewhat legacy name, is expected to emit a snapshot of any type that
+can also be taken in at restore_graphstate(T) calls.
+
+When to snapshot is, at the moment, an implementation detail of upstream callers. Checkpointable
+does not provide any guarantees around consistency, idempotency, or safety of calling its APIs, yet.
+
+In the future, it will have a closer coupling to a generic Checkpoint management system.
+"""
+
+
+class Checkpointable(ABC, Generic[T]):
+    # Abstract interface: subclasses provide both snapshot and restore.
+
+    @abstractmethod
+    def copy_graphstate(self) -> T:
+        """Emit a snapshot of the current state."""
+
+    @abstractmethod
+    def restore_graphstate(self, state: T):
+        """Take in a snapshot previously emitted by copy_graphstate."""
+
+
+class GuardsCheckpointState:
+    """
+    The GuardsCheckpointState - it is the T of Checkpointable[T] for GuardsContext
+    """
+
+    dynamo_guards: Set[Guard] = set()
+
+    def __init__(self, dynamo_guards):
+        self.dynamo_guards = dynamo_guards
+
+    def diff(self, other):
+        """
+        Compare against another GuardsCheckpointState.
+
+        Returns the set of guards held here but absent from `other`, or None
+        when there are none.
+        """
+        missing = self.dynamo_guards.difference(other.dynamo_guards)
+        return missing if len(missing) != 0 else None
+
+    def __eq__(self, other):
+        # Equal when diff() finds nothing; the comparison is one-directional.
+        return self.diff(other) is None
+
+
+class ModuleContextCheckpointState:
+    nn_modules: Dict[str, torch.nn.Module] = {}
+
+    def __init__(self, nn_modules):
+        self.nn_modules = nn_modules
+
+    def diff(self, other):
+        """
+        Compare against another ModuleContextCheckpointState.
+
+        Returns the set of module key names held here but absent from `other`,
+        or None when there are none.
+        """
+        own_keys = set(self.nn_modules.keys())
+        other_keys = set(other.nn_modules.keys())
+        missing = own_keys.difference(other_keys)
+        return missing if len(missing) != 0 else None
+
+    def __eq__(self, other):
+        # Equal when diff() finds nothing; only key names are compared.
+        return self.diff(other) is None
+
+
+class ModuleContext(Checkpointable[ModuleContextCheckpointState]):
+    # Checkpointable holder for a name -> module mapping.
+
+    def __init__(self):
+        self.nn_modules: Dict[str, Any] = {}
+
+    def copy_graphstate(self):
+        """Snapshot the mapping; the snapshot holds a shallow copy of it."""
+        snapshot = dict(self.nn_modules)
+        return ModuleContextCheckpointState(snapshot)
+
+    def restore_graphstate(self, state):
+        """Adopt the mapping stored in `state` (no copy is made)."""
+        assert isinstance(state, ModuleContextCheckpointState)
+        self.nn_modules = state.nn_modules
+
+
+class GlobalContextCheckpointState:
+    global_state: Dict[str, Tuple[Callable, ...]] = {}
+
+    def __init__(self, global_states):
+        self.global_state = global_states
+
+    def diff(self, other):
+        """
+        Compare against another GlobalContextCheckpointState.
+
+        Returns the set of global key names held here but absent from `other`,
+        or None when there are none.
+        """
+        own_keys = set(self.global_state.keys())
+        other_keys = set(other.global_state.keys())
+        missing = own_keys.difference(other_keys)
+        return missing if len(missing) != 0 else None
+
+    def __eq__(self, other):
+        # Equal when diff() finds nothing; only key names are compared.
+        return self.diff(other) is None
+
+
+class GlobalContext(Checkpointable[GlobalContextCheckpointState]):
+    """
+    This keeps track of the global torch state during tracing of a function.
+    For example, torch.is_grad_enabled.
+    """
+
+    _supported_global_states = {
+        "grad_enabled",
+        "torch_function_enabled",
+        "autocast_enabled",
+        "autocast_cpu_enabled",
+        "autocast_gpu_dtype",
+        "autocast_cpu_dtype",
+        "autocast_cache_enabled",
+    }
+
+    def __init__(self):
+        self.global_state: Dict[str, Tuple[Callable, ...]] = {}
+
+    def copy_graphstate(self):
+        return GlobalContextCheckpointState(dict(self.global_state))
+
+    def restore_graphstate(self, state):
+        assert isinstance(state, GlobalContextCheckpointState)
+        self.global_state = state.global_state
+        assert (
+            len(self.global_state) == len(self._supported_global_states)
+            and set(self.global_state.keys()) == self._supported_global_states
+        ), "Global state mismatch"
+        for func, args in self.global_state.values():
+            func(args)
+
+
+"""
+A GuardsContext is a checkpointable representation of all the guards in the current tracing
+context. Its lifecycle is bound 1:1 to the tracing context, and it should never be instantiated
+directly outside of it. For passing around internal state representations of this object,
+prefer to extract them with copy_graphstate to produce a GuardsCheckpointState.
+"""
+
+
+# Like a Set[Guard] but will record the user stack on all guards at the
+# time they were installed at their destination
+class GuardsSet:
+    def __init__(self, inner=None):
+        if inner is None:
+            inner = set()
+        self.inner = inner
+
+    def __iter__(self):
+        return iter(self.inner)
+
+    def __len__(self):
+        return len(self.inner)
+
+    # Subtraction along with bool is typically used to determine the delta of
+    # added guards between checkpoints for higher order ops
+    def __sub__(self, other):
+        return GuardsSet(self.inner - other.inner)
+
+    def __bool__(self):
+        return bool(self.inner)
+
+    def add(self, guard: Guard, *, collect_debug_stack=True, skip=0):
+        if guard in self.inner:
+            return
+        if collect_debug_stack:
+            if guard.stack is None:
+                guard.stack = CapturedTraceback.extract(skip=1 + skip)
+            if guard.user_stack is None:
+                guard.user_stack = TracingContext.extract_stack()
+        self.inner.add(guard)
+
+    def update(self, *others: Set[Guard]):
+        for o in others:
+            for g in o:
+                self.add(g, skip=1)
+
+    def remove_guards_with_source(self, source):
+        """Delete all guards with a given source"""
+        self.inner = {g for g in self.inner if g.originating_source != source}
+
+
+class GuardsContext(Checkpointable[GuardsCheckpointState]):
+    def __init__(self):
+        self.dynamo_guards: GuardsSet = GuardsSet()
+        self.aotautograd_guards: List[GuardEnvExpr] = []
+
+    def copy_graphstate(self):
+        return GuardsCheckpointState(set(self.dynamo_guards.inner))
+
+    def restore_graphstate(self, state):
+        # NB: "steals" the passed in state
+        assert isinstance(state, GuardsCheckpointState)
+        self.dynamo_guards = GuardsSet(state.dynamo_guards)
+
+
+_TLS = threading.local()
+
+"""
+TracingContext is the source of truth for all currently accumulated information
+needed to trace. Its lifecycle is kept 1:1 when using TorchDynamo, but other systems
+are open to managing their own TracingContext with that in mind.
+
+The purpose of TracingContext is not to be a dumping ground, or god object, but rather to avoid
+having to plumb complex subsystems across multiple verticals.
+
+Ex: A common example is guard accumulation between dynamo, shape_env, aot_autograd, and inductor.
+Accessing the current tracing context via
+TracingContext.get() allows users to accumulate their own guards for processing, without needing to know how
+to plumb objects back up to where frame interpretation happened.
+
+Note that you can end up with multiple TracingContext for a single compilation
+of a frame, as we reset the TracingContext whenever we restart analysis.
+CompileContext is a more overarching context that encompasses multiple restarts.
+"""
+
+
+class CompileContext:
+    @staticmethod
+    def get() -> CompileContext:
+        assert _TLS.compile_context is not None
+        return _TLS.compile_context
+
+    @staticmethod
+    def try_get() -> Optional[CompileContext]:
+        return getattr(_TLS, "compile_context", None)
+
+    def __init__(self, compile_id):
+        assert compile_id is None or isinstance(compile_id, CompileId)
+        self.compile_id: Optional[CompileId] = compile_id
+        self.attempt = 0
+
+    @staticmethod
+    def current_compile_id():
+        self = CompileContext.try_get()
+        if self is None:
+            return None
+        return self.compile_id
+
+    @staticmethod
+    def current_trace_id():
+        self = CompileContext.try_get()
+        if self is None:
+            return None
+        if self.compile_id is None:
+            return None
+        return TraceId(self.compile_id, self.attempt)
+
+
+class TracingContext:
+    """
+    Per-trace container for the state accumulated while tracing: the guards,
+    module and global contexts, the fake mode, and the user frame stack.
+
+    The currently installed instance is stored on the thread-local `_TLS` and
+    is looked up with `try_get()` / `get()`.
+    """
+
+    @staticmethod
+    def try_get() -> Optional[TracingContext]:
+        """
+        Provides the currently installed TracingContext, or None.
+
+        Note that it is a staticmethod, and invocations outside of `with tracing()` (see below), are valid but
+        will return None.
+        """
+        return getattr(_TLS, "tracing_context", None)
+
+    @staticmethod
+    def get() -> TracingContext:
+        """
+        Return the currently installed TracingContext.
+
+        Raises RuntimeError when try_get() yields nothing (or a falsy value).
+        """
+        if ctx := TracingContext.try_get():
+            return ctx
+        raise RuntimeError(
+            "TracingContext.get() must be called within an ongoing trace."
+        )
+
+    def __init__(self, fake_mode):
+        self.guards_context = GuardsContext()
+        self.module_context = ModuleContext()
+        self.global_context = GlobalContext()
+        self.fake_mode = fake_mode
+        self.frame_summary_stack = []
+        # This is morally part of frame_summary_stack, but it is kept separate
+        # for clarity.  As we process a frame, this variable gets updated
+        # to keep track of what line we are in the function.  We make a
+        # function call, this gets cleared and the frame location is pushed
+        # to frame_summary_stack (prepping this variable for the inner frame's
+        # progress)
+        self.loc_in_frame = None
+        # this is only set after aot_autograd
+        self.fw_metadata = None
+        self.params_flat = None
+        # this is for extended return calling convention from backend
+        # compiler to aot_autograd
+        # Per output, what the compiler specified stride of the output is,
+        # or None if no stride is known.  This is always the HINT, it
+        # is never a SymInt (it would be better if it was a SymInt, but
+        # I can't conveniently get this from Inductor atm.  Also, be
+        # careful not to accidentally induce guards on the SymInt if
+        # you ever do change this in aot_autograd.py; you should check
+        # on permutations preferentially.)
+        self.output_strides: Optional[List[Optional[List[int]]]] = None
+        # When this is True, whenever we encounter an int in Dynamo tracing,
+        # we will (1) force unspec it and (2) force it as a size-like unbacked
+        # integer.  This is currently used when processing certain lists of
+        # ints that are known to be size-like and may have 0/1 entries that we
+        # must not specialize on.
+        self.force_unspec_int_unbacked_size_like = False
+        # See note [Tensor Fakification and Symbol Caching]
+        self.tensor_to_context = WeakTensorKeyDictionary()
+
+        # If this is true, AOT Autograd will return output Fake Tensors with appropriate
+        # meta on the first invocation
+        # see note: [Returning Fake Tensors on First AOT Autograd Call]
+        self.fakify_first_call = False
+
+    def clear(self):
+        """Reset the global context's recorded state to an empty dict."""
+        # Look at the note in output_graph.py in function `save_global_state`
+        # for the context on clearing global context.
+        self.global_context.global_state = {}
+
+    @staticmethod
+    @contextmanager
+    def patch(**kwargs):
+        """
+        Temporarily override attributes on the current TracingContext.
+
+        Each keyword names an existing attribute; its prior value is saved,
+        the new value is set for the duration of the `with` block, and the
+        prior value is put back on exit (including on exceptions).
+        """
+        prior = {}
+        ctx = TracingContext.get()
+
+        for key in kwargs.keys():
+            # AttributeError on invalid entry (getattr is called without a default)
+            prior[key] = getattr(ctx, key)
+        for key, val in kwargs.items():
+            setattr(ctx, key, val)
+        try:
+            yield
+        finally:
+            for key, val in prior.items():
+                setattr(ctx, key, val)
+
+    @staticmethod
+    def extract_stack():
+        """
+        Return a traceback.StackSummary for the current user frame stack.
+
+        The summary is frame_summary_stack plus loc_in_frame (when set); it is
+        empty when no TracingContext is installed.
+        """
+        self = TracingContext.try_get()
+        if self is None:
+            return traceback.StackSummary()
+        stack = self.frame_summary_stack
+        if self.loc_in_frame is not None:
+            # Builds a new list, so frame_summary_stack itself is not mutated.
+            stack = stack + [self.loc_in_frame]
+        return traceback.StackSummary.from_list(stack)
+
+    # Call this when you want to call into some code that isn't necessarily
+    # associated with the current frame state
+    @staticmethod
+    @contextlib.contextmanager
+    def clear_frame():
+        """
+        Run the `with` body with frame_summary_stack replaced by an empty list
+        and loc_in_frame replaced by None; both are restored on exit.
+        """
+        tc = TracingContext.get()
+        with unittest.mock.patch.object(
+            tc, "frame_summary_stack", []
+        ), unittest.mock.patch.object(tc, "loc_in_frame", None):
+            try:
+                yield
+            except Exception as e:
+                # Prevent real_stack from getting attached
+                #
+                # The invariant is that if an Exception has real_stack, we've
+                # appropriately attached a user stack and we no longer need to
+                # attach anything. Because we cannot conveniently interpose
+                # when an exception is thrown, we instead interpose everywhere
+                # we set what the user stack is set (using the context
+                # manager). However, our compiler stack does "tail calls"
+                # (when it calls into user compiler), at which point the
+                # parent exception frames would incorrectly attach an
+                # incorrect frame.
+                #
+                # However, if, somehow, someone raised an exception with this
+                # scope that had a stack (for example, because they are
+                # restoring the user stack state appropriately as they process
+                # node by node), we should respect it. Thus, we cannot
+                # unconditionally set None.
+                if not hasattr(e, "real_stack"):
+                    e.real_stack = None  # type: ignore[attr-defined]
+                raise
+
+    @staticmethod
+    @contextlib.contextmanager
+    def current_frame(frame_summary):
+        """
+        Push `frame_summary` (if not None) onto frame_summary_stack for the
+        duration of the `with` body, with loc_in_frame reset to None.
+
+        An exception escaping the body that has no `real_stack` attribute gets
+        one set from extract_stack(). The pushed frame is popped and the
+        previous loc_in_frame restored on exit.
+        """
+        # frame_summary can be None to solely take advantage of real_stack
+        # attachment to thrown exceptions
+        tc = TracingContext.get()
+        if frame_summary is not None:
+            tc.frame_summary_stack.append(frame_summary)
+        old = tc.loc_in_frame
+        tc.loc_in_frame = None
+        try:
+            yield
+        except Exception as e:
+            if not hasattr(e, "real_stack"):
+                e.real_stack = tc.extract_stack()  # type: ignore[attr-defined]
+            raise
+        finally:
+            if frame_summary is not None:
+                tc.frame_summary_stack.pop()
+            tc.loc_in_frame = old
+
+    @staticmethod
+    @contextlib.contextmanager
+    def report_output_strides():
+        """
+        Yield a fresh list installed as `output_strides` on the current
+        TracingContext, restoring the previous value on exit.
+
+        Yields None (and touches nothing) when no TracingContext is installed.
+        """
+        tc = TracingContext.try_get()
+        if tc is None:
+            yield None
+            return
+        old_output_strides = tc.output_strides
+        tc.output_strides = []
+        try:
+            yield tc.output_strides
+        finally:
+            tc.output_strides = old_output_strides
+
+    @staticmethod
+    def set_current_loc(filename, lineno, frame_name):
+        """Set loc_in_frame on the current TracingContext to a new FrameSummary."""
+        TracingContext.get().loc_in_frame = traceback.FrameSummary(
+            filename, lineno, frame_name
+        )
+
+
+@contextmanager
+def compile_context(context: CompileContext):
+    old_context = getattr(_TLS, "compile_context", None)
+    _TLS.compile_context = context
+    try:
+        yield context
+    finally:
+        _TLS.compile_context = old_context
+
+
+@contextmanager
+def tracing(context: Optional[TracingContext]):
+    """
+    This function installs the passed in tracing context as a dynamic scoped
+    global variable.
+
+    While not under a `with tracing()` context, TracingContext.try_get()
+    returns None and TracingContext.get() raises RuntimeError.
+
+    An exception escaping the body that has no `real_stack` attribute gets one
+    set from `context.extract_stack()` (when `context` is not None). On exit
+    the fake mode's shape_env, if any, has `cleanup()` called and the
+    previously installed context is restored.
+    """
+    old_context = getattr(_TLS, "tracing_context", None)
+    _TLS.tracing_context = context
+    try:
+        yield context
+    except Exception as e:
+        if not hasattr(e, "real_stack") and context is not None:
+            e.real_stack = context.extract_stack()  # type: ignore[attr-defined]
+        raise
+    finally:
+        # Cleanup runs before the previous context is put back.
+        if (
+            context is not None
+            and context.fake_mode is not None
+            and context.fake_mode.shape_env is not None
+        ):
+            context.fake_mode.shape_env.cleanup()
+        _TLS.tracing_context = old_context
+
+
+# Subclasses can be found in torch/_dynamo/source.py
+# TODO(voz): Consider a toplevel torch/_source.py
+@dataclasses.dataclass(frozen=True)
+class Source:
+    """
+    Base class for sources. `reconstruct`, `guard_source` and `name` raise
+    NotImplementedError here and are expected to be provided by subclasses.
+    """
+
+    def is_dict_key(self):
+        # Base default; ChainedSource delegates this to its base.
+        return False
+
+    def is_ephemeral(self):
+        # Base default; ChainedSource delegates this to its base.
+        return False
+
+    def reconstruct(self, codegen):
+        raise NotImplementedError()
+
+    def guard_source(self) -> GuardSource:
+        raise NotImplementedError()
+
+    def name(self) -> str:
+        raise NotImplementedError()
+
+    def make_guard(self, fn) -> Guard:
+        """
+        Build a Guard for this source with `fn`.
+
+        Raises NotImplementedError when guard_source() is GuardSource.CONSTANT.
+        """
+        if self.guard_source() is GuardSource.CONSTANT:
+            raise NotImplementedError()
+        return Guard(self, fn)
+
+    def is_nn_module(self) -> bool:
+        # Delegates to the GuardSource returned by guard_source().
+        return self.guard_source().is_nn_module()
+
+    def subguards_allowed(self):
+        """True if you can guard on attributes of this"""
+        return self.guard_source() != GuardSource.SYNTHETIC_LOCAL
+
+
+# Subclasses can be found in torch/_dynamo/source.py
+@dataclasses.dataclass(frozen=True)
+class ChainedSource(Source):
+    """A Source that wraps another Source (`base`) and defers to it."""
+
+    # The wrapped source that is_dict_key()/is_ephemeral() delegate to.
+    base: Source
+
+    def is_dict_key(self):
+        # Recurse until you either hit a ConstDictKey or a Source
+        return self.base.is_dict_key()
+
+    def is_ephemeral(self):
+        # Same delegation as is_dict_key(): the answer comes from the base.
+        return self.base.is_ephemeral()
+
+
+def detect_fake_mode(inputs: Any = None):
+    """
+    Attempts to "detect" what the current fake mode is.  If there is one ambiently
+    available from TracingContext, we preferentially use that.  Otherwise, we
+    heuristically detect the fake mode via the following sources, in order of
+    priority:
+
+        - Currently active fake mode on stack
+        - Fake mode associated with passed in tensors (inputs does not
+          have to be flattened)
+    """
+    from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
+
+    fake_modes = []
+
+    if context := TracingContext.try_get():
+        fake_mode = context.fake_mode
+        if fake_mode is not None:
+            fake_modes.append((fake_mode, "tracing context", 0))
+
+    from torch.utils._python_dispatch import _get_current_dispatch_mode_stack
+
+    for i, m in enumerate(reversed(_get_current_dispatch_mode_stack())):
+        if isinstance(m, FakeTensorMode):
+            fake_modes.append((m, "active fake mode", i))
+
+    flat_inputs = pytree.tree_leaves(inputs)
+    for i, flat_input in enumerate(flat_inputs):
+        if isinstance(flat_input, FakeTensor):
+            fake_modes.append((flat_input.fake_mode, "fake tensor input", i))
+
+    if fake_modes:
+        fake_mode, desc1, i1 = fake_modes[0]
+        for m, desc2, i2 in fake_modes[1:]:
+            assert fake_mode is m, (
+                f"fake mode ({fake_mode}) from {desc1} {i1} doesn't match mode ({m}) from {desc2} {i2}\n\n"
+                f"fake mode from {desc1} {i1} allocated at:\n{fake_mode.stack}\n"
+                f"fake mode from {desc2} {i2} allocated at:\n{m.stack}"
+            )
+        return fake_mode
+    else:
+        return None
+
+
+def active_fake_mode():
+    """
+    Inspects the dispatch mode stack for an active fake mode and returns it.
+    Returns None if no fake mode is active.
+    """
+    from torch._subclasses.fake_tensor import FakeTensorMode
+    from torch.utils._python_dispatch import _get_current_dispatch_mode_stack
+
+    for _, m in enumerate(reversed(_get_current_dispatch_mode_stack())):
+        if isinstance(m, FakeTensorMode):
+            return m
+
+    return None
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_sources.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_sources.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f56bd8ef2473aa9c35ad6232448c9d5d44b8056
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_sources.py
@@ -0,0 +1,137 @@
+import ast
+import functools
+import inspect
+from textwrap import dedent
+from typing import Any, List, NamedTuple, Optional, Tuple
+
+from torch._C import ErrorReport
+from torch._C._jit_tree_views import SourceRangeFactory
+
+
+def get_source_lines_and_file(
+    obj: Any,
+    error_msg: Optional[str] = None,
+) -> Tuple[List[str], int, Optional[str]]:
+    """
+    Wrapper around inspect.getsourcelines and inspect.getsourcefile.
+
+    Returns: (sourcelines, file_lino, filename)
+    """
+    filename = None  # in case getsourcefile throws
+    try:
+        filename = inspect.getsourcefile(obj)
+        sourcelines, file_lineno = inspect.getsourcelines(obj)
+    except OSError as e:
+        msg = (
+            f"Can't get source for {obj}. TorchScript requires source access in "
+            "order to carry out compilation, make sure original .py files are "
+            "available."
+        )
+        if error_msg:
+            msg += "\n" + error_msg
+        raise OSError(msg) from e
+
+    return sourcelines, file_lineno, filename
+
+
+def normalize_source_lines(sourcelines: List[str]) -> List[str]:
+    """
+    This helper function accepts a list of source lines. It finds the
+    indentation level of the function definition (`def`), then it indents
+    all lines in the function body to a point at or greater than that
+    level. This allows for comments and continued string literals that
+    are at a lower indentation than the rest of the code.
+    Args:
+        sourcelines: function source code, separated into lines by
+                        the '\n' character
+    Returns:
+        A list of source lines that have been correctly aligned
+    """
+
+    def remove_prefix(text, prefix):
+        return text[text.startswith(prefix) and len(prefix) :]
+
+    # Find the line and line number containing the function definition
+    idx = None
+    for i, l in enumerate(sourcelines):
+        if l.lstrip().startswith("def"):
+            idx = i
+            break
+
+    # This will happen when the function is a lambda- we won't find "def" anywhere in the source
+    # lines in that case. Currently trying to JIT compile a lambda will throw an error up in
+    # `parse_def()`, but we might want to handle this case in the future.
+    if idx is None:
+        return sourcelines
+
+    # Get a string representing the amount of leading whitespace
+    fn_def = sourcelines[idx]
+    whitespace = fn_def.split("def")[0]
+
+    # Add this leading whitespace to all lines before and after the `def`
+    aligned_prefix = [
+        whitespace + remove_prefix(s, whitespace) for s in sourcelines[:idx]
+    ]
+    aligned_suffix = [
+        whitespace + remove_prefix(s, whitespace) for s in sourcelines[idx + 1 :]
+    ]
+
+    # Put it together again
+    aligned_prefix.append(fn_def)
+    return aligned_prefix + aligned_suffix
+
+
+# Thin wrapper around SourceRangeFactory to store extra metadata
+# about the function-to-be-compiled.
+class SourceContext(SourceRangeFactory):
+    """
+    SourceRangeFactory that additionally remembers `uses_true_division`,
+    `filename` and `funcname` as plain attributes.
+    """
+
+    def __init__(
+        self,
+        source,
+        filename,
+        file_lineno,
+        leading_whitespace_len,
+        uses_true_division=True,
+        funcname=None,
+    ):
+        # Only the first four arguments are forwarded to the base class; the
+        # remaining ones are kept on this object.
+        super().__init__(source, filename, file_lineno, leading_whitespace_len)
+        self.uses_true_division = uses_true_division
+        self.filename = filename
+        self.funcname = funcname
+
+
+@functools.lru_cache(maxsize=None)
+def make_source_context(*args):
+    """
+    Build a SourceContext from positional arguments.
+
+    Results are memoized with an unbounded lru_cache, so the arguments must be
+    hashable and identical arguments return the same SourceContext object.
+    """
+    return SourceContext(*args)
+
+
+def fake_range():
+    """Return a raw range (0, 1) made from an empty, file-less SourceContext."""
+    return SourceContext("", None, 0, 0).make_raw_range(0, 1)
+
+
+class ParsedDef(NamedTuple):
+    """Result bundle returned by parse_def()."""
+
+    # AST parsed from the dedented source
+    ast: ast.Module
+    # SourceContext created for the (non-dedented) source
+    ctx: SourceContext
+    # Normalized source text, before dedenting
+    source: str
+    # Source file name as reported by inspect, if any
+    filename: Optional[str]
+    # Line number reported by inspect.getsourcelines for the source
+    file_lineno: int
+
+
+def parse_def(fn):
+    sourcelines, file_lineno, filename = get_source_lines_and_file(
+        fn, ErrorReport.call_stack()
+    )
+    sourcelines = normalize_source_lines(sourcelines)
+    source = "".join(sourcelines)
+    dedent_src = dedent(source)
+    py_ast = ast.parse(dedent_src)
+    if len(py_ast.body) != 1 or not isinstance(py_ast.body[0], ast.FunctionDef):
+        raise RuntimeError(
+            f"Expected a single top-level function: {filename}:{file_lineno}"
+        )
+    leading_whitespace_len = len(source.split("\n", 1)[0]) - len(
+        dedent_src.split("\n", 1)[0]
+    )
+    ctx = make_source_context(
+        source, filename, file_lineno, leading_whitespace_len, True, fn.__name__
+    )
+    return ParsedDef(py_ast, ctx, source, filename, file_lineno)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_tensor_docs.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_tensor_docs.py
new file mode 100644
index 0000000000000000000000000000000000000000..2543177fdd4615f2afdea3ece5916639ff6dc0a6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_tensor_docs.py
@@ -0,0 +1,6976 @@
+"""Adds docstrings to Tensor functions"""
+
+import torch._C
+from torch._C import _add_docstr as add_docstr
+from torch._torch_docs import parse_kwargs, reproducibility_notes
+
+
+def add_docstr_all(method, docstr):
+    """Attach `docstr` to the attribute named `method` on torch._C.TensorBase."""
+    add_docstr(getattr(torch._C.TensorBase, method), docstr)
+
+
+common_args = parse_kwargs(
+    """
+    memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+        returned Tensor. Default: ``torch.preserve_format``.
+"""
+)
+
+new_common_args = parse_kwargs(
+    """
+    size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+        shape of the output tensor.
+    dtype (:class:`torch.dtype`, optional): the desired type of returned tensor.
+        Default: if None, same :class:`torch.dtype` as this tensor.
+    device (:class:`torch.device`, optional): the desired device of returned tensor.
+        Default: if None, same :class:`torch.device` as this tensor.
+    requires_grad (bool, optional): If autograd should record operations on the
+        returned tensor. Default: ``False``.
+    pin_memory (bool, optional): If set, returned tensor would be allocated in
+        the pinned memory. Works only for CPU tensors. Default: ``False``.
+    layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+        Default: ``torch.strided``.
+"""
+)
+
+add_docstr_all(
+    "new_tensor",
+    """
+new_tensor(data, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \
+pin_memory=False) -> Tensor
+"""
+    + r"""
+
+Returns a new Tensor with :attr:`data` as the tensor data.
+By default, the returned Tensor has the same :class:`torch.dtype` and
+:class:`torch.device` as this tensor.
+
+.. warning::
+
+    :func:`new_tensor` always copies :attr:`data`. If you have a Tensor
+    ``data`` and want to avoid a copy, use :func:`torch.Tensor.requires_grad_`
+    or :func:`torch.Tensor.detach`.
+    If you have a numpy array and want to avoid a copy, use
+    :func:`torch.from_numpy`.
+
+.. warning::
+
+    When data is a tensor `x`, :func:`new_tensor()` reads out 'the data' from whatever it is passed,
+    and constructs a leaf variable. Therefore ``tensor.new_tensor(x)`` is equivalent to ``x.clone().detach()``
+    and ``tensor.new_tensor(x, requires_grad=True)`` is equivalent to ``x.clone().detach().requires_grad_(True)``.
+    The equivalents using ``clone()`` and ``detach()`` are recommended.
+
+Args:
+    data (array_like): The returned Tensor copies :attr:`data`.
+
+Keyword args:
+    {dtype}
+    {device}
+    {requires_grad}
+    {layout}
+    {pin_memory}
+
+Example::
+
+    >>> tensor = torch.ones((2,), dtype=torch.int8)
+    >>> data = [[0, 1], [2, 3]]
+    >>> tensor.new_tensor(data)
+    tensor([[ 0,  1],
+            [ 2,  3]], dtype=torch.int8)
+
+""".format(
+        **new_common_args
+    ),
+)
+
+add_docstr_all(
+    "new_full",
+    """
+new_full(size, fill_value, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \
+pin_memory=False) -> Tensor
+"""
+    + r"""
+
+Returns a Tensor of size :attr:`size` filled with :attr:`fill_value`.
+By default, the returned Tensor has the same :class:`torch.dtype` and
+:class:`torch.device` as this tensor.
+
+Args:
+    fill_value (scalar): the number to fill the output tensor with.
+
+Keyword args:
+    {dtype}
+    {device}
+    {requires_grad}
+    {layout}
+    {pin_memory}
+
+Example::
+
+    >>> tensor = torch.ones((2,), dtype=torch.float64)
+    >>> tensor.new_full((3, 4), 3.141592)
+    tensor([[ 3.1416,  3.1416,  3.1416,  3.1416],
+            [ 3.1416,  3.1416,  3.1416,  3.1416],
+            [ 3.1416,  3.1416,  3.1416,  3.1416]], dtype=torch.float64)
+
+""".format(
+        **new_common_args
+    ),
+)
+
+add_docstr_all(
+    "new_empty",
+    """
+new_empty(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \
+pin_memory=False) -> Tensor
+"""
+    + r"""
+
+Returns a Tensor of size :attr:`size` filled with uninitialized data.
+By default, the returned Tensor has the same :class:`torch.dtype` and
+:class:`torch.device` as this tensor.
+
+Args:
+    size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+        shape of the output tensor.
+
+Keyword args:
+    {dtype}
+    {device}
+    {requires_grad}
+    {layout}
+    {pin_memory}
+
+Example::
+
+    >>> tensor = torch.ones(())
+    >>> tensor.new_empty((2, 3))
+    tensor([[ 5.8182e-18,  4.5765e-41, -1.0545e+30],
+            [ 3.0949e-41,  4.4842e-44,  0.0000e+00]])
+
+""".format(
+        **new_common_args
+    ),
+)
+
+add_docstr_all(
+    "new_empty_strided",
+    """
+new_empty_strided(size, stride, dtype=None, device=None, requires_grad=False, layout=torch.strided, \
+pin_memory=False) -> Tensor
+"""
+    + r"""
+
+Returns a Tensor of size :attr:`size` and strides :attr:`stride` filled with
+uninitialized data. By default, the returned Tensor has the same
+:class:`torch.dtype` and :class:`torch.device` as this tensor.
+
+Args:
+    size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+        shape of the output tensor.
+
+Keyword args:
+    {dtype}
+    {device}
+    {requires_grad}
+    {layout}
+    {pin_memory}
+
+Example::
+
+    >>> tensor = torch.ones(())
+    >>> tensor.new_empty_strided((2, 3), (3, 1))
+    tensor([[ 5.8182e-18,  4.5765e-41, -1.0545e+30],
+            [ 3.0949e-41,  4.4842e-44,  0.0000e+00]])
+
+""".format(
+        **new_common_args
+    ),
+)
+
+add_docstr_all(
+    "new_ones",
+    """
+new_ones(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \
+pin_memory=False) -> Tensor
+"""
+    + r"""
+
+Returns a Tensor of size :attr:`size` filled with ``1``.
+By default, the returned Tensor has the same :class:`torch.dtype` and
+:class:`torch.device` as this tensor.
+
+Args:
+    size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+        shape of the output tensor.
+
+Keyword args:
+    {dtype}
+    {device}
+    {requires_grad}
+    {layout}
+    {pin_memory}
+
+Example::
+
+    >>> tensor = torch.tensor((), dtype=torch.int32)
+    >>> tensor.new_ones((2, 3))
+    tensor([[ 1,  1,  1],
+            [ 1,  1,  1]], dtype=torch.int32)
+
+""".format(
+        **new_common_args
+    ),
+)
+
+add_docstr_all(
+    "new_zeros",
+    """
+new_zeros(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \
+pin_memory=False) -> Tensor
+"""
+    + r"""
+
+Returns a Tensor of size :attr:`size` filled with ``0``.
+By default, the returned Tensor has the same :class:`torch.dtype` and
+:class:`torch.device` as this tensor.
+
+Args:
+    size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+        shape of the output tensor.
+
+Keyword args:
+    {dtype}
+    {device}
+    {requires_grad}
+    {layout}
+    {pin_memory}
+
+Example::
+
+    >>> tensor = torch.tensor((), dtype=torch.float64)
+    >>> tensor.new_zeros((2, 3))
+    tensor([[ 0.,  0.,  0.],
+            [ 0.,  0.,  0.]], dtype=torch.float64)
+
+""".format(
+        **new_common_args
+    ),
+)
+
+add_docstr_all(
+    "abs",
+    r"""
+abs() -> Tensor
+
+See :func:`torch.abs`
+""",
+)
+
+add_docstr_all(
+    "abs_",
+    r"""
+abs_() -> Tensor
+
+In-place version of :meth:`~Tensor.abs`
+""",
+)
+
+add_docstr_all(
+    "absolute",
+    r"""
+absolute() -> Tensor
+
+Alias for :func:`abs`
+""",
+)
+
+add_docstr_all(
+    "absolute_",
+    r"""
+absolute_() -> Tensor
+
+In-place version of :meth:`~Tensor.absolute`
+Alias for :func:`abs_`
+""",
+)
+
+add_docstr_all(
+    "acos",
+    r"""
+acos() -> Tensor
+
+See :func:`torch.acos`
+""",
+)
+
+add_docstr_all(
+    "acos_",
+    r"""
+acos_() -> Tensor
+
+In-place version of :meth:`~Tensor.acos`
+""",
+)
+
+add_docstr_all(
+    "arccos",
+    r"""
+arccos() -> Tensor
+
+See :func:`torch.arccos`
+""",
+)
+
+add_docstr_all(
+    "arccos_",
+    r"""
+arccos_() -> Tensor
+
+In-place version of :meth:`~Tensor.arccos`
+""",
+)
+
+add_docstr_all(
+    "acosh",
+    r"""
+acosh() -> Tensor
+
+See :func:`torch.acosh`
+""",
+)
+
+add_docstr_all(
+    "acosh_",
+    r"""
+acosh_() -> Tensor
+
+In-place version of :meth:`~Tensor.acosh`
+""",
+)
+
+add_docstr_all(
+    "arccosh",
+    r"""
+acosh() -> Tensor
+
+See :func:`torch.arccosh`
+""",
+)
+
+add_docstr_all(
+    "arccosh_",
+    r"""
+acosh_() -> Tensor
+
+In-place version of :meth:`~Tensor.arccosh`
+""",
+)
+
+add_docstr_all(
+    "add",
+    r"""
+add(other, *, alpha=1) -> Tensor
+
+Add a scalar or tensor to :attr:`self` tensor. If both :attr:`alpha`
+and :attr:`other` are specified, each element of :attr:`other` is scaled by
+:attr:`alpha` before being used.
+
+When :attr:`other` is a tensor, the shape of :attr:`other` must be
+:ref:`broadcastable ` with the shape of the underlying
+tensor
+
+See :func:`torch.add`
+""",
+)
+
+add_docstr_all(
+    "add_",
+    r"""
+add_(other, *, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.add`
+""",
+)
+
+add_docstr_all(
+    "addbmm",
+    r"""
+addbmm(batch1, batch2, *, beta=1, alpha=1) -> Tensor
+
+See :func:`torch.addbmm`
+""",
+)
+
+add_docstr_all(
+    "addbmm_",
+    r"""
+addbmm_(batch1, batch2, *, beta=1, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.addbmm`
+""",
+)
+
+add_docstr_all(
+    "addcdiv",
+    r"""
+addcdiv(tensor1, tensor2, *, value=1) -> Tensor
+
+See :func:`torch.addcdiv`
+""",
+)
+
+add_docstr_all(
+    "addcdiv_",
+    r"""
+addcdiv_(tensor1, tensor2, *, value=1) -> Tensor
+
+In-place version of :meth:`~Tensor.addcdiv`
+""",
+)
+
+add_docstr_all(
+    "addcmul",
+    r"""
+addcmul(tensor1, tensor2, *, value=1) -> Tensor
+
+See :func:`torch.addcmul`
+""",
+)
+
+add_docstr_all(
+    "addcmul_",
+    r"""
+addcmul_(tensor1, tensor2, *, value=1) -> Tensor
+
+In-place version of :meth:`~Tensor.addcmul`
+""",
+)
+
+add_docstr_all(
+    "addmm",
+    r"""
+addmm(mat1, mat2, *, beta=1, alpha=1) -> Tensor
+
+See :func:`torch.addmm`
+""",
+)
+
+add_docstr_all(
+    "addmm_",
+    r"""
+addmm_(mat1, mat2, *, beta=1, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.addmm`
+""",
+)
+
+add_docstr_all(
+    "addmv",
+    r"""
+addmv(mat, vec, *, beta=1, alpha=1) -> Tensor
+
+See :func:`torch.addmv`
+""",
+)
+
+add_docstr_all(
+    "addmv_",
+    r"""
+addmv_(mat, vec, *, beta=1, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.addmv`
+""",
+)
+
+add_docstr_all(
+    "sspaddmm",
+    r"""
+sspaddmm(mat1, mat2, *, beta=1, alpha=1) -> Tensor
+
+See :func:`torch.sspaddmm`
+""",
+)
+
+add_docstr_all(
+    "smm",
+    r"""
+smm(mat) -> Tensor
+
+See :func:`torch.smm`
+""",
+)
+
+add_docstr_all(
+    "addr",
+    r"""
+addr(vec1, vec2, *, beta=1, alpha=1) -> Tensor
+
+See :func:`torch.addr`
+""",
+)
+
+add_docstr_all(
+    "addr_",
+    r"""
+addr_(vec1, vec2, *, beta=1, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.addr`
+""",
+)
+
+add_docstr_all(
+    "align_as",
+    r"""
+align_as(other) -> Tensor
+
+Permutes the dimensions of the :attr:`self` tensor to match the dimension order
+in the :attr:`other` tensor, adding size-one dims for any new names.
+
+This operation is useful for explicit broadcasting by names (see examples).
+
+All of the dims of :attr:`self` must be named in order to use this method.
+The resulting tensor is a view on the original tensor.
+
+All dimension names of :attr:`self` must be present in ``other.names``.
+:attr:`other` may contain named dimensions that are not in ``self.names``;
+the output tensor has a size-one dimension for each of those new names.
+
+To align a tensor to a specific order, use :meth:`~Tensor.align_to`.
+
+Examples::
+
+    # Example 1: Applying a mask
+    >>> mask = torch.randint(2, [127, 128], dtype=torch.bool).refine_names('W', 'H')
+    >>> imgs = torch.randn(32, 128, 127, 3, names=('N', 'H', 'W', 'C'))
+    >>> imgs.masked_fill_(mask.align_as(imgs), 0)
+
+
+    # Example 2: Applying a per-channel-scale
+    >>> def scale_channels(input, scale):
+    >>>    scale = scale.refine_names('C')
+    >>>    return input * scale.align_as(input)
+
+    >>> num_channels = 3
+    >>> scale = torch.randn(num_channels, names=('C',))
+    >>> imgs = torch.rand(32, 128, 128, num_channels, names=('N', 'H', 'W', 'C'))
+    >>> more_imgs = torch.rand(32, num_channels, 128, 128, names=('N', 'C', 'H', 'W'))
+    >>> videos = torch.randn(3, num_channels, 128, 128, 128, names=('N', 'C', 'H', 'W', 'D'))
+
+    # scale_channels is agnostic to the dimension order of the input
+    >>> scale_channels(imgs, scale)
+    >>> scale_channels(more_imgs, scale)
+    >>> scale_channels(videos, scale)
+
+.. warning::
+    The named tensor API is experimental and subject to change.
+
+""",
+)
+
+add_docstr_all(
+    "all",
+    r"""
+all(dim=None, keepdim=False) -> Tensor
+
+See :func:`torch.all`
+""",
+)
+
+add_docstr_all(
+    "allclose",
+    r"""
+allclose(other, rtol=1e-05, atol=1e-08, equal_nan=False) -> bool
+
+See :func:`torch.allclose`
+""",
+)
+
+add_docstr_all(
+    "angle",
+    r"""
+angle() -> Tensor
+
+See :func:`torch.angle`
+""",
+)
+
+add_docstr_all(
+    "any",
+    r"""
+any(dim=None, keepdim=False) -> Tensor
+
+See :func:`torch.any`
+""",
+)
+
+add_docstr_all(
+    "apply_",
+    r"""
+apply_(callable) -> Tensor
+
+Applies the function :attr:`callable` to each element in the tensor, replacing
+each element with the value returned by :attr:`callable`.
+
+.. note::
+
+    This function only works with CPU tensors and should not be used in code
+    sections that require high performance.
+""",
+)
+
+add_docstr_all(
+    "asin",
+    r"""
+asin() -> Tensor
+
+See :func:`torch.asin`
+""",
+)
+
+add_docstr_all(
+    "asin_",
+    r"""
+asin_() -> Tensor
+
+In-place version of :meth:`~Tensor.asin`
+""",
+)
+
+add_docstr_all(
+    "arcsin",
+    r"""
+arcsin() -> Tensor
+
+See :func:`torch.arcsin`
+""",
+)
+
+add_docstr_all(
+    "arcsin_",
+    r"""
+arcsin_() -> Tensor
+
+In-place version of :meth:`~Tensor.arcsin`
+""",
+)
+
+add_docstr_all(
+    "asinh",
+    r"""
+asinh() -> Tensor
+
+See :func:`torch.asinh`
+""",
+)
+
+add_docstr_all(
+    "asinh_",
+    r"""
+asinh_() -> Tensor
+
+In-place version of :meth:`~Tensor.asinh`
+""",
+)
+
+add_docstr_all(
+    "arcsinh",
+    r"""
+arcsinh() -> Tensor
+
+See :func:`torch.arcsinh`
+""",
+)
+
+add_docstr_all(
+    "arcsinh_",
+    r"""
+arcsinh_() -> Tensor
+
+In-place version of :meth:`~Tensor.arcsinh`
+""",
+)
+
+add_docstr_all(
+    "as_strided",
+    r"""
+as_strided(size, stride, storage_offset=None) -> Tensor
+
+See :func:`torch.as_strided`
+""",
+)
+
+add_docstr_all(
+    "as_strided_",
+    r"""
+as_strided_(size, stride, storage_offset=None) -> Tensor
+
+In-place version of :meth:`~Tensor.as_strided`
+""",
+)
+
+add_docstr_all(
+    "atan",
+    r"""
+atan() -> Tensor
+
+See :func:`torch.atan`
+""",
+)
+
+add_docstr_all(
+    "atan_",
+    r"""
+atan_() -> Tensor
+
+In-place version of :meth:`~Tensor.atan`
+""",
+)
+
+add_docstr_all(
+    "arctan",
+    r"""
+arctan() -> Tensor
+
+See :func:`torch.arctan`
+""",
+)
+
+add_docstr_all(
+    "arctan_",
+    r"""
+arctan_() -> Tensor
+
+In-place version of :meth:`~Tensor.arctan`
+""",
+)
+
+add_docstr_all(
+    "atan2",
+    r"""
+atan2(other) -> Tensor
+
+See :func:`torch.atan2`
+""",
+)
+
+add_docstr_all(
+    "atan2_",
+    r"""
+atan2_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.atan2`
+""",
+)
+
+add_docstr_all(
+    "arctan2",
+    r"""
+arctan2(other) -> Tensor
+
+See :func:`torch.arctan2`
+""",
+)
+
+add_docstr_all(
+    "arctan2_",
+    r"""
+arctan2_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.arctan2`
+""",
+)
+
+add_docstr_all(
+    "atanh",
+    r"""
+atanh() -> Tensor
+
+See :func:`torch.atanh`
+""",
+)
+
+add_docstr_all(
+    "atanh_",
+    r"""
+atanh_() -> Tensor
+
+In-place version of :meth:`~Tensor.atanh`
+""",
+)
+
+add_docstr_all(
+    "arctanh",
+    r"""
+arctanh() -> Tensor
+
+See :func:`torch.arctanh`
+""",
+)
+
+add_docstr_all(
+    "arctanh_",
+    r"""
+arctanh_() -> Tensor
+
+In-place version of :meth:`~Tensor.arctanh`
+""",
+)
+
+add_docstr_all(
+    "baddbmm",
+    r"""
+baddbmm(batch1, batch2, *, beta=1, alpha=1) -> Tensor
+
+See :func:`torch.baddbmm`
+""",
+)
+
+add_docstr_all(
+    "baddbmm_",
+    r"""
+baddbmm_(batch1, batch2, *, beta=1, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.baddbmm`
+""",
+)
+
+add_docstr_all(
+    "bernoulli",
+    r"""
+bernoulli(*, generator=None) -> Tensor
+
+Returns a result tensor where each :math:`\texttt{result[i]}` is independently
+sampled from :math:`\text{Bernoulli}(\texttt{self[i]})`. :attr:`self` must have
+floating point ``dtype``, and the result will have the same ``dtype``.
+
+See :func:`torch.bernoulli`
+""",
+)
+
+add_docstr_all(
+    "bernoulli_",
+    r"""
+bernoulli_(p=0.5, *, generator=None) -> Tensor
+
+Fills each location of :attr:`self` with an independent sample from
+:math:`\text{Bernoulli}(\texttt{p})`. :attr:`self` can have integral
+``dtype``.
+
+:attr:`p` should either be a scalar or tensor containing probabilities to be
+used for drawing the binary random number.
+
+If it is a tensor, the :math:`\text{i}^{th}` element of :attr:`self` tensor
+will be set to a value sampled from
+:math:`\text{Bernoulli}(\texttt{p\_tensor[i]})`. In this case `p` must have
+floating point ``dtype``.
+
+See also :meth:`~Tensor.bernoulli` and :func:`torch.bernoulli`
+""",
+)
+
+add_docstr_all(
+    "bincount",
+    r"""
+bincount(weights=None, minlength=0) -> Tensor
+
+See :func:`torch.bincount`
+""",
+)
+
+add_docstr_all(
+    "bitwise_not",
+    r"""
+bitwise_not() -> Tensor
+
+See :func:`torch.bitwise_not`
+""",
+)
+
+add_docstr_all(
+    "bitwise_not_",
+    r"""
+bitwise_not_() -> Tensor
+
+In-place version of :meth:`~Tensor.bitwise_not`
+""",
+)
+
+add_docstr_all(
+    "bitwise_and",
+    r"""
+bitwise_and(other) -> Tensor
+
+See :func:`torch.bitwise_and`
+""",
+)
+
+add_docstr_all(
+    "bitwise_and_",
+    r"""
+bitwise_and_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.bitwise_and`
+""",
+)
+
+add_docstr_all(
+    "bitwise_or",
+    r"""
+bitwise_or(other) -> Tensor
+
+See :func:`torch.bitwise_or`
+""",
+)
+
+add_docstr_all(
+    "bitwise_or_",
+    r"""
+bitwise_or_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.bitwise_or`
+""",
+)
+
+add_docstr_all(
+    "bitwise_xor",
+    r"""
+bitwise_xor(other) -> Tensor
+
+See :func:`torch.bitwise_xor`
+""",
+)
+
+add_docstr_all(
+    "bitwise_xor_",
+    r"""
+bitwise_xor_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.bitwise_xor`
+""",
+)
+
+add_docstr_all(
+    "bitwise_left_shift",
+    r"""
+bitwise_left_shift(other) -> Tensor
+
+See :func:`torch.bitwise_left_shift`
+""",
+)
+
+add_docstr_all(
+    "bitwise_left_shift_",
+    r"""
+bitwise_left_shift_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.bitwise_left_shift`
+""",
+)
+
+add_docstr_all(
+    "bitwise_right_shift",
+    r"""
+bitwise_right_shift(other) -> Tensor
+
+See :func:`torch.bitwise_right_shift`
+""",
+)
+
+add_docstr_all(
+    "bitwise_right_shift_",
+    r"""
+bitwise_right_shift_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.bitwise_right_shift`
+""",
+)
+
+add_docstr_all(
+    "broadcast_to",
+    r"""
+broadcast_to(shape) -> Tensor
+
+See :func:`torch.broadcast_to`.
+""",
+)
+
+add_docstr_all(
+    "logical_and",
+    r"""
+logical_and(other) -> Tensor
+
+See :func:`torch.logical_and`
+""",
+)
+
+add_docstr_all(
+    "logical_and_",
+    r"""
+logical_and_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.logical_and`
+""",
+)
+
+add_docstr_all(
+    "logical_not",
+    r"""
+logical_not() -> Tensor
+
+See :func:`torch.logical_not`
+""",
+)
+
+add_docstr_all(
+    "logical_not_",
+    r"""
+logical_not_() -> Tensor
+
+In-place version of :meth:`~Tensor.logical_not`
+""",
+)
+
+add_docstr_all(
+    "logical_or",
+    r"""
+logical_or(other) -> Tensor
+
+See :func:`torch.logical_or`
+""",
+)
+
+add_docstr_all(
+    "logical_or_",
+    r"""
+logical_or_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.logical_or`
+""",
+)
+
+add_docstr_all(
+    "logical_xor",
+    r"""
+logical_xor(other) -> Tensor
+
+See :func:`torch.logical_xor`
+""",
+)
+
+add_docstr_all(
+    "logical_xor_",
+    r"""
+logical_xor_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.logical_xor`
+""",
+)
+
+add_docstr_all(
+    "bmm",
+    r"""
+bmm(batch2) -> Tensor
+
+See :func:`torch.bmm`
+""",
+)
+
+add_docstr_all(
+    "cauchy_",
+    r"""
+cauchy_(median=0, sigma=1, *, generator=None) -> Tensor
+
+Fills the tensor with numbers drawn from the Cauchy distribution:
+
+.. math::
+
+    f(x) = \dfrac{1}{\pi} \dfrac{\sigma}{(x - \text{median})^2 + \sigma^2}
+
+.. note::
+  Sigma (:math:`\sigma`) is used to denote the scale parameter in Cauchy distribution.
+""",
+)
+
+add_docstr_all(
+    "ceil",
+    r"""
+ceil() -> Tensor
+
+See :func:`torch.ceil`
+""",
+)
+
+add_docstr_all(
+    "ceil_",
+    r"""
+ceil_() -> Tensor
+
+In-place version of :meth:`~Tensor.ceil`
+""",
+)
+
+add_docstr_all(
+    "cholesky",
+    r"""
+cholesky(upper=False) -> Tensor
+
+See :func:`torch.cholesky`
+""",
+)
+
+add_docstr_all(
+    "cholesky_solve",
+    r"""
+cholesky_solve(input2, upper=False) -> Tensor
+
+See :func:`torch.cholesky_solve`
+""",
+)
+
+add_docstr_all(
+    "cholesky_inverse",
+    r"""
+cholesky_inverse(upper=False) -> Tensor
+
+See :func:`torch.cholesky_inverse`
+""",
+)
+
+add_docstr_all(
+    "clamp",
+    r"""
+clamp(min=None, max=None) -> Tensor
+
+See :func:`torch.clamp`
+""",
+)
+
+add_docstr_all(
+    "clamp_",
+    r"""
+clamp_(min=None, max=None) -> Tensor
+
+In-place version of :meth:`~Tensor.clamp`
+""",
+)
+
+add_docstr_all(
+    "clip",
+    r"""
+clip(min=None, max=None) -> Tensor
+
+Alias for :meth:`~Tensor.clamp`.
+""",
+)
+
+add_docstr_all(
+    "clip_",
+    r"""
+clip_(min=None, max=None) -> Tensor
+
+Alias for :meth:`~Tensor.clamp_`.
+""",
+)
+
+add_docstr_all(
+    "clone",
+    r"""
+clone(*, memory_format=torch.preserve_format) -> Tensor
+
+See :func:`torch.clone`
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "coalesce",
+    r"""
+coalesce() -> Tensor
+
+Returns a coalesced copy of :attr:`self` if :attr:`self` is an
+:ref:`uncoalesced tensor <sparse-uncoalesced-coo-docs>`.
+
+Returns :attr:`self` if :attr:`self` is a coalesced tensor.
+
+.. warning::
+  Throws an error if :attr:`self` is not a sparse COO tensor.
+""",
+)
+
+add_docstr_all(
+    "contiguous",
+    r"""
+contiguous(memory_format=torch.contiguous_format) -> Tensor
+
+Returns a tensor that is contiguous in memory and contains the same data as the
+:attr:`self` tensor. If the :attr:`self` tensor is already in the specified memory
+format, this function returns the :attr:`self` tensor itself.
+
+Args:
+    memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+        returned Tensor. Default: ``torch.contiguous_format``.
+""",
+)
+
+add_docstr_all(
+    "copy_",
+    r"""
+copy_(src, non_blocking=False) -> Tensor
+
+Copies the elements from :attr:`src` into :attr:`self` tensor and returns
+:attr:`self`.
+
+The :attr:`src` tensor must be :ref:`broadcastable <broadcasting-semantics>`
+with the :attr:`self` tensor. It may be of a different data type or reside on a
+different device.
+
+Args:
+    src (Tensor): the source tensor to copy from
+    non_blocking (bool): if ``True`` and this copy is between CPU and GPU,
+        the copy may occur asynchronously with respect to the host. For other
+        cases, this argument has no effect.
+""",
+)
+
+add_docstr_all(
+    "conj",
+    r"""
+conj() -> Tensor
+
+See :func:`torch.conj`
+""",
+)
+
+add_docstr_all(
+    "conj_physical",
+    r"""
+conj_physical() -> Tensor
+
+See :func:`torch.conj_physical`
+""",
+)
+
+add_docstr_all(
+    "conj_physical_",
+    r"""
+conj_physical_() -> Tensor
+
+In-place version of :meth:`~Tensor.conj_physical`
+""",
+)
+
+add_docstr_all(
+    "resolve_conj",
+    r"""
+resolve_conj() -> Tensor
+
+See :func:`torch.resolve_conj`
+""",
+)
+
+add_docstr_all(
+    "resolve_neg",
+    r"""
+resolve_neg() -> Tensor
+
+See :func:`torch.resolve_neg`
+""",
+)
+
+add_docstr_all(
+    "copysign",
+    r"""
+copysign(other) -> Tensor
+
+See :func:`torch.copysign`
+""",
+)
+
+add_docstr_all(
+    "copysign_",
+    r"""
+copysign_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.copysign`
+""",
+)
+
+add_docstr_all(
+    "cos",
+    r"""
+cos() -> Tensor
+
+See :func:`torch.cos`
+""",
+)
+
+add_docstr_all(
+    "cos_",
+    r"""
+cos_() -> Tensor
+
+In-place version of :meth:`~Tensor.cos`
+""",
+)
+
+add_docstr_all(
+    "cosh",
+    r"""
+cosh() -> Tensor
+
+See :func:`torch.cosh`
+""",
+)
+
+add_docstr_all(
+    "cosh_",
+    r"""
+cosh_() -> Tensor
+
+In-place version of :meth:`~Tensor.cosh`
+""",
+)
+
+add_docstr_all(
+    "cpu",
+    r"""
+cpu(memory_format=torch.preserve_format) -> Tensor
+
+Returns a copy of this object in CPU memory.
+
+If this object is already in CPU memory and on the correct device,
+then no copy is performed and the original object is returned.
+
+Args:
+    {memory_format}
+
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "count_nonzero",
+    r"""
+count_nonzero(dim=None) -> Tensor
+
+See :func:`torch.count_nonzero`
+""",
+)
+
+add_docstr_all(
+    "cov",
+    r"""
+cov(*, correction=1, fweights=None, aweights=None) -> Tensor
+
+See :func:`torch.cov`
+""",
+)
+
+add_docstr_all(
+    "corrcoef",
+    r"""
+corrcoef() -> Tensor
+
+See :func:`torch.corrcoef`
+""",
+)
+
+add_docstr_all(
+    "cross",
+    r"""
+cross(other, dim=None) -> Tensor
+
+See :func:`torch.cross`
+""",
+)
+
+add_docstr_all(
+    "cuda",
+    r"""
+cuda(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor
+
+Returns a copy of this object in CUDA memory.
+
+If this object is already in CUDA memory and on the correct device,
+then no copy is performed and the original object is returned.
+
+Args:
+    device (:class:`torch.device`): The destination GPU device.
+        Defaults to the current CUDA device.
+    non_blocking (bool): If ``True`` and the source is in pinned memory,
+        the copy will be asynchronous with respect to the host.
+        Otherwise, the argument has no effect. Default: ``False``.
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "ipu",
+    r"""
+ipu(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor
+
+Returns a copy of this object in IPU memory.
+
+If this object is already in IPU memory and on the correct device,
+then no copy is performed and the original object is returned.
+
+Args:
+    device (:class:`torch.device`): The destination IPU device.
+        Defaults to the current IPU device.
+    non_blocking (bool): If ``True`` and the source is in pinned memory,
+        the copy will be asynchronous with respect to the host.
+        Otherwise, the argument has no effect. Default: ``False``.
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "xpu",
+    r"""
+xpu(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor
+
+Returns a copy of this object in XPU memory.
+
+If this object is already in XPU memory and on the correct device,
+then no copy is performed and the original object is returned.
+
+Args:
+    device (:class:`torch.device`): The destination XPU device.
+        Defaults to the current XPU device.
+    non_blocking (bool): If ``True`` and the source is in pinned memory,
+        the copy will be asynchronous with respect to the host.
+        Otherwise, the argument has no effect. Default: ``False``.
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "logcumsumexp",
+    r"""
+logcumsumexp(dim) -> Tensor
+
+See :func:`torch.logcumsumexp`
+""",
+)
+
+add_docstr_all(
+    "cummax",
+    r"""
+cummax(dim) -> (Tensor, Tensor)
+
+See :func:`torch.cummax`
+""",
+)
+
+add_docstr_all(
+    "cummin",
+    r"""
+cummin(dim) -> (Tensor, Tensor)
+
+See :func:`torch.cummin`
+""",
+)
+
+add_docstr_all(
+    "cumprod",
+    r"""
+cumprod(dim, dtype=None) -> Tensor
+
+See :func:`torch.cumprod`
+""",
+)
+
+add_docstr_all(
+    "cumprod_",
+    r"""
+cumprod_(dim, dtype=None) -> Tensor
+
+In-place version of :meth:`~Tensor.cumprod`
+""",
+)
+
+add_docstr_all(
+    "cumsum",
+    r"""
+cumsum(dim, dtype=None) -> Tensor
+
+See :func:`torch.cumsum`
+""",
+)
+
+add_docstr_all(
+    "cumsum_",
+    r"""
+cumsum_(dim, dtype=None) -> Tensor
+
+In-place version of :meth:`~Tensor.cumsum`
+""",
+)
+
+add_docstr_all(
+    "data_ptr",
+    r"""
+data_ptr() -> int
+
+Returns the address of the first element of :attr:`self` tensor.
+""",
+)
+
+add_docstr_all(
+    "dequantize",
+    r"""
+dequantize() -> Tensor
+
+Given a quantized Tensor, dequantize it and return the dequantized float Tensor.
+""",
+)
+
+add_docstr_all(
+    "dense_dim",
+    r"""
+dense_dim() -> int
+
+Return the number of dense dimensions in a :ref:`sparse tensor <sparse-docs>` :attr:`self`.
+
+.. note::
+  Returns ``len(self.shape)`` if :attr:`self` is not a sparse tensor.
+
+See also :meth:`Tensor.sparse_dim` and :ref:`hybrid tensors <sparse-hybrid-coo-docs>`.
+""",
+)
+
+add_docstr_all(
+    "diag",
+    r"""
+diag(diagonal=0) -> Tensor
+
+See :func:`torch.diag`
+""",
+)
+
+add_docstr_all(
+    "diag_embed",
+    r"""
+diag_embed(offset=0, dim1=-2, dim2=-1) -> Tensor
+
+See :func:`torch.diag_embed`
+""",
+)
+
+add_docstr_all(
+    "diagflat",
+    r"""
+diagflat(offset=0) -> Tensor
+
+See :func:`torch.diagflat`
+""",
+)
+
+add_docstr_all(
+    "diagonal",
+    r"""
+diagonal(offset=0, dim1=0, dim2=1) -> Tensor
+
+See :func:`torch.diagonal`
+""",
+)
+
+add_docstr_all(
+    "diagonal_scatter",
+    r"""
+diagonal_scatter(src, offset=0, dim1=0, dim2=1) -> Tensor
+
+See :func:`torch.diagonal_scatter`
+""",
+)
+
+add_docstr_all(
+    "as_strided_scatter",
+    r"""
+as_strided_scatter(src, size, stride, storage_offset=None) -> Tensor
+
+See :func:`torch.as_strided_scatter`
+""",
+)
+
+add_docstr_all(
+    "fill_diagonal_",
+    r"""
+fill_diagonal_(fill_value, wrap=False) -> Tensor
+
+Fill the main diagonal of a tensor that has at least 2 dimensions.
+When the tensor has more than 2 dimensions, all dimensions must be of equal length.
+This function modifies the input tensor in-place, and returns the input tensor.
+
+Arguments:
+    fill_value (Scalar): the fill value
+    wrap (bool): whether the diagonal is 'wrapped' after N columns for tall matrices.
+
+Example::
+
+    >>> a = torch.zeros(3, 3)
+    >>> a.fill_diagonal_(5)
+    tensor([[5., 0., 0.],
+            [0., 5., 0.],
+            [0., 0., 5.]])
+    >>> b = torch.zeros(7, 3)
+    >>> b.fill_diagonal_(5)
+    tensor([[5., 0., 0.],
+            [0., 5., 0.],
+            [0., 0., 5.],
+            [0., 0., 0.],
+            [0., 0., 0.],
+            [0., 0., 0.],
+            [0., 0., 0.]])
+    >>> c = torch.zeros(7, 3)
+    >>> c.fill_diagonal_(5, wrap=True)
+    tensor([[5., 0., 0.],
+            [0., 5., 0.],
+            [0., 0., 5.],
+            [0., 0., 0.],
+            [5., 0., 0.],
+            [0., 5., 0.],
+            [0., 0., 5.]])
+
+""",
+)
+
+add_docstr_all(
+    "floor_divide",
+    r"""
+floor_divide(value) -> Tensor
+
+See :func:`torch.floor_divide`
+""",
+)
+
+add_docstr_all(
+    "floor_divide_",
+    r"""
+floor_divide_(value) -> Tensor
+
+In-place version of :meth:`~Tensor.floor_divide`
+""",
+)
+
+add_docstr_all(
+    "diff",
+    r"""
+diff(n=1, dim=-1, prepend=None, append=None) -> Tensor
+
+See :func:`torch.diff`
+""",
+)
+
+add_docstr_all(
+    "digamma",
+    r"""
+digamma() -> Tensor
+
+See :func:`torch.digamma`
+""",
+)
+
+add_docstr_all(
+    "digamma_",
+    r"""
+digamma_() -> Tensor
+
+In-place version of :meth:`~Tensor.digamma`
+""",
+)
+
+add_docstr_all(
+    "dim",
+    r"""
+dim() -> int
+
+Returns the number of dimensions of :attr:`self` tensor.
+""",
+)
+
+add_docstr_all(
+    "dist",
+    r"""
+dist(other, p=2) -> Tensor
+
+See :func:`torch.dist`
+""",
+)
+
+add_docstr_all(
+    "div",
+    r"""
+div(value, *, rounding_mode=None) -> Tensor
+
+See :func:`torch.div`
+""",
+)
+
+add_docstr_all(
+    "div_",
+    r"""
+div_(value, *, rounding_mode=None) -> Tensor
+
+In-place version of :meth:`~Tensor.div`
+""",
+)
+
+add_docstr_all(
+    "divide",
+    r"""
+divide(value, *, rounding_mode=None) -> Tensor
+
+See :func:`torch.divide`
+""",
+)
+
+add_docstr_all(
+    "divide_",
+    r"""
+divide_(value, *, rounding_mode=None) -> Tensor
+
+In-place version of :meth:`~Tensor.divide`
+""",
+)
+
+add_docstr_all(
+    "dot",
+    r"""
+dot(other) -> Tensor
+
+See :func:`torch.dot`
+""",
+)
+
+add_docstr_all(
+    "element_size",
+    r"""
+element_size() -> int
+
+Returns the size in bytes of an individual element.
+
+Example::
+
+    >>> torch.tensor([]).element_size()
+    4
+    >>> torch.tensor([], dtype=torch.uint8).element_size()
+    1
+
+""",
+)
+
+add_docstr_all(
+    "eq",
+    r"""
+eq(other) -> Tensor
+
+See :func:`torch.eq`
+""",
+)
+
+add_docstr_all(
+    "eq_",
+    r"""
+eq_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.eq`
+""",
+)
+
+add_docstr_all(
+    "equal",
+    r"""
+equal(other) -> bool
+
+See :func:`torch.equal`
+""",
+)
+
+add_docstr_all(
+    "erf",
+    r"""
+erf() -> Tensor
+
+See :func:`torch.erf`
+""",
+)
+
+add_docstr_all(
+    "erf_",
+    r"""
+erf_() -> Tensor
+
+In-place version of :meth:`~Tensor.erf`
+""",
+)
+
+add_docstr_all(
+    "erfc",
+    r"""
+erfc() -> Tensor
+
+See :func:`torch.erfc`
+""",
+)
+
+add_docstr_all(
+    "erfc_",
+    r"""
+erfc_() -> Tensor
+
+In-place version of :meth:`~Tensor.erfc`
+""",
+)
+
+add_docstr_all(
+    "erfinv",
+    r"""
+erfinv() -> Tensor
+
+See :func:`torch.erfinv`
+""",
+)
+
+add_docstr_all(
+    "erfinv_",
+    r"""
+erfinv_() -> Tensor
+
+In-place version of :meth:`~Tensor.erfinv`
+""",
+)
+
+add_docstr_all(
+    "exp",
+    r"""
+exp() -> Tensor
+
+See :func:`torch.exp`
+""",
+)
+
+add_docstr_all(
+    "exp_",
+    r"""
+exp_() -> Tensor
+
+In-place version of :meth:`~Tensor.exp`
+""",
+)
+
+add_docstr_all(
+    "exp2",
+    r"""
+exp2() -> Tensor
+
+See :func:`torch.exp2`
+""",
+)
+
+add_docstr_all(
+    "exp2_",
+    r"""
+exp2_() -> Tensor
+
+In-place version of :meth:`~Tensor.exp2`
+""",
+)
+
+add_docstr_all(
+    "expm1",
+    r"""
+expm1() -> Tensor
+
+See :func:`torch.expm1`
+""",
+)
+
+add_docstr_all(
+    "expm1_",
+    r"""
+expm1_() -> Tensor
+
+In-place version of :meth:`~Tensor.expm1`
+""",
+)
+
+add_docstr_all(
+    "exponential_",
+    r"""
+exponential_(lambd=1, *, generator=None) -> Tensor
+
+Fills :attr:`self` tensor with elements drawn from the PDF (probability density function):
+
+.. math::
+
+    f(x) = \lambda e^{-\lambda x}, x > 0
+
+.. note::
+  In probability theory, the exponential distribution is supported on the interval [0, :math:`\infty`)
+  (i.e., :math:`x \geq 0`), implying that zero can be sampled from the exponential distribution.
+  However, :func:`torch.Tensor.exponential_` does not sample zero,
+  which means that its actual support is the interval (0, :math:`\infty`).
+
+  Note that :func:`torch.distributions.exponential.Exponential` is supported on the interval [0, :math:`\infty`) and can sample zero.
+""",
+)
+
+add_docstr_all(
+    "fill_",
+    r"""
+fill_(value) -> Tensor
+
+Fills :attr:`self` tensor with the specified value.
+""",
+)
+
+add_docstr_all(
+    "floor",
+    r"""
+floor() -> Tensor
+
+See :func:`torch.floor`
+""",
+)
+
+add_docstr_all(
+    "flip",
+    r"""
+flip(dims) -> Tensor
+
+See :func:`torch.flip`
+""",
+)
+
+add_docstr_all(
+    "fliplr",
+    r"""
+fliplr() -> Tensor
+
+See :func:`torch.fliplr`
+""",
+)
+
+add_docstr_all(
+    "flipud",
+    r"""
+flipud() -> Tensor
+
+See :func:`torch.flipud`
+""",
+)
+
+add_docstr_all(
+    "roll",
+    r"""
+roll(shifts, dims) -> Tensor
+
+See :func:`torch.roll`
+""",
+)
+
+add_docstr_all(
+    "floor_",
+    r"""
+floor_() -> Tensor
+
+In-place version of :meth:`~Tensor.floor`
+""",
+)
+
+add_docstr_all(
+    "fmod",
+    r"""
+fmod(divisor) -> Tensor
+
+See :func:`torch.fmod`
+""",
+)
+
+add_docstr_all(
+    "fmod_",
+    r"""
+fmod_(divisor) -> Tensor
+
+In-place version of :meth:`~Tensor.fmod`
+""",
+)
+
+add_docstr_all(
+    "frac",
+    r"""
+frac() -> Tensor
+
+See :func:`torch.frac`
+""",
+)
+
+add_docstr_all(
+    "frac_",
+    r"""
+frac_() -> Tensor
+
+In-place version of :meth:`~Tensor.frac`
+""",
+)
+
+add_docstr_all(
+    "frexp",
+    r"""
+frexp() -> (Tensor mantissa, Tensor exponent)
+
+See :func:`torch.frexp`
+""",
+)
+
+add_docstr_all(
+    "flatten",
+    r"""
+flatten(start_dim=0, end_dim=-1) -> Tensor
+
+See :func:`torch.flatten`
+""",
+)
+
+add_docstr_all(
+    "gather",
+    r"""
+gather(dim, index) -> Tensor
+
+See :func:`torch.gather`
+""",
+)
+
+add_docstr_all(
+    "gcd",
+    r"""
+gcd(other) -> Tensor
+
+See :func:`torch.gcd`
+""",
+)
+
+add_docstr_all(
+    "gcd_",
+    r"""
+gcd_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.gcd`
+""",
+)
+
+add_docstr_all(
+    "ge",
+    r"""
+ge(other) -> Tensor
+
+See :func:`torch.ge`.
+""",
+)
+
+add_docstr_all(
+    "ge_",
+    r"""
+ge_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.ge`.
+""",
+)
+
+add_docstr_all(
+    "greater_equal",
+    r"""
+greater_equal(other) -> Tensor
+
+See :func:`torch.greater_equal`.
+""",
+)
+
+add_docstr_all(
+    "greater_equal_",
+    r"""
+greater_equal_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.greater_equal`.
+""",
+)
+
+add_docstr_all(
+    "geometric_",
+    r"""
+geometric_(p, *, generator=None) -> Tensor
+
+Fills :attr:`self` tensor with elements drawn from the geometric distribution:
+
+.. math::
+
+    P(X=k) = (1 - p)^{k - 1} p, k = 1, 2, ...
+
+.. note::
+  For :func:`torch.Tensor.geometric_`, the `k`-th trial is the first success, hence it draws samples in
+  :math:`\{1, 2, \ldots\}`, whereas for :func:`torch.distributions.geometric.Geometric`, the :math:`(k+1)`-th
+  trial is the first success, hence it draws samples in :math:`\{0, 1, \ldots\}`.
+""",
+)
+
+add_docstr_all(
+    "geqrf",
+    r"""
+geqrf() -> (Tensor, Tensor)
+
+See :func:`torch.geqrf`
+""",
+)
+
+add_docstr_all(
+    "ger",
+    r"""
+ger(vec2) -> Tensor
+
+See :func:`torch.ger`
+""",
+)
+
+add_docstr_all(
+    "inner",
+    r"""
+inner(other) -> Tensor
+
+See :func:`torch.inner`.
+""",
+)
+
+add_docstr_all(
+    "outer",
+    r"""
+outer(vec2) -> Tensor
+
+See :func:`torch.outer`.
+""",
+)
+
+add_docstr_all(
+    "hypot",
+    r"""
+hypot(other) -> Tensor
+
+See :func:`torch.hypot`
+""",
+)
+
+add_docstr_all(
+    "hypot_",
+    r"""
+hypot_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.hypot`
+""",
+)
+
+add_docstr_all(
+    "i0",
+    r"""
+i0() -> Tensor
+
+See :func:`torch.i0`
+""",
+)
+
+add_docstr_all(
+    "i0_",
+    r"""
+i0_() -> Tensor
+
+In-place version of :meth:`~Tensor.i0`
+""",
+)
+
+add_docstr_all(
+    "igamma",
+    r"""
+igamma(other) -> Tensor
+
+See :func:`torch.igamma`
+""",
+)
+
+add_docstr_all(
+    "igamma_",
+    r"""
+igamma_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.igamma`
+""",
+)
+
+add_docstr_all(
+    "igammac",
+    r"""
+igammac(other) -> Tensor
+
+See :func:`torch.igammac`
+""",
+)
+
+add_docstr_all(
+    "igammac_",
+    r"""
+igammac_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.igammac`
+""",
+)
+
+add_docstr_all(
+    "indices",
+    r"""
+indices() -> Tensor
+
+Return the indices tensor of a :ref:`sparse COO tensor <sparse-coo-docs>`.
+
+.. warning::
+  Throws an error if :attr:`self` is not a sparse COO tensor.
+
+See also :meth:`Tensor.values`.
+
+.. note::
+  This method can only be called on a coalesced sparse tensor. See
+  :meth:`Tensor.coalesce` for details.
+""",
+)
+
+add_docstr_all(
+    "get_device",
+    r"""
+get_device() -> Device ordinal (Integer)
+
+For CUDA tensors, this function returns the device ordinal of the GPU on which the tensor resides.
+For CPU tensors, this function returns `-1`.
+
+Example::
+
+    >>> x = torch.randn(3, 4, 5, device='cuda:0')
+    >>> x.get_device()
+    0
+    >>> x.cpu().get_device()
+    -1
+""",
+)
+
+add_docstr_all(
+    "values",
+    r"""
+values() -> Tensor
+
+Return the values tensor of a :ref:`sparse COO tensor <sparse-coo-docs>`.
+
+.. warning::
+  Throws an error if :attr:`self` is not a sparse COO tensor.
+
+See also :meth:`Tensor.indices`.
+
+.. note::
+  This method can only be called on a coalesced sparse tensor. See
+  :meth:`Tensor.coalesce` for details.
+""",
+)
+
+add_docstr_all(
+    "gt",
+    r"""
+gt(other) -> Tensor
+
+See :func:`torch.gt`.
+""",
+)
+
+add_docstr_all(
+    "gt_",
+    r"""
+gt_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.gt`.
+""",
+)
+
+add_docstr_all(
+    "greater",
+    r"""
+greater(other) -> Tensor
+
+See :func:`torch.greater`.
+""",
+)
+
+add_docstr_all(
+    "greater_",
+    r"""
+greater_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.greater`.
+""",
+)
+
+add_docstr_all(
+    "has_names",
+    r"""
+Is ``True`` if any of this tensor's dimensions are named. Otherwise, is ``False``.
+""",
+)
+
+add_docstr_all(
+    "hardshrink",
+    r"""
+hardshrink(lambd=0.5) -> Tensor
+
+See :func:`torch.nn.functional.hardshrink`
+""",
+)
+
+add_docstr_all(
+    "heaviside",
+    r"""
+heaviside(values) -> Tensor
+
+See :func:`torch.heaviside`
+""",
+)
+
+add_docstr_all(
+    "heaviside_",
+    r"""
+heaviside_(values) -> Tensor
+
+In-place version of :meth:`~Tensor.heaviside`
+""",
+)
+
+add_docstr_all(
+    "histc",
+    r"""
+histc(bins=100, min=0, max=0) -> Tensor
+
+See :func:`torch.histc`
+""",
+)
+
+add_docstr_all(
+    "histogram",
+    r"""
+histogram(bins, *, range=None, weight=None, density=False) -> (Tensor, Tensor)
+
+See :func:`torch.histogram`
+""",
+)
+
+add_docstr_all(
+    "index_add_",
+    r"""
+index_add_(dim, index, source, *, alpha=1) -> Tensor
+
+Accumulate the elements of :attr:`alpha` times ``source`` into the :attr:`self`
+tensor by adding to the indices in the order given in :attr:`index`. For example,
+if ``dim == 0``, ``index[i] == j``, and ``alpha=-1``, then the ``i``\ th row of
+``source`` is subtracted from the ``j``\ th row of :attr:`self`.
+
+The :attr:`dim`\ th dimension of ``source`` must have the same size as the
+length of :attr:`index` (which must be a vector), and all other dimensions must
+match :attr:`self`, or an error will be raised.
+
+For a 3-D tensor the output is given as::
+
+    self[index[i], :, :] += alpha * src[i, :, :]  # if dim == 0
+    self[:, index[i], :] += alpha * src[:, i, :]  # if dim == 1
+    self[:, :, index[i]] += alpha * src[:, :, i]  # if dim == 2
+
+Note:
+    {forward_reproducibility_note}
+
+Args:
+    dim (int): dimension along which to index
+    index (Tensor): indices of ``source`` to select from,
+            should have dtype either `torch.int64` or `torch.int32`
+    source (Tensor): the tensor containing values to add
+
+Keyword args:
+    alpha (Number): the scalar multiplier for ``source``
+
+Example::
+
+    >>> x = torch.ones(5, 3)
+    >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float)
+    >>> index = torch.tensor([0, 4, 2])
+    >>> x.index_add_(0, index, t)
+    tensor([[  2.,   3.,   4.],
+            [  1.,   1.,   1.],
+            [  8.,   9.,  10.],
+            [  1.,   1.,   1.],
+            [  5.,   6.,   7.]])
+    >>> x.index_add_(0, index, t, alpha=-1)
+    tensor([[  1.,   1.,   1.],
+            [  1.,   1.,   1.],
+            [  1.,   1.,   1.],
+            [  1.,   1.,   1.],
+            [  1.,   1.,   1.]])
+""".format(
+        **reproducibility_notes
+    ),
+)
+
+add_docstr_all(
+    "index_copy_",
+    r"""
+index_copy_(dim, index, tensor) -> Tensor
+
+Copies the elements of :attr:`tensor` into the :attr:`self` tensor by selecting
+the indices in the order given in :attr:`index`. For example, if ``dim == 0``
+and ``index[i] == j``, then the ``i``\ th row of :attr:`tensor` is copied to the
+``j``\ th row of :attr:`self`.
+
+The :attr:`dim`\ th dimension of :attr:`tensor` must have the same size as the
+length of :attr:`index` (which must be a vector), and all other dimensions must
+match :attr:`self`, or an error will be raised.
+
+.. note::
+    If :attr:`index` contains duplicate entries, multiple elements from
+    :attr:`tensor` will be copied to the same index of :attr:`self`. The result
+    is nondeterministic since it depends on which copy occurs last.
+
+Args:
+    dim (int): dimension along which to index
+    index (LongTensor): indices of :attr:`tensor` to select from
+    tensor (Tensor): the tensor containing values to copy
+
+Example::
+
+    >>> x = torch.zeros(5, 3)
+    >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float)
+    >>> index = torch.tensor([0, 4, 2])
+    >>> x.index_copy_(0, index, t)
+    tensor([[ 1.,  2.,  3.],
+            [ 0.,  0.,  0.],
+            [ 7.,  8.,  9.],
+            [ 0.,  0.,  0.],
+            [ 4.,  5.,  6.]])
+""",
+)
+
+add_docstr_all(
+    "index_fill_",
+    r"""
+index_fill_(dim, index, value) -> Tensor
+
+Fills the elements of the :attr:`self` tensor with value :attr:`value` by
+selecting the indices in the order given in :attr:`index`.
+
+Args:
+    dim (int): dimension along which to index
+    index (LongTensor): indices of :attr:`self` tensor to fill in
+    value (float): the value to fill with
+
+Example::
+    >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float)
+    >>> index = torch.tensor([0, 2])
+    >>> x.index_fill_(1, index, -1)
+    tensor([[-1.,  2., -1.],
+            [-1.,  5., -1.],
+            [-1.,  8., -1.]])
+""",
+)
+
+add_docstr_all(
+    "index_put_",
+    r"""
+index_put_(indices, values, accumulate=False) -> Tensor
+
+Puts values from the tensor :attr:`values` into the tensor :attr:`self` using
+the indices specified in :attr:`indices` (which is a tuple of Tensors). The
+expression ``tensor.index_put_(indices, values)`` is equivalent to
+``tensor[indices] = values``. Returns :attr:`self`.
+
+If :attr:`accumulate` is ``True``, the elements in :attr:`values` are added to
+:attr:`self`. If accumulate is ``False``, the behavior is undefined if indices
+contain duplicate elements.
+
+Args:
+    indices (tuple of LongTensor): tensors used to index into `self`.
+    values (Tensor): tensor of same dtype as `self`.
+    accumulate (bool): whether to accumulate into self
+""",
+)
+
+add_docstr_all(
+    "index_put",
+    r"""
+index_put(indices, values, accumulate=False) -> Tensor
+
+Out-of-place version of :meth:`~Tensor.index_put_`.
+""",
+)
+
+add_docstr_all(
+    "index_reduce_",
+    r"""
+index_reduce_(dim, index, source, reduce, *, include_self=True) -> Tensor
+
+Accumulate the elements of ``source`` into the :attr:`self`
+tensor by accumulating to the indices in the order given in :attr:`index`
+using the reduction given by the ``reduce`` argument. For example, if ``dim == 0``,
+``index[i] == j``, ``reduce == prod`` and ``include_self == True`` then the ``i``\ th
+row of ``source`` is multiplied by the ``j``\ th row of :attr:`self`. If
+:obj:`include_self=True`, the values in the :attr:`self` tensor are included
+in the reduction, otherwise, rows in the :attr:`self` tensor that are accumulated
+to are treated as if they were filled with the reduction identities.
+
+The :attr:`dim`\ th dimension of ``source`` must have the same size as the
+length of :attr:`index` (which must be a vector), and all other dimensions must
+match :attr:`self`, or an error will be raised.
+
+For a 3-D tensor with :obj:`reduce="prod"` and :obj:`include_self=True` the
+output is given as::
+
+    self[index[i], :, :] *= src[i, :, :]  # if dim == 0
+    self[:, index[i], :] *= src[:, i, :]  # if dim == 1
+    self[:, :, index[i]] *= src[:, :, i]  # if dim == 2
+
+Note:
+    {forward_reproducibility_note}
+
+.. note::
+
+    This function only supports floating point tensors.
+
+.. warning::
+
+    This function is in beta and may change in the near future.
+
+Args:
+    dim (int): dimension along which to index
+    index (Tensor): indices of ``source`` to select from,
+        should have dtype either `torch.int64` or `torch.int32`
+    source (FloatTensor): the tensor containing values to accumulate
+    reduce (str): the reduction operation to apply
+        (:obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`)
+
+Keyword args:
+    include_self (bool): whether the elements from the ``self`` tensor are
+        included in the reduction
+
+Example::
+
+    >>> x = torch.empty(5, 3).fill_(2)
+    >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=torch.float)
+    >>> index = torch.tensor([0, 4, 2, 0])
+    >>> x.index_reduce_(0, index, t, 'prod')
+    tensor([[20., 44., 72.],
+            [ 2.,  2.,  2.],
+            [14., 16., 18.],
+            [ 2.,  2.,  2.],
+            [ 8., 10., 12.]])
+    >>> x = torch.empty(5, 3).fill_(2)
+    >>> x.index_reduce_(0, index, t, 'prod', include_self=False)
+    tensor([[10., 22., 36.],
+            [ 2.,  2.,  2.],
+            [ 7.,  8.,  9.],
+            [ 2.,  2.,  2.],
+            [ 4.,  5.,  6.]])
+""".format(
+        **reproducibility_notes
+    ),
+)
+
+add_docstr_all(
+    "index_select",
+    r"""
+index_select(dim, index) -> Tensor
+
+See :func:`torch.index_select`
+""",
+)
+
+add_docstr_all(
+    "sparse_mask",
+    r"""
+sparse_mask(mask) -> Tensor
+
+Returns a new :ref:`sparse tensor <sparse-docs>` with values from a
+strided tensor :attr:`self` filtered by the indices of the sparse
+tensor :attr:`mask`. The values of :attr:`mask` sparse tensor are
+ignored. :attr:`self` and :attr:`mask` tensors must have the same
+shape.
+
+.. note::
+
+  The returned sparse tensor might contain duplicate values if :attr:`mask`
+  is not coalesced. It is therefore advisable to pass ``mask.coalesce()``
+  if such behavior is not desired.
+
+.. note::
+
+  The returned sparse tensor has the same indices as the sparse tensor
+  :attr:`mask`, even when the corresponding values in :attr:`self` are
+  zeros.
+
+Args:
+    mask (Tensor): a sparse tensor whose indices are used as a filter
+
+Example::
+
+    >>> nse = 5
+    >>> dims = (5, 5, 2, 2)
+    >>> I = torch.cat([torch.randint(0, dims[0], size=(nse,)),
+    ...                torch.randint(0, dims[1], size=(nse,))], 0).reshape(2, nse)
+    >>> V = torch.randn(nse, dims[2], dims[3])
+    >>> S = torch.sparse_coo_tensor(I, V, dims).coalesce()
+    >>> D = torch.randn(dims)
+    >>> D.sparse_mask(S)
+    tensor(indices=tensor([[0, 0, 0, 2],
+                           [0, 1, 4, 3]]),
+           values=tensor([[[ 1.6550,  0.2397],
+                           [-0.1611, -0.0779]],
+
+                          [[ 0.2326, -1.0558],
+                           [ 1.4711,  1.9678]],
+
+                          [[-0.5138, -0.0411],
+                           [ 1.9417,  0.5158]],
+
+                          [[ 0.0793,  0.0036],
+                           [-0.2569, -0.1055]]]),
+           size=(5, 5, 2, 2), nnz=4, layout=torch.sparse_coo)
+""",
+)
+
+add_docstr_all(
+    "inverse",
+    r"""
+inverse() -> Tensor
+
+See :func:`torch.inverse`
+""",
+)
+
+add_docstr_all(
+    "isnan",
+    r"""
+isnan() -> Tensor
+
+See :func:`torch.isnan`
+""",
+)
+
+add_docstr_all(
+    "isinf",
+    r"""
+isinf() -> Tensor
+
+See :func:`torch.isinf`
+""",
+)
+
+add_docstr_all(
+    "isposinf",
+    r"""
+isposinf() -> Tensor
+
+See :func:`torch.isposinf`
+""",
+)
+
+add_docstr_all(
+    "isneginf",
+    r"""
+isneginf() -> Tensor
+
+See :func:`torch.isneginf`
+""",
+)
+
+add_docstr_all(
+    "isfinite",
+    r"""
+isfinite() -> Tensor
+
+See :func:`torch.isfinite`
+""",
+)
+
+add_docstr_all(
+    "isclose",
+    r"""
+isclose(other, rtol=1e-05, atol=1e-08, equal_nan=False) -> Tensor
+
+See :func:`torch.isclose`
+""",
+)
+
+add_docstr_all(
+    "isreal",
+    r"""
+isreal() -> Tensor
+
+See :func:`torch.isreal`
+""",
+)
+
+add_docstr_all(
+    "is_coalesced",
+    r"""
+is_coalesced() -> bool
+
+Returns ``True`` if :attr:`self` is a :ref:`sparse COO tensor
+<sparse-coo-docs>` that is coalesced, ``False`` otherwise.
+
+.. warning::
+  Throws an error if :attr:`self` is not a sparse COO tensor.
+
+See :meth:`coalesce` and :ref:`uncoalesced tensors <sparse-uncoalesced-coo-docs>`.
+""",
+)
+
+add_docstr_all(
+    "is_contiguous",
+    r"""
+is_contiguous(memory_format=torch.contiguous_format) -> bool
+
+Returns True if :attr:`self` tensor is contiguous in memory in the order specified
+by memory format.
+
+Args:
+    memory_format (:class:`torch.memory_format`, optional): Specifies memory allocation
+        order. Default: ``torch.contiguous_format``.
+""",
+)
+
+add_docstr_all(
+    "is_pinned",
+    r"""
+Returns true if this tensor resides in pinned memory.
+""",
+)
+
+add_docstr_all(
+    "is_floating_point",
+    r"""
+is_floating_point() -> bool
+
+Returns True if the data type of :attr:`self` is a floating point data type.
+""",
+)
+
+add_docstr_all(
+    "is_complex",
+    r"""
+is_complex() -> bool
+
+Returns True if the data type of :attr:`self` is a complex data type.
+""",
+)
+
+add_docstr_all(
+    "is_inference",
+    r"""
+is_inference() -> bool
+
+See :func:`torch.is_inference`
+""",
+)
+
+add_docstr_all(
+    "is_conj",
+    r"""
+is_conj() -> bool
+
+Returns True if the conjugate bit of :attr:`self` is set to true.
+""",
+)
+
+add_docstr_all(
+    "is_neg",
+    r"""
+is_neg() -> bool
+
+Returns True if the negative bit of :attr:`self` is set to true.
+""",
+)
+
+add_docstr_all(
+    "is_signed",
+    r"""
+is_signed() -> bool
+
+Returns True if the data type of :attr:`self` is a signed data type.
+""",
+)
+
+add_docstr_all(
+    "is_set_to",
+    r"""
+is_set_to(tensor) -> bool
+
+Returns True if both tensors are pointing to the exact same memory (same
+storage, offset, size and stride).
+""",
+)
+
+add_docstr_all(
+    "item",
+    r"""
+item() -> number
+
+Returns the value of this tensor as a standard Python number. This only works
+for tensors with one element. For other cases, see :meth:`~Tensor.tolist`.
+
+This operation is not differentiable.
+
+Example::
+
+    >>> x = torch.tensor([1.0])
+    >>> x.item()
+    1.0
+
+""",
+)
+
+add_docstr_all(
+    "kron",
+    r"""
+kron(other) -> Tensor
+
+See :func:`torch.kron`
+""",
+)
+
+add_docstr_all(
+    "kthvalue",
+    r"""
+kthvalue(k, dim=None, keepdim=False) -> (Tensor, LongTensor)
+
+See :func:`torch.kthvalue`
+""",
+)
+
+add_docstr_all(
+    "ldexp",
+    r"""
+ldexp(other) -> Tensor
+
+See :func:`torch.ldexp`
+""",
+)
+
+add_docstr_all(
+    "ldexp_",
+    r"""
+ldexp_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.ldexp`
+""",
+)
+
+add_docstr_all(
+    "lcm",
+    r"""
+lcm(other) -> Tensor
+
+See :func:`torch.lcm`
+""",
+)
+
+add_docstr_all(
+    "lcm_",
+    r"""
+lcm_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.lcm`
+""",
+)
+
+add_docstr_all(
+    "le",
+    r"""
+le(other) -> Tensor
+
+See :func:`torch.le`.
+""",
+)
+
+add_docstr_all(
+    "le_",
+    r"""
+le_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.le`.
+""",
+)
+
+add_docstr_all(
+    "less_equal",
+    r"""
+less_equal(other) -> Tensor
+
+See :func:`torch.less_equal`.
+""",
+)
+
+add_docstr_all(
+    "less_equal_",
+    r"""
+less_equal_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.less_equal`.
+""",
+)
+
+add_docstr_all(
+    "lerp",
+    r"""
+lerp(end, weight) -> Tensor
+
+See :func:`torch.lerp`
+""",
+)
+
+add_docstr_all(
+    "lerp_",
+    r"""
+lerp_(end, weight) -> Tensor
+
+In-place version of :meth:`~Tensor.lerp`
+""",
+)
+
+add_docstr_all(
+    "lgamma",
+    r"""
+lgamma() -> Tensor
+
+See :func:`torch.lgamma`
+""",
+)
+
+add_docstr_all(
+    "lgamma_",
+    r"""
+lgamma_() -> Tensor
+
+In-place version of :meth:`~Tensor.lgamma`
+""",
+)
+
+add_docstr_all(
+    "log",
+    r"""
+log() -> Tensor
+
+See :func:`torch.log`
+""",
+)
+
+add_docstr_all(
+    "log_",
+    r"""
+log_() -> Tensor
+
+In-place version of :meth:`~Tensor.log`
+""",
+)
+
+add_docstr_all(
+    "log10",
+    r"""
+log10() -> Tensor
+
+See :func:`torch.log10`
+""",
+)
+
+add_docstr_all(
+    "log10_",
+    r"""
+log10_() -> Tensor
+
+In-place version of :meth:`~Tensor.log10`
+""",
+)
+
+add_docstr_all(
+    "log1p",
+    r"""
+log1p() -> Tensor
+
+See :func:`torch.log1p`
+""",
+)
+
+add_docstr_all(
+    "log1p_",
+    r"""
+log1p_() -> Tensor
+
+In-place version of :meth:`~Tensor.log1p`
+""",
+)
+
+add_docstr_all(
+    "log2",
+    r"""
+log2() -> Tensor
+
+See :func:`torch.log2`
+""",
+)
+
+add_docstr_all(
+    "log2_",
+    r"""
+log2_() -> Tensor
+
+In-place version of :meth:`~Tensor.log2`
+""",
+)
+
+add_docstr_all(
+    "logaddexp",
+    r"""
+logaddexp(other) -> Tensor
+
+See :func:`torch.logaddexp`
+""",
+)
+
+add_docstr_all(
+    "logaddexp2",
+    r"""
+logaddexp2(other) -> Tensor
+
+See :func:`torch.logaddexp2`
+""",
+)
+
+add_docstr_all(
+    "log_normal_",
+    r"""
+log_normal_(mean=1, std=2, *, generator=None)
+
+Fills :attr:`self` tensor with numbers sampled from the log-normal distribution
+parameterized by the given mean :math:`\mu` and standard deviation
+:math:`\sigma`. Note that :attr:`mean` and :attr:`std` are the mean and
+standard deviation of the underlying normal distribution, and not of the
+returned distribution:
+
+.. math::
+
+    f(x) = \dfrac{1}{x \sigma \sqrt{2\pi}}\ e^{-\frac{(\ln x - \mu)^2}{2\sigma^2}}
+""",
+)
+
+add_docstr_all(
+    "logsumexp",
+    r"""
+logsumexp(dim, keepdim=False) -> Tensor
+
+See :func:`torch.logsumexp`
+""",
+)
+
+add_docstr_all(
+    "lt",
+    r"""
+lt(other) -> Tensor
+
+See :func:`torch.lt`.
+""",
+)
+
+add_docstr_all(
+    "lt_",
+    r"""
+lt_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.lt`.
+""",
+)
+
+add_docstr_all(
+    "less",
+    r"""
+less(other) -> Tensor
+
+See :func:`torch.less`.
+""",
+)
+
+add_docstr_all(
+    "less_",
+    r"""
+less_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.less`.
+""",
+)
+
+add_docstr_all(
+    "lu_solve",
+    r"""
+lu_solve(LU_data, LU_pivots) -> Tensor
+
+See :func:`torch.lu_solve`
+""",
+)
+
+add_docstr_all(
+    "map_",
+    r"""
+map_(tensor, callable)
+
+Applies :attr:`callable` for each element in :attr:`self` tensor and the given
+:attr:`tensor` and stores the results in :attr:`self` tensor. :attr:`self` tensor and
+the given :attr:`tensor` must be :ref:`broadcastable <broadcasting-semantics>`.
+
+The :attr:`callable` should have the signature::
+
+    def callable(a, b) -> number
+""",
+)
+
+add_docstr_all(
+    "masked_scatter_",
+    r"""
+masked_scatter_(mask, source)
+
+Copies elements from :attr:`source` into :attr:`self` tensor at positions where
+the :attr:`mask` is True. Elements from :attr:`source` are copied into :attr:`self`
+starting at position 0 of :attr:`source` and continuing in order one-by-one for each
+occurrence of :attr:`mask` being True.
+The shape of :attr:`mask` must be :ref:`broadcastable <broadcasting-semantics>`
+with the shape of the underlying tensor. The :attr:`source` should have at least
+as many elements as the number of ones in :attr:`mask`.
+
+Args:
+    mask (BoolTensor): the boolean mask
+    source (Tensor): the tensor to copy from
+
+.. note::
+
+    The :attr:`mask` operates on the :attr:`self` tensor, not on the given
+    :attr:`source` tensor.
+
+Example:
+
+    >>> self = torch.tensor([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])
+    >>> mask = torch.tensor([[0, 0, 0, 1, 1], [1, 1, 0, 1, 1]])
+    >>> source = torch.tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])
+    >>> self.masked_scatter_(mask, source)
+    tensor([[0, 0, 0, 0, 1],
+            [2, 3, 0, 4, 5]])
+
+""",
+)
+
+add_docstr_all(
+    "masked_fill_",
+    r"""
+masked_fill_(mask, value)
+
+Fills elements of :attr:`self` tensor with :attr:`value` where :attr:`mask` is
+True. The shape of :attr:`mask` must be
+:ref:`broadcastable <broadcasting-semantics>` with the shape of the underlying
+tensor.
+
+Args:
+    mask (BoolTensor): the boolean mask
+    value (float): the value to fill in with
+""",
+)
+
+add_docstr_all(
+    "masked_select",
+    r"""
+masked_select(mask) -> Tensor
+
+See :func:`torch.masked_select`
+""",
+)
+
+add_docstr_all(
+    "matrix_power",
+    r"""
+matrix_power(n) -> Tensor
+
+.. note:: :meth:`~Tensor.matrix_power` is deprecated, use :func:`torch.linalg.matrix_power` instead.
+
+Alias for :func:`torch.linalg.matrix_power`
+""",
+)
+
+add_docstr_all(
+    "matrix_exp",
+    r"""
+matrix_exp() -> Tensor
+
+See :func:`torch.matrix_exp`
+""",
+)
+
+add_docstr_all(
+    "max",
+    r"""
+max(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor)
+
+See :func:`torch.max`
+""",
+)
+
+add_docstr_all(
+    "amax",
+    r"""
+amax(dim=None, keepdim=False) -> Tensor
+
+See :func:`torch.amax`
+""",
+)
+
+add_docstr_all(
+    "maximum",
+    r"""
+maximum(other) -> Tensor
+
+See :func:`torch.maximum`
+""",
+)
+
+add_docstr_all(
+    "fmax",
+    r"""
+fmax(other) -> Tensor
+
+See :func:`torch.fmax`
+""",
+)
+
+add_docstr_all(
+    "argmax",
+    r"""
+argmax(dim=None, keepdim=False) -> LongTensor
+
+See :func:`torch.argmax`
+""",
+)
+
+add_docstr_all(
+    "argwhere",
+    r"""
+argwhere() -> Tensor
+
+See :func:`torch.argwhere`
+""",
+)
+
+add_docstr_all(
+    "mean",
+    r"""
+mean(dim=None, keepdim=False, *, dtype=None) -> Tensor
+
+See :func:`torch.mean`
+""",
+)
+
+add_docstr_all(
+    "nanmean",
+    r"""
+nanmean(dim=None, keepdim=False, *, dtype=None) -> Tensor
+
+See :func:`torch.nanmean`
+""",
+)
+
+add_docstr_all(
+    "median",
+    r"""
+median(dim=None, keepdim=False) -> (Tensor, LongTensor)
+
+See :func:`torch.median`
+""",
+)
+
+add_docstr_all(
+    "nanmedian",
+    r"""
+nanmedian(dim=None, keepdim=False) -> (Tensor, LongTensor)
+
+See :func:`torch.nanmedian`
+""",
+)
+
+add_docstr_all(
+    "min",
+    r"""
+min(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor)
+
+See :func:`torch.min`
+""",
+)
+
+add_docstr_all(
+    "amin",
+    r"""
+amin(dim=None, keepdim=False) -> Tensor
+
+See :func:`torch.amin`
+""",
+)
+
+add_docstr_all(
+    "minimum",
+    r"""
+minimum(other) -> Tensor
+
+See :func:`torch.minimum`
+""",
+)
+
+add_docstr_all(
+    "aminmax",
+    r"""
+aminmax(*, dim=None, keepdim=False) -> (Tensor min, Tensor max)
+
+See :func:`torch.aminmax`
+""",
+)
+
+add_docstr_all(
+    "fmin",
+    r"""
+fmin(other) -> Tensor
+
+See :func:`torch.fmin`
+""",
+)
+
+add_docstr_all(
+    "argmin",
+    r"""
+argmin(dim=None, keepdim=False) -> LongTensor
+
+See :func:`torch.argmin`
+""",
+)
+
+add_docstr_all(
+    "mm",
+    r"""
+mm(mat2) -> Tensor
+
+See :func:`torch.mm`
+""",
+)
+
+add_docstr_all(
+    "mode",
+    r"""
+mode(dim=None, keepdim=False) -> (Tensor, LongTensor)
+
+See :func:`torch.mode`
+""",
+)
+
+add_docstr_all(
+    "movedim",
+    r"""
+movedim(source, destination) -> Tensor
+
+See :func:`torch.movedim`
+""",
+)
+
+add_docstr_all(
+    "moveaxis",
+    r"""
+moveaxis(source, destination) -> Tensor
+
+See :func:`torch.moveaxis`
+""",
+)
+
+add_docstr_all(
+    "mul",
+    r"""
+mul(value) -> Tensor
+
+See :func:`torch.mul`.
+""",
+)
+
+add_docstr_all(
+    "mul_",
+    r"""
+mul_(value) -> Tensor
+
+In-place version of :meth:`~Tensor.mul`.
+""",
+)
+
+add_docstr_all(
+    "multiply",
+    r"""
+multiply(value) -> Tensor
+
+See :func:`torch.multiply`.
+""",
+)
+
+add_docstr_all(
+    "multiply_",
+    r"""
+multiply_(value) -> Tensor
+
+In-place version of :meth:`~Tensor.multiply`.
+""",
+)
+
+add_docstr_all(
+    "multinomial",
+    r"""
+multinomial(num_samples, replacement=False, *, generator=None) -> Tensor
+
+See :func:`torch.multinomial`
+""",
+)
+
+add_docstr_all(
+    "mv",
+    r"""
+mv(vec) -> Tensor
+
+See :func:`torch.mv`
+""",
+)
+
+add_docstr_all(
+    "mvlgamma",
+    r"""
+mvlgamma(p) -> Tensor
+
+See :func:`torch.mvlgamma`
+""",
+)
+
+add_docstr_all(
+    "mvlgamma_",
+    r"""
+mvlgamma_(p) -> Tensor
+
+In-place version of :meth:`~Tensor.mvlgamma`
+""",
+)
+
+add_docstr_all(
+    "narrow",
+    r"""
+narrow(dimension, start, length) -> Tensor
+
+See :func:`torch.narrow`.
+""",
+)
+
+add_docstr_all(
+    "narrow_copy",
+    r"""
+narrow_copy(dimension, start, length) -> Tensor
+
+See :func:`torch.narrow_copy`.
+""",
+)
+
+add_docstr_all(
+    "ndimension",
+    r"""
+ndimension() -> int
+
+Alias for :meth:`~Tensor.dim`
+""",
+)
+
+add_docstr_all(
+    "nan_to_num",
+    r"""
+nan_to_num(nan=0.0, posinf=None, neginf=None) -> Tensor
+
+See :func:`torch.nan_to_num`.
+""",
+)
+
+add_docstr_all(
+    "nan_to_num_",
+    r"""
+nan_to_num_(nan=0.0, posinf=None, neginf=None) -> Tensor
+
+In-place version of :meth:`~Tensor.nan_to_num`.
+""",
+)
+
+add_docstr_all(
+    "ne",
+    r"""
+ne(other) -> Tensor
+
+See :func:`torch.ne`.
+""",
+)
+
+add_docstr_all(
+    "ne_",
+    r"""
+ne_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.ne`.
+""",
+)
+
+add_docstr_all(
+    "not_equal",
+    r"""
+not_equal(other) -> Tensor
+
+See :func:`torch.not_equal`.
+""",
+)
+
+add_docstr_all(
+    "not_equal_",
+    r"""
+not_equal_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.not_equal`.
+""",
+)
+
+add_docstr_all(
+    "neg",
+    r"""
+neg() -> Tensor
+
+See :func:`torch.neg`
+""",
+)
+
+add_docstr_all(
+    "negative",
+    r"""
+negative() -> Tensor
+
+See :func:`torch.negative`
+""",
+)
+
+add_docstr_all(
+    "neg_",
+    r"""
+neg_() -> Tensor
+
+In-place version of :meth:`~Tensor.neg`
+""",
+)
+
+add_docstr_all(
+    "negative_",
+    r"""
+negative_() -> Tensor
+
+In-place version of :meth:`~Tensor.negative`
+""",
+)
+
+add_docstr_all(
+    "nelement",
+    r"""
+nelement() -> int
+
+Alias for :meth:`~Tensor.numel`
+""",
+)
+
+add_docstr_all(
+    "nextafter",
+    r"""
+nextafter(other) -> Tensor
+See :func:`torch.nextafter`
+""",
+)
+
+add_docstr_all(
+    "nextafter_",
+    r"""
+nextafter_(other) -> Tensor
+In-place version of :meth:`~Tensor.nextafter`
+""",
+)
+
+add_docstr_all(
+    "nonzero",
+    r"""
+nonzero() -> LongTensor
+
+See :func:`torch.nonzero`
+""",
+)
+
+add_docstr_all(
+    "nonzero_static",
+    r"""
+nonzero_static(input, *, size, fill_value=-1) -> Tensor
+
+Returns a 2-D tensor where each row is the index for a non-zero value.
+The returned Tensor has the same `torch.dtype` as `torch.nonzero()`.
+
+Args:
+    input (Tensor): the input tensor to count non-zero elements.
+
+Keyword args:
+    size (int): the size of non-zero elements expected to be included in the out
+        tensor. Pad the out tensor with `fill_value` if the `size` is larger
+        than total number of non-zero elements, truncate out tensor if `size`
+        is smaller. The size must be a non-negative integer.
+    fill_value (int): the value to fill the output tensor with when `size` is larger
+        than the total number of non-zero elements. Default is `-1` to represent
+        invalid index.
+
+Example:
+
+    # Example 1: Padding
+    >>> input_tensor = torch.tensor([[1, 0], [3, 2]])
+    >>> static_size = 4
+    >>> t = torch.nonzero_static(input_tensor, size = static_size)
+    tensor([[  0,   0],
+            [  1,   0],
+            [  1,   1],
+            [  -1, -1]], dtype=torch.int64)
+
+    # Example 2: Truncating
+    >>> input_tensor = torch.tensor([[1, 0], [3, 2]])
+    >>> static_size = 2
+    >>> t = torch.nonzero_static(input_tensor, size = static_size)
+    tensor([[  0,   0],
+            [  1,   0]], dtype=torch.int64)
+
+    # Example 3: 0 size
+    >>> input_tensor = torch.tensor([10])
+    >>> static_size = 0
+    >>> t = torch.nonzero_static(input_tensor, size = static_size)
+    tensor([], size=(0, 1), dtype=torch.int64)
+
+    # Example 4: 0 rank input
+    >>> input_tensor = torch.tensor(10)
+    >>> static_size = 2
+    >>> t = torch.nonzero_static(input_tensor, size = static_size)
+    tensor([], size=(2, 0), dtype=torch.int64)
+""",
+)
+
+add_docstr_all(
+    "norm",
+    r"""
+norm(p=2, dim=None, keepdim=False) -> Tensor
+
+See :func:`torch.norm`
+""",
+)
+
+add_docstr_all(
+    "normal_",
+    r"""
+normal_(mean=0, std=1, *, generator=None) -> Tensor
+
+Fills :attr:`self` tensor with elements sampled from the normal distribution
+parameterized by :attr:`mean` and :attr:`std`.
+""",
+)
+
+add_docstr_all(
+    "numel",
+    r"""
+numel() -> int
+
+See :func:`torch.numel`
+""",
+)
+
+add_docstr_all(
+    "numpy",
+    r"""
+numpy(*, force=False) -> numpy.ndarray
+
+Returns the tensor as a NumPy :class:`ndarray`.
+
+If :attr:`force` is ``False`` (the default), the conversion
+is performed only if the tensor is on the CPU, does not require grad,
+does not have its conjugate bit set, and is a dtype and layout that
+NumPy supports. The returned ndarray and the tensor will share their
+storage, so changes to the tensor will be reflected in the ndarray
+and vice versa.
+
+If :attr:`force` is ``True`` this is equivalent to
+calling ``t.detach().cpu().resolve_conj().resolve_neg().numpy()``.
+If the tensor isn't on the CPU or the conjugate or negative bit is set,
+the tensor won't share its storage with the returned ndarray.
+Setting :attr:`force` to ``True`` can be a useful shorthand.
+
+Args:
+    force (bool): if ``True``, the ndarray may be a copy of the tensor
+               instead of always sharing memory, defaults to ``False``.
+""",
+)
+
+add_docstr_all(
+    "orgqr",
+    r"""
+orgqr(input2) -> Tensor
+
+See :func:`torch.orgqr`
+""",
+)
+
+add_docstr_all(
+    "ormqr",
+    r"""
+ormqr(input2, input3, left=True, transpose=False) -> Tensor
+
+See :func:`torch.ormqr`
+""",
+)
+
+add_docstr_all(
+    "permute",
+    r"""
+permute(*dims) -> Tensor
+
+See :func:`torch.permute`
+""",
+)
+
+add_docstr_all(
+    "polygamma",
+    r"""
+polygamma(n) -> Tensor
+
+See :func:`torch.polygamma`
+""",
+)
+
+add_docstr_all(
+    "polygamma_",
+    r"""
+polygamma_(n) -> Tensor
+
+In-place version of :meth:`~Tensor.polygamma`
+""",
+)
+
+add_docstr_all(
+    "positive",
+    r"""
+positive() -> Tensor
+
+See :func:`torch.positive`
+""",
+)
+
+add_docstr_all(
+    "pow",
+    r"""
+pow(exponent) -> Tensor
+
+See :func:`torch.pow`
+""",
+)
+
+add_docstr_all(
+    "pow_",
+    r"""
+pow_(exponent) -> Tensor
+
+In-place version of :meth:`~Tensor.pow`
+""",
+)
+
+add_docstr_all(
+    "float_power",
+    r"""
+float_power(exponent) -> Tensor
+
+See :func:`torch.float_power`
+""",
+)
+
+add_docstr_all(
+    "float_power_",
+    r"""
+float_power_(exponent) -> Tensor
+
+In-place version of :meth:`~Tensor.float_power`
+""",
+)
+
+add_docstr_all(
+    "prod",
+    r"""
+prod(dim=None, keepdim=False, dtype=None) -> Tensor
+
+See :func:`torch.prod`
+""",
+)
+
+add_docstr_all(
+    "put_",
+    r"""
+put_(index, source, accumulate=False) -> Tensor
+
+Copies the elements from :attr:`source` into the positions specified by
+:attr:`index`. For the purpose of indexing, the :attr:`self` tensor is treated as if
+it were a 1-D tensor.
+
+:attr:`index` and :attr:`source` need to have the same number of elements, but not necessarily
+the same shape.
+
+If :attr:`accumulate` is ``True``, the elements in :attr:`source` are added to
+:attr:`self`. If accumulate is ``False``, the behavior is undefined if :attr:`index`
+contains duplicate elements.
+
+Args:
+    index (LongTensor): the indices into self
+    source (Tensor): the tensor containing values to copy from
+    accumulate (bool): whether to accumulate into self
+
+Example::
+
+    >>> src = torch.tensor([[4, 3, 5],
+    ...                     [6, 7, 8]])
+    >>> src.put_(torch.tensor([1, 3]), torch.tensor([9, 10]))
+    tensor([[  4,   9,   5],
+            [ 10,   7,   8]])
+""",
+)
+
+add_docstr_all(
+    "put",
+    r"""
+put(input, index, source, accumulate=False) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.put_`.
+`input` corresponds to `self` in :meth:`torch.Tensor.put_`.
+""",
+)
+
+add_docstr_all(
+    "qr",
+    r"""
+qr(some=True) -> (Tensor, Tensor)
+
+See :func:`torch.qr`
+""",
+)
+
+add_docstr_all(
+    "qscheme",
+    r"""
+qscheme() -> torch.qscheme
+
+Returns the quantization scheme of a given QTensor.
+""",
+)
+
+add_docstr_all(
+    "quantile",
+    r"""
+quantile(q, dim=None, keepdim=False, *, interpolation='linear') -> Tensor
+
+See :func:`torch.quantile`
+""",
+)
+
+add_docstr_all(
+    "nanquantile",
+    r"""
+nanquantile(q, dim=None, keepdim=False, *, interpolation='linear') -> Tensor
+
+See :func:`torch.nanquantile`
+""",
+)
+
+add_docstr_all(
+    "q_scale",
+    r"""
+q_scale() -> float
+
+Given a Tensor quantized by linear(affine) quantization,
+returns the scale of the underlying quantizer().
+""",
+)
+
+add_docstr_all(
+    "q_zero_point",
+    r"""
+q_zero_point() -> int
+
+Given a Tensor quantized by linear(affine) quantization,
+returns the zero_point of the underlying quantizer().
+""",
+)
+
+add_docstr_all(
+    "q_per_channel_scales",
+    r"""
+q_per_channel_scales() -> Tensor
+
+Given a Tensor quantized by linear (affine) per-channel quantization,
+returns a Tensor of scales of the underlying quantizer. It has the number of
+elements that matches the corresponding dimensions (from q_per_channel_axis) of
+the tensor.
+""",
+)
+
+add_docstr_all(
+    "q_per_channel_zero_points",
+    r"""
+q_per_channel_zero_points() -> Tensor
+
+Given a Tensor quantized by linear (affine) per-channel quantization,
+returns a tensor of zero_points of the underlying quantizer. It has the number of
+elements that matches the corresponding dimensions (from q_per_channel_axis) of
+the tensor.
+""",
+)
+
+add_docstr_all(
+    "q_per_channel_axis",
+    r"""
+q_per_channel_axis() -> int
+
+Given a Tensor quantized by linear (affine) per-channel quantization,
+returns the index of dimension on which per-channel quantization is applied.
+""",
+)
+
+add_docstr_all(
+    "random_",
+    r"""
+random_(from=0, to=None, *, generator=None) -> Tensor
+
+Fills :attr:`self` tensor with numbers sampled from the discrete uniform
+distribution over ``[from, to - 1]``. If not specified, the values are usually
+only bounded by :attr:`self` tensor's data type. However, for floating point
+types, if unspecified, range will be ``[0, 2^mantissa]`` to ensure that every
+value is representable. For example, `torch.tensor(1, dtype=torch.double).random_()`
+will be uniform in ``[0, 2^53]``.
+""",
+)
+
+add_docstr_all(
+    "rad2deg",
+    r"""
+rad2deg() -> Tensor
+
+See :func:`torch.rad2deg`
+""",
+)
+
+add_docstr_all(
+    "rad2deg_",
+    r"""
+rad2deg_() -> Tensor
+
+In-place version of :meth:`~Tensor.rad2deg`
+""",
+)
+
+add_docstr_all(
+    "deg2rad",
+    r"""
+deg2rad() -> Tensor
+
+See :func:`torch.deg2rad`
+""",
+)
+
+add_docstr_all(
+    "deg2rad_",
+    r"""
+deg2rad_() -> Tensor
+
+In-place version of :meth:`~Tensor.deg2rad`
+""",
+)
+
+add_docstr_all(
+    "ravel",
+    r"""
+ravel() -> Tensor
+
+See :func:`torch.ravel`
+""",
+)
+
+add_docstr_all(
+    "reciprocal",
+    r"""
+reciprocal() -> Tensor
+
+See :func:`torch.reciprocal`
+""",
+)
+
+add_docstr_all(
+    "reciprocal_",
+    r"""
+reciprocal_() -> Tensor
+
+In-place version of :meth:`~Tensor.reciprocal`
+""",
+)
+
+add_docstr_all(
+    "record_stream",
+    r"""
+record_stream(stream)
+
+Marks the tensor as having been used by this stream.  When the tensor
+is deallocated, ensure the tensor memory is not reused for another tensor
+until all work queued on :attr:`stream` at the time of deallocation is
+complete.
+
+.. note::
+
+    The caching allocator is aware of only the stream where a tensor was
+    allocated. Due to the awareness, it already correctly manages the life
+    cycle of tensors on only one stream. But if a tensor is used on a stream
+    different from the stream of origin, the allocator might reuse the memory
+    unexpectedly. Calling this method lets the allocator know which streams
+    have used the tensor.
+
+.. warning::
+
+    This method is most suitable for use cases where you are providing a
+    function that created a tensor on a side stream, and want users to be able
+    to make use of the tensor without having to think carefully about stream
+    safety when making use of it.  These safety guarantees come at some
+    performance and predictability cost (analogous to the tradeoff between GC
+    and manual memory management), so if you are in a situation where
+    you manage the full lifetime of your tensors, you may consider instead
+    manually managing CUDA events so that calling this method is not necessary.
+    In particular, when you call this method, on later allocations the
+    allocator will poll the recorded stream to see if all operations have
+    completed yet; you can potentially race with side stream computation and
+    non-deterministically reuse or fail to reuse memory for an allocation.
+
+    You can safely use tensors allocated on side streams without
+    :meth:`~Tensor.record_stream`; you must manually ensure that
+    any non-creation stream uses of a tensor are synced back to the creation
+    stream before you deallocate the tensor.  As the CUDA caching allocator
+    guarantees that the memory will only be reused with the same creation stream,
+    this is sufficient to ensure that writes to future reallocations of the
+    memory will be delayed until non-creation stream uses are done.
+    (Counterintuitively, you may observe that on the CPU side we have already
+    reallocated the tensor, even though CUDA kernels on the old tensor are
+    still in progress.  This is fine, because CUDA operations on the new
+    tensor will appropriately wait for the old operations to complete, as they
+    are all on the same stream.)
+
+    Concretely, this looks like this::
+
+        with torch.cuda.stream(s0):
+            x = torch.zeros(N)
+
+        s1.wait_stream(s0)
+        with torch.cuda.stream(s1):
+            y = some_comm_op(x)
+
+        ... some compute on s0 ...
+
+        # synchronize creation stream s0 to side stream s1
+        # before deallocating x
+        s0.wait_stream(s1)
+        del x
+
+    Note that some discretion is required when deciding when to perform
+    ``s0.wait_stream(s1)``.  In particular, if we were to wait immediately
+    after ``some_comm_op``, there wouldn't be any point in having the side
+    stream; it would be equivalent to have run ``some_comm_op`` on ``s0``.
+    Instead, the synchronization must be placed at some appropriate, later
+    point in time where you expect the side stream ``s1`` to have finished
+    work.  This location is typically identified via profiling, e.g., using
+    Chrome traces produced by
+    :meth:`torch.autograd.profiler.profile.export_chrome_trace`.  If you
+    place the wait too early, work on s0 will block until ``s1`` has finished,
+    preventing further overlapping of communication and computation.  If you
+    place the wait too late, you will use more memory than is strictly
+    necessary (as you are keeping ``x`` live for longer.)  For a concrete
+    example of how this guidance can be applied in practice, see this post:
+    `FSDP and CUDACachingAllocator
+    <https://dev-discuss.pytorch.org/t/fsdp-cudacachingallocator-an-outsider-newb-perspective/1486>`_.
+""",
+)
+
+add_docstr_all(
+    "remainder",
+    r"""
+remainder(divisor) -> Tensor
+
+See :func:`torch.remainder`
+""",
+)
+
+add_docstr_all(
+    "remainder_",
+    r"""
+remainder_(divisor) -> Tensor
+
+In-place version of :meth:`~Tensor.remainder`
+""",
+)
+
+add_docstr_all(
+    "renorm",
+    r"""
+renorm(p, dim, maxnorm) -> Tensor
+
+See :func:`torch.renorm`
+""",
+)
+
+add_docstr_all(
+    "renorm_",
+    r"""
+renorm_(p, dim, maxnorm) -> Tensor
+
+In-place version of :meth:`~Tensor.renorm`
+""",
+)
+
+add_docstr_all(
+    "repeat",
+    r"""
+repeat(*sizes) -> Tensor
+
+Repeats this tensor along the specified dimensions.
+
+Unlike :meth:`~Tensor.expand`, this function copies the tensor's data.
+
+.. warning::
+
+    :meth:`~Tensor.repeat` behaves differently from
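+    `numpy.repeat <https://numpy.org/doc/stable/reference/generated/numpy.repeat.html>`_,
+    but is more similar to
+    `numpy.tile <https://numpy.org/doc/stable/reference/generated/numpy.tile.html>`_.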
+    For the operator similar to `numpy.repeat`, see :func:`torch.repeat_interleave`.
+
+Args:
+    sizes (torch.Size or int...): The number of times to repeat this tensor along each
+        dimension
+
+Example::
+
+    >>> x = torch.tensor([1, 2, 3])
+    >>> x.repeat(4, 2)
+    tensor([[ 1,  2,  3,  1,  2,  3],
+            [ 1,  2,  3,  1,  2,  3],
+            [ 1,  2,  3,  1,  2,  3],
+            [ 1,  2,  3,  1,  2,  3]])
+    >>> x.repeat(4, 2, 1).size()
+    torch.Size([4, 2, 3])
+""",
+)
+
+add_docstr_all(
+    "repeat_interleave",
+    r"""
+repeat_interleave(repeats, dim=None, *, output_size=None) -> Tensor
+
+See :func:`torch.repeat_interleave`.
+""",
+)
+
+add_docstr_all(
+    "requires_grad_",
+    r"""
+requires_grad_(requires_grad=True) -> Tensor
+
+Change if autograd should record operations on this tensor: sets this tensor's
+:attr:`requires_grad` attribute in-place. Returns this tensor.
+
+:func:`requires_grad_`'s main use case is to tell autograd to begin recording
+operations on a Tensor ``tensor``. If ``tensor`` has ``requires_grad=False``
+(because it was obtained through a DataLoader, or required preprocessing or
+initialization), ``tensor.requires_grad_()`` makes it so that autograd will
+begin to record operations on ``tensor``.
+
+Args:
+    requires_grad (bool): If autograd should record operations on this tensor.
+        Default: ``True``.
+
+Example::
+
+    >>> # Let's say we want to preprocess some saved weights and use
+    >>> # the result as new weights.
+    >>> saved_weights = [0.1, 0.2, 0.3, 0.25]
+    >>> loaded_weights = torch.tensor(saved_weights)
+    >>> weights = preprocess(loaded_weights)  # some function
+    >>> weights
+    tensor([-0.5503,  0.4926, -2.1158, -0.8303])
+
+    >>> # Now, start to record operations done to weights
+    >>> weights.requires_grad_()
+    >>> out = weights.pow(2).sum()
+    >>> out.backward()
+    >>> weights.grad
+    tensor([-1.1007,  0.9853, -4.2316, -1.6606])
+
+""",
+)
+
+add_docstr_all(
+    "reshape",
+    r"""
+reshape(*shape) -> Tensor
+
+Returns a tensor with the same data and number of elements as :attr:`self`
+but with the specified shape. This method returns a view if :attr:`shape` is
+compatible with the current shape. See :meth:`torch.Tensor.view` on when it is
+possible to return a view.
+
+See :func:`torch.reshape`
+
+Args:
+    shape (tuple of ints or int...): the desired shape
+
+""",
+)
+
+add_docstr_all(
+    "reshape_as",
+    r"""
+reshape_as(other) -> Tensor
+
+Returns this tensor as the same shape as :attr:`other`.
+``self.reshape_as(other)`` is equivalent to ``self.reshape(other.size())``.
+This method returns a view if ``other.size()`` is compatible with the current
+shape. See :meth:`torch.Tensor.view` on when it is possible to return a view.
+
+Please see :meth:`reshape` for more information about ``reshape``.
+
+Args:
+    other (:class:`torch.Tensor`): The result tensor has the same shape
+        as :attr:`other`.
+""",
+)
+
+add_docstr_all(
+    "resize_",
+    r"""
+resize_(*sizes, memory_format=torch.contiguous_format) -> Tensor
+
+Resizes :attr:`self` tensor to the specified size. If the number of elements is
+larger than the current storage size, then the underlying storage is resized
+to fit the new number of elements. If the number of elements is smaller, the
+underlying storage is not changed. Existing elements are preserved but any new
+memory is uninitialized.
+
+.. warning::
+
+    This is a low-level method. The storage is reinterpreted as C-contiguous,
+    ignoring the current strides (unless the target size equals the current
+    size, in which case the tensor is left unchanged). For most purposes, you
+    will instead want to use :meth:`~Tensor.view()`, which checks for
+    contiguity, or :meth:`~Tensor.reshape()`, which copies data if needed. To
+    change the size in-place with custom strides, see :meth:`~Tensor.set_()`.
+
+.. note::
+
+    If :func:`torch.use_deterministic_algorithms()` and
+    :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to
+    ``True``, new elements are initialized to prevent nondeterministic behavior
+    from using the result as an input to an operation. Floating point and
+    complex values are set to NaN, and integer values are set to the maximum
+    value.
+
+Args:
+    sizes (torch.Size or int...): the desired size
+    memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+        Tensor. Default: ``torch.contiguous_format``. Note that memory format of
+        :attr:`self` is going to be unaffected if ``self.size()`` matches ``sizes``.
+
+Example::
+
+    >>> x = torch.tensor([[1, 2], [3, 4], [5, 6]])
+    >>> x.resize_(2, 2)
+    tensor([[ 1,  2],
+            [ 3,  4]])
+""",
+)
+
+add_docstr_all(
+    "resize_as_",
+    r"""
+resize_as_(tensor, memory_format=torch.contiguous_format) -> Tensor
+
+Resizes the :attr:`self` tensor to be the same size as the specified
+:attr:`tensor`. This is equivalent to ``self.resize_(tensor.size())``.
+
+Args:
+    memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+        Tensor. Default: ``torch.contiguous_format``. Note that memory format of
+        :attr:`self` is going to be unaffected if ``self.size()`` matches ``tensor.size()``.
+
+""",
+)
+
+add_docstr_all(
+    "rot90",
+    r"""
+rot90(k, dims) -> Tensor
+
+See :func:`torch.rot90`
+""",
+)
+
+add_docstr_all(
+    "round",
+    r"""
+round(decimals=0) -> Tensor
+
+See :func:`torch.round`
+""",
+)
+
+add_docstr_all(
+    "round_",
+    r"""
+round_(decimals=0) -> Tensor
+
+In-place version of :meth:`~Tensor.round`
+""",
+)
+
+add_docstr_all(
+    "rsqrt",
+    r"""
+rsqrt() -> Tensor
+
+See :func:`torch.rsqrt`
+""",
+)
+
+add_docstr_all(
+    "rsqrt_",
+    r"""
+rsqrt_() -> Tensor
+
+In-place version of :meth:`~Tensor.rsqrt`
+""",
+)
+
+add_docstr_all(
+    "scatter_",
+    r"""
+scatter_(dim, index, src, *, reduce=None) -> Tensor
+
+Writes all values from the tensor :attr:`src` into :attr:`self` at the indices
+specified in the :attr:`index` tensor. For each value in :attr:`src`, its output
+index is specified by its index in :attr:`src` for ``dimension != dim`` and by
+the corresponding value in :attr:`index` for ``dimension = dim``.
+
+For a 3-D tensor, :attr:`self` is updated as::
+
+    self[index[i][j][k]][j][k] = src[i][j][k]  # if dim == 0
+    self[i][index[i][j][k]][k] = src[i][j][k]  # if dim == 1
+    self[i][j][index[i][j][k]] = src[i][j][k]  # if dim == 2
+
+This is the reverse operation of the manner described in :meth:`~Tensor.gather`.
+
+:attr:`self`, :attr:`index` and :attr:`src` (if it is a Tensor) should all have
+the same number of dimensions. It is also required that
+``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that
+``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``.
+Note that ``index`` and ``src`` do not broadcast.
+
+Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be
+between ``0`` and ``self.size(dim) - 1`` inclusive.
+
+.. warning::
+
+    When indices are not unique, the behavior is non-deterministic (one of the
+    values from ``src`` will be picked arbitrarily) and the gradient will be
+    incorrect (it will be propagated to all locations in the source that
+    correspond to the same index)!
+
+.. note::
+
+    The backward pass is implemented only for ``src.shape == index.shape``.
+
+Additionally accepts an optional :attr:`reduce` argument that allows
+specification of an optional reduction operation, which is applied to all
+values in the tensor :attr:`src` into :attr:`self` at the indices
+specified in the :attr:`index`. For each value in :attr:`src`, the reduction
+operation is applied to an index in :attr:`self` which is specified by
+its index in :attr:`src` for ``dimension != dim`` and by the corresponding
+value in :attr:`index` for ``dimension = dim``.
+
+Given a 3-D tensor and reduction using the multiplication operation, :attr:`self`
+is updated as::
+
+    self[index[i][j][k]][j][k] *= src[i][j][k]  # if dim == 0
+    self[i][index[i][j][k]][k] *= src[i][j][k]  # if dim == 1
+    self[i][j][index[i][j][k]] *= src[i][j][k]  # if dim == 2
+
+Reducing with the addition operation is the same as using
+:meth:`~torch.Tensor.scatter_add_`.
+
+.. warning::
+    The reduce argument with Tensor ``src`` is deprecated and will be removed in
+    a future PyTorch release. Please use :meth:`~torch.Tensor.scatter_reduce_`
+    instead for more reduction options.
+
+Args:
+    dim (int): the axis along which to index
+    index (LongTensor): the indices of elements to scatter, can be either empty
+        or of the same dimensionality as ``src``. When empty, the operation
+        returns ``self`` unchanged.
+    src (Tensor): the source element(s) to scatter.
+
+Keyword args:
+    reduce (str, optional): reduction operation to apply, can be either
+        ``'add'`` or ``'multiply'``.
+
+Example::
+
+    >>> src = torch.arange(1, 11).reshape((2, 5))
+    >>> src
+    tensor([[ 1,  2,  3,  4,  5],
+            [ 6,  7,  8,  9, 10]])
+    >>> index = torch.tensor([[0, 1, 2, 0]])
+    >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(0, index, src)
+    tensor([[1, 0, 0, 4, 0],
+            [0, 2, 0, 0, 0],
+            [0, 0, 3, 0, 0]])
+    >>> index = torch.tensor([[0, 1, 2], [0, 1, 4]])
+    >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(1, index, src)
+    tensor([[1, 2, 3, 0, 0],
+            [6, 7, 0, 0, 8],
+            [0, 0, 0, 0, 0]])
+
+    >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]),
+    ...            1.23, reduce='multiply')
+    tensor([[2.0000, 2.0000, 2.4600, 2.0000],
+            [2.0000, 2.0000, 2.0000, 2.4600]])
+    >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]),
+    ...            1.23, reduce='add')
+    tensor([[2.0000, 2.0000, 3.2300, 2.0000],
+            [2.0000, 2.0000, 2.0000, 3.2300]])
+
+.. function:: scatter_(dim, index, value, *, reduce=None) -> Tensor
+   :noindex:
+
+Writes the value from :attr:`value` into :attr:`self` at the indices
+specified in the :attr:`index` tensor.  This operation is equivalent to the previous version,
+with the :attr:`src` tensor filled entirely with :attr:`value`.
+
+Args:
+    dim (int): the axis along which to index
+    index (LongTensor): the indices of elements to scatter, can be either empty
+        or of the same dimensionality as ``src``. When empty, the operation
+        returns ``self`` unchanged.
+    value (Scalar): the value to scatter.
+
+Keyword args:
+    reduce (str, optional): reduction operation to apply, can be either
+        ``'add'`` or ``'multiply'``.
+
+Example::
+
+    >>> index = torch.tensor([[0, 1]])
+    >>> value = 2
+    >>> torch.zeros(3, 5).scatter_(0, index, value)
+    tensor([[2., 0., 0., 0., 0.],
+            [0., 2., 0., 0., 0.],
+            [0., 0., 0., 0., 0.]])
+""",
+)
+
+add_docstr_all(
+    "scatter_add_",
+    r"""
+scatter_add_(dim, index, src) -> Tensor
+
+Adds all values from the tensor :attr:`src` into :attr:`self` at the indices
+specified in the :attr:`index` tensor in a similar fashion as
+:meth:`~torch.Tensor.scatter_`. For each value in :attr:`src`, it is added to
+an index in :attr:`self` which is specified by its index in :attr:`src`
+for ``dimension != dim`` and by the corresponding value in :attr:`index` for
+``dimension = dim``.
+
+For a 3-D tensor, :attr:`self` is updated as::
+
+    self[index[i][j][k]][j][k] += src[i][j][k]  # if dim == 0
+    self[i][index[i][j][k]][k] += src[i][j][k]  # if dim == 1
+    self[i][j][index[i][j][k]] += src[i][j][k]  # if dim == 2
+
+:attr:`self`, :attr:`index` and :attr:`src` should have same number of
+dimensions. It is also required that ``index.size(d) <= src.size(d)`` for all
+dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all dimensions
+``d != dim``. Note that ``index`` and ``src`` do not broadcast.
+
+Note:
+    {forward_reproducibility_note}
+
+.. note::
+
+    The backward pass is implemented only for ``src.shape == index.shape``.
+
+Args:
+    dim (int): the axis along which to index
+    index (LongTensor): the indices of elements to scatter and add, can be
+        either empty or of the same dimensionality as ``src``. When empty, the
+        operation returns ``self`` unchanged.
+    src (Tensor): the source elements to scatter and add
+
+Example::
+
+    >>> src = torch.ones((2, 5))
+    >>> index = torch.tensor([[0, 1, 2, 0, 0]])
+    >>> torch.zeros(3, 5, dtype=src.dtype).scatter_add_(0, index, src)
+    tensor([[1., 0., 0., 1., 1.],
+            [0., 1., 0., 0., 0.],
+            [0., 0., 1., 0., 0.]])
+    >>> index = torch.tensor([[0, 1, 2, 0, 0], [0, 1, 2, 2, 2]])
+    >>> torch.zeros(3, 5, dtype=src.dtype).scatter_add_(0, index, src)
+    tensor([[2., 0., 0., 1., 1.],
+            [0., 2., 0., 0., 0.],
+            [0., 0., 2., 1., 1.]])
+
+""".format(
+        **reproducibility_notes
+    ),
+)
+
+add_docstr_all(
+    "scatter_reduce_",
+    r"""
+scatter_reduce_(dim, index, src, reduce, *, include_self=True) -> Tensor
+
+Reduces all values from the :attr:`src` tensor to the indices specified in
+the :attr:`index` tensor in the :attr:`self` tensor using the applied reduction
+defined via the :attr:`reduce` argument (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`,
+:obj:`"amax"`, :obj:`"amin"`). For each value in :attr:`src`, it is reduced to an
+index in :attr:`self` which is specified by its index in :attr:`src` for
+``dimension != dim`` and by the corresponding value in :attr:`index` for
+``dimension = dim``. If :obj:`include_self=True`, the values in the :attr:`self`
+tensor are included in the reduction.
+
+:attr:`self`, :attr:`index` and :attr:`src` should all have
+the same number of dimensions. It is also required that
+``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that
+``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``.
+Note that ``index`` and ``src`` do not broadcast.
+
+For a 3-D tensor with :obj:`reduce="sum"` and :obj:`include_self=True` the
+output is given as::
+
+    self[index[i][j][k]][j][k] += src[i][j][k]  # if dim == 0
+    self[i][index[i][j][k]][k] += src[i][j][k]  # if dim == 1
+    self[i][j][index[i][j][k]] += src[i][j][k]  # if dim == 2
+
+Note:
+    {forward_reproducibility_note}
+
+.. note::
+
+    The backward pass is implemented only for ``src.shape == index.shape``.
+
+.. warning::
+
+    This function is in beta and may change in the near future.
+
+Args:
+    dim (int): the axis along which to index
+    index (LongTensor): the indices of elements to scatter and reduce.
+    src (Tensor): the source elements to scatter and reduce
+    reduce (str): the reduction operation to apply for non-unique indices
+        (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`)
+    include_self (bool): whether elements from the :attr:`self` tensor are
+        included in the reduction
+
+Example::
+
+    >>> src = torch.tensor([1., 2., 3., 4., 5., 6.])
+    >>> index = torch.tensor([0, 1, 0, 1, 2, 1])
+    >>> input = torch.tensor([1., 2., 3., 4.])
+    >>> input.scatter_reduce(0, index, src, reduce="sum")
+    tensor([5., 14., 8., 4.])
+    >>> input.scatter_reduce(0, index, src, reduce="sum", include_self=False)
+    tensor([4., 12., 5., 4.])
+    >>> input2 = torch.tensor([5., 4., 3., 2.])
+    >>> input2.scatter_reduce(0, index, src, reduce="amax")
+    tensor([5., 6., 5., 2.])
+    >>> input2.scatter_reduce(0, index, src, reduce="amax", include_self=False)
+    tensor([3., 6., 5., 2.])
+
+
+""".format(
+        **reproducibility_notes
+    ),
+)
+
+add_docstr_all(
+    "select",
+    r"""
+select(dim, index) -> Tensor
+
+See :func:`torch.select`
+""",
+)
+
+add_docstr_all(
+    "select_scatter",
+    r"""
+select_scatter(src, dim, index) -> Tensor
+
+See :func:`torch.select_scatter`
+""",
+)
+
+add_docstr_all(
+    "slice_scatter",
+    r"""
+slice_scatter(src, dim=0, start=None, end=None, step=1) -> Tensor
+
+See :func:`torch.slice_scatter`
+""",
+)
+
+add_docstr_all(
+    "set_",
+    r"""
+set_(source=None, storage_offset=0, size=None, stride=None) -> Tensor
+
+Sets the underlying storage, size, and strides. If :attr:`source` is a tensor,
+:attr:`self` tensor will share the same storage and have the same size and
+strides as :attr:`source`. Changes to elements in one tensor will be reflected
+in the other.
+
+If :attr:`source` is a :class:`~torch.Storage`, the method sets the underlying
+storage, offset, size, and stride.
+
+Args:
+    source (Tensor or Storage): the tensor or storage to use
+    storage_offset (int, optional): the offset in the storage
+    size (torch.Size, optional): the desired size. Defaults to the size of the source.
+    stride (tuple, optional): the desired stride. Defaults to C-contiguous strides.
+""",
+)
+
+add_docstr_all(
+    "sigmoid",
+    r"""
+sigmoid() -> Tensor
+
+See :func:`torch.sigmoid`
+""",
+)
+
+add_docstr_all(
+    "sigmoid_",
+    r"""
+sigmoid_() -> Tensor
+
+In-place version of :meth:`~Tensor.sigmoid`
+""",
+)
+
+add_docstr_all(
+    "logit",
+    r"""
+logit() -> Tensor
+
+See :func:`torch.logit`
+""",
+)
+
+add_docstr_all(
+    "logit_",
+    r"""
+logit_() -> Tensor
+
+In-place version of :meth:`~Tensor.logit`
+""",
+)
+
+add_docstr_all(
+    "sign",
+    r"""
+sign() -> Tensor
+
+See :func:`torch.sign`
+""",
+)
+
+add_docstr_all(
+    "sign_",
+    r"""
+sign_() -> Tensor
+
+In-place version of :meth:`~Tensor.sign`
+""",
+)
+
+add_docstr_all(
+    "signbit",
+    r"""
+signbit() -> Tensor
+
+See :func:`torch.signbit`
+""",
+)
+
+add_docstr_all(
+    "sgn",
+    r"""
+sgn() -> Tensor
+
+See :func:`torch.sgn`
+""",
+)
+
+add_docstr_all(
+    "sgn_",
+    r"""
+sgn_() -> Tensor
+
+In-place version of :meth:`~Tensor.sgn`
+""",
+)
+
+add_docstr_all(
+    "sin",
+    r"""
+sin() -> Tensor
+
+See :func:`torch.sin`
+""",
+)
+
+add_docstr_all(
+    "sin_",
+    r"""
+sin_() -> Tensor
+
+In-place version of :meth:`~Tensor.sin`
+""",
+)
+
+add_docstr_all(
+    "sinc",
+    r"""
+sinc() -> Tensor
+
+See :func:`torch.sinc`
+""",
+)
+
+add_docstr_all(
+    "sinc_",
+    r"""
+sinc_() -> Tensor
+
+In-place version of :meth:`~Tensor.sinc`
+""",
+)
+
+add_docstr_all(
+    "sinh",
+    r"""
+sinh() -> Tensor
+
+See :func:`torch.sinh`
+""",
+)
+
+add_docstr_all(
+    "sinh_",
+    r"""
+sinh_() -> Tensor
+
+In-place version of :meth:`~Tensor.sinh`
+""",
+)
+
+add_docstr_all(
+    "size",
+    r"""
+size(dim=None) -> torch.Size or int
+
+Returns the size of the :attr:`self` tensor. If ``dim`` is not specified,
+the returned value is a :class:`torch.Size`, a subclass of :class:`tuple`.
+If ``dim`` is specified, returns an int holding the size of that dimension.
+
+Args:
+  dim (int, optional): The dimension for which to retrieve the size.
+
+Example::
+
+    >>> t = torch.empty(3, 4, 5)
+    >>> t.size()
+    torch.Size([3, 4, 5])
+    >>> t.size(dim=1)
+    4
+
+""",
+)
+
+add_docstr_all(
+    "shape",
+    r"""
+shape() -> torch.Size
+
+Returns the size of the :attr:`self` tensor. Alias for :attr:`size`.
+
+See also :meth:`Tensor.size`.
+
+Example::
+
+    >>> t = torch.empty(3, 4, 5)
+    >>> t.size()
+    torch.Size([3, 4, 5])
+    >>> t.shape
+    torch.Size([3, 4, 5])
+
+""",
+)
+
+add_docstr_all(
+    "sort",
+    r"""
+sort(dim=-1, descending=False) -> (Tensor, LongTensor)
+
+See :func:`torch.sort`
+""",
+)
+
+add_docstr_all(
+    "msort",
+    r"""
+msort() -> Tensor
+
+See :func:`torch.msort`
+""",
+)
+
+add_docstr_all(
+    "argsort",
+    r"""
+argsort(dim=-1, descending=False) -> LongTensor
+
+See :func:`torch.argsort`
+""",
+)
+
+add_docstr_all(
+    "sparse_dim",
+    r"""
+sparse_dim() -> int
+
+Return the number of sparse dimensions in a :ref:`sparse tensor <sparse-docs>` :attr:`self`.
+
+.. note::
+  Returns ``0`` if :attr:`self` is not a sparse tensor.
+
+See also :meth:`Tensor.dense_dim` and :ref:`hybrid tensors <sparse-hybrid-coo-docs>`.
+""",
+)
+
+add_docstr_all(
+    "sparse_resize_",
+    r"""
+sparse_resize_(size, sparse_dim, dense_dim) -> Tensor
+
+Resizes :attr:`self` :ref:`sparse tensor <sparse-docs>` to the desired
+size and the number of sparse and dense dimensions.
+
+.. note::
+  If the number of specified elements in :attr:`self` is zero, then
+  :attr:`size`, :attr:`sparse_dim`, and :attr:`dense_dim` can be any
+  size and positive integers such that ``len(size) == sparse_dim +
+  dense_dim``.
+
+  If :attr:`self` specifies one or more elements, however, then each
+  dimension in :attr:`size` must not be smaller than the corresponding
+  dimension of :attr:`self`, :attr:`sparse_dim` must equal the number
+  of sparse dimensions in :attr:`self`, and :attr:`dense_dim` must
+  equal the number of dense dimensions in :attr:`self`.
+
+.. warning::
+  Throws an error if :attr:`self` is not a sparse tensor.
+
+Args:
+    size (torch.Size): the desired size. If :attr:`self` is non-empty
+      sparse tensor, the desired size cannot be smaller than the
+      original size.
+    sparse_dim (int): the number of sparse dimensions
+    dense_dim (int): the number of dense dimensions
+""",
+)
+
+add_docstr_all(
+    "sparse_resize_and_clear_",
+    r"""
+sparse_resize_and_clear_(size, sparse_dim, dense_dim) -> Tensor
+
+Removes all specified elements from a :ref:`sparse tensor
+<sparse-docs>` :attr:`self` and resizes :attr:`self` to the desired
+size and the number of sparse and dense dimensions.
+
+.. warning::
+  Throws an error if :attr:`self` is not a sparse tensor.
+
+Args:
+    size (torch.Size): the desired size.
+    sparse_dim (int): the number of sparse dimensions
+    dense_dim (int): the number of dense dimensions
+""",
+)
+
+add_docstr_all(
+    "sqrt",
+    r"""
+sqrt() -> Tensor
+
+See :func:`torch.sqrt`
+""",
+)
+
+add_docstr_all(
+    "sqrt_",
+    r"""
+sqrt_() -> Tensor
+
+In-place version of :meth:`~Tensor.sqrt`
+""",
+)
+
+add_docstr_all(
+    "square",
+    r"""
+square() -> Tensor
+
+See :func:`torch.square`
+""",
+)
+
+add_docstr_all(
+    "square_",
+    r"""
+square_() -> Tensor
+
+In-place version of :meth:`~Tensor.square`
+""",
+)
+
+add_docstr_all(
+    "squeeze",
+    r"""
+squeeze(dim=None) -> Tensor
+
+See :func:`torch.squeeze`
+""",
+)
+
+add_docstr_all(
+    "squeeze_",
+    r"""
+squeeze_(dim=None) -> Tensor
+
+In-place version of :meth:`~Tensor.squeeze`
+""",
+)
+
+add_docstr_all(
+    "std",
+    r"""
+std(dim=None, *, correction=1, keepdim=False) -> Tensor
+
+See :func:`torch.std`
+""",
+)
+
+add_docstr_all(
+    "storage_offset",
+    r"""
+storage_offset() -> int
+
+Returns :attr:`self` tensor's offset in the underlying storage in terms of
+number of storage elements (not bytes).
+
+Example::
+
+    >>> x = torch.tensor([1, 2, 3, 4, 5])
+    >>> x.storage_offset()
+    0
+    >>> x[3:].storage_offset()
+    3
+
+""",
+)
+
+add_docstr_all(
+    "untyped_storage",
+    r"""
+untyped_storage() -> torch.UntypedStorage
+
+Returns the underlying :class:`UntypedStorage`.
+""",
+)
+
+add_docstr_all(
+    "stride",
+    r"""
+stride(dim) -> tuple or int
+
+Returns the stride of :attr:`self` tensor.
+
+Stride is the jump necessary to go from one element to the next one in the
+specified dimension :attr:`dim`. A tuple of all strides is returned when no
+argument is passed in. Otherwise, an integer value is returned as the stride in
+the particular dimension :attr:`dim`.
+
+Args:
+    dim (int, optional): the desired dimension in which stride is required
+
+Example::
+
+    >>> x = torch.tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
+    >>> x.stride()
+    (5, 1)
+    >>> x.stride(0)
+    5
+    >>> x.stride(-1)
+    1
+
+""",
+)
+
+add_docstr_all(
+    "sub",
+    r"""
+sub(other, *, alpha=1) -> Tensor
+
+See :func:`torch.sub`.
+""",
+)
+
+add_docstr_all(
+    "sub_",
+    r"""
+sub_(other, *, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.sub`
+""",
+)
+
+add_docstr_all(
+    "subtract",
+    r"""
+subtract(other, *, alpha=1) -> Tensor
+
+See :func:`torch.subtract`.
+""",
+)
+
+add_docstr_all(
+    "subtract_",
+    r"""
+subtract_(other, *, alpha=1) -> Tensor
+
+In-place version of :meth:`~Tensor.subtract`.
+""",
+)
+
+add_docstr_all(
+    "sum",
+    r"""
+sum(dim=None, keepdim=False, dtype=None) -> Tensor
+
+See :func:`torch.sum`
+""",
+)
+
+add_docstr_all(
+    "nansum",
+    r"""
+nansum(dim=None, keepdim=False, dtype=None) -> Tensor
+
+See :func:`torch.nansum`
+""",
+)
+
+add_docstr_all(
+    "svd",
+    r"""
+svd(some=True, compute_uv=True) -> (Tensor, Tensor, Tensor)
+
+See :func:`torch.svd`
+""",
+)
+
+add_docstr_all(
+    "swapdims",
+    r"""
+swapdims(dim0, dim1) -> Tensor
+
+See :func:`torch.swapdims`
+""",
+)
+
+add_docstr_all(
+    "swapdims_",
+    r"""
+swapdims_(dim0, dim1) -> Tensor
+
+In-place version of :meth:`~Tensor.swapdims`
+""",
+)
+
+add_docstr_all(
+    "swapaxes",
+    r"""
+swapaxes(axis0, axis1) -> Tensor
+
+See :func:`torch.swapaxes`
+""",
+)
+
+add_docstr_all(
+    "swapaxes_",
+    r"""
+swapaxes_(axis0, axis1) -> Tensor
+
+In-place version of :meth:`~Tensor.swapaxes`
+""",
+)
+
+add_docstr_all(
+    "t",
+    r"""
+t() -> Tensor
+
+See :func:`torch.t`
+""",
+)
+
+add_docstr_all(
+    "t_",
+    r"""
+t_() -> Tensor
+
+In-place version of :meth:`~Tensor.t`
+""",
+)
+
+add_docstr_all(
+    "tile",
+    r"""
+tile(dims) -> Tensor
+
+See :func:`torch.tile`
+""",
+)
+
+add_docstr_all(
+    "to",
+    r"""
+to(*args, **kwargs) -> Tensor
+
+Performs Tensor dtype and/or device conversion. A :class:`torch.dtype` and :class:`torch.device` are
+inferred from the arguments of ``self.to(*args, **kwargs)``.
+
+.. note::
+
+    If the ``self`` Tensor already
+    has the correct :class:`torch.dtype` and :class:`torch.device`, then ``self`` is returned.
+    Otherwise, the returned tensor is a copy of ``self`` with the desired
+    :class:`torch.dtype` and :class:`torch.device`.
+
+Here are the ways to call ``to``:
+
+.. method:: to(dtype, non_blocking=False, copy=False, memory_format=torch.preserve_format) -> Tensor
+   :noindex:
+
+    Returns a Tensor with the specified :attr:`dtype`
+
+    Args:
+        {memory_format}
+
+.. method:: to(device=None, dtype=None, non_blocking=False, copy=False, memory_format=torch.preserve_format) -> Tensor
+   :noindex:
+
+    Returns a Tensor with the specified :attr:`device` and (optional)
+    :attr:`dtype`. If :attr:`dtype` is ``None`` it is inferred to be ``self.dtype``.
+    When :attr:`non_blocking`, tries to convert asynchronously with respect to
+    the host if possible, e.g., converting a CPU Tensor with pinned memory to a
+    CUDA Tensor.
+    When :attr:`copy` is set, a new Tensor is created even when the Tensor
+    already matches the desired conversion.
+
+    Args:
+        {memory_format}
+
+.. method:: to(other, non_blocking=False, copy=False) -> Tensor
+   :noindex:
+
+    Returns a Tensor with same :class:`torch.dtype` and :class:`torch.device` as
+    the Tensor :attr:`other`. When :attr:`non_blocking`, tries to convert
+    asynchronously with respect to the host if possible, e.g., converting a CPU
+    Tensor with pinned memory to a CUDA Tensor.
+    When :attr:`copy` is set, a new Tensor is created even when the Tensor
+    already matches the desired conversion.
+
+Example::
+
+    >>> tensor = torch.randn(2, 2)  # Initially dtype=float32, device=cpu
+    >>> tensor.to(torch.float64)
+    tensor([[-0.5044,  0.0005],
+            [ 0.3310, -0.0584]], dtype=torch.float64)
+
+    >>> cuda0 = torch.device('cuda:0')
+    >>> tensor.to(cuda0)
+    tensor([[-0.5044,  0.0005],
+            [ 0.3310, -0.0584]], device='cuda:0')
+
+    >>> tensor.to(cuda0, dtype=torch.float64)
+    tensor([[-0.5044,  0.0005],
+            [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0')
+
+    >>> other = torch.randn((), dtype=torch.float64, device=cuda0)
+    >>> tensor.to(other, non_blocking=True)
+    tensor([[-0.5044,  0.0005],
+            [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0')
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "byte",
+    r"""
+byte(memory_format=torch.preserve_format) -> Tensor
+
+``self.byte()`` is equivalent to ``self.to(torch.uint8)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "bool",
+    r"""
+bool(memory_format=torch.preserve_format) -> Tensor
+
+``self.bool()`` is equivalent to ``self.to(torch.bool)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "char",
+    r"""
+char(memory_format=torch.preserve_format) -> Tensor
+
+``self.char()`` is equivalent to ``self.to(torch.int8)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "bfloat16",
+    r"""
+bfloat16(memory_format=torch.preserve_format) -> Tensor
+``self.bfloat16()`` is equivalent to ``self.to(torch.bfloat16)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "double",
+    r"""
+double(memory_format=torch.preserve_format) -> Tensor
+
+``self.double()`` is equivalent to ``self.to(torch.float64)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "float",
+    r"""
+float(memory_format=torch.preserve_format) -> Tensor
+
+``self.float()`` is equivalent to ``self.to(torch.float32)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "cdouble",
+    r"""
+cdouble(memory_format=torch.preserve_format) -> Tensor
+
+``self.cdouble()`` is equivalent to ``self.to(torch.complex128)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "cfloat",
+    r"""
+cfloat(memory_format=torch.preserve_format) -> Tensor
+
+``self.cfloat()`` is equivalent to ``self.to(torch.complex64)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "chalf",
+    r"""
+chalf(memory_format=torch.preserve_format) -> Tensor
+
+``self.chalf()`` is equivalent to ``self.to(torch.complex32)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "half",
+    r"""
+half(memory_format=torch.preserve_format) -> Tensor
+
+``self.half()`` is equivalent to ``self.to(torch.float16)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "int",
+    r"""
+int(memory_format=torch.preserve_format) -> Tensor
+
+``self.int()`` is equivalent to ``self.to(torch.int32)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "int_repr",
+    r"""
+int_repr() -> Tensor
+
+Given a quantized Tensor,
+``self.int_repr()`` returns a CPU Tensor with uint8_t as data type that stores the
+underlying uint8_t values of the given Tensor.
+""",
+)
+
+
+add_docstr_all(
+    "long",
+    r"""
+long(memory_format=torch.preserve_format) -> Tensor
+
+``self.long()`` is equivalent to ``self.to(torch.int64)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "short",
+    r"""
+short(memory_format=torch.preserve_format) -> Tensor
+
+``self.short()`` is equivalent to ``self.to(torch.int16)``. See :func:`to`.
+
+Args:
+    {memory_format}
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr_all(
+    "take",
+    r"""
+take(indices) -> Tensor
+
+See :func:`torch.take`
+""",
+)
+
+add_docstr_all(
+    "take_along_dim",
+    r"""
+take_along_dim(indices, dim) -> Tensor
+
+See :func:`torch.take_along_dim`
+""",
+)
+
+add_docstr_all(
+    "tan",
+    r"""
+tan() -> Tensor
+
+See :func:`torch.tan`
+""",
+)
+
+add_docstr_all(
+    "tan_",
+    r"""
+tan_() -> Tensor
+
+In-place version of :meth:`~Tensor.tan`
+""",
+)
+
+add_docstr_all(
+    "tanh",
+    r"""
+tanh() -> Tensor
+
+See :func:`torch.tanh`
+""",
+)
+
+add_docstr_all(
+    "softmax",
+    r"""
+softmax(dim) -> Tensor
+
+Alias for :func:`torch.nn.functional.softmax`.
+""",
+)
+
+add_docstr_all(
+    "tanh_",
+    r"""
+tanh_() -> Tensor
+
+In-place version of :meth:`~Tensor.tanh`
+""",
+)
+
+add_docstr_all(
+    "tolist",
+    r"""
+tolist() -> list or number
+
+Returns the tensor as a (nested) list. For scalars, a standard
+Python number is returned, just like with :meth:`~Tensor.item`.
+Tensors are automatically moved to the CPU first if necessary.
+
+This operation is not differentiable.
+
+Examples::
+
+    >>> a = torch.randn(2, 2)
+    >>> a.tolist()
+    [[0.012766935862600803, 0.5415473580360413],
+     [-0.08909505605697632, 0.7729271650314331]]
+    >>> a[0,0].tolist()
+    0.012766935862600803
+""",
+)
+
+add_docstr_all(
+    "topk",
+    r"""
+topk(k, dim=None, largest=True, sorted=True) -> (Tensor, LongTensor)
+
+See :func:`torch.topk`
+""",
+)
+
+add_docstr_all(
+    "to_dense",
+    r"""
+to_dense(dtype=None, *, masked_grad=True) -> Tensor
+
+Creates a strided copy of :attr:`self` if :attr:`self` is not a strided tensor, otherwise returns :attr:`self`.
+
+Keyword args:
+    dtype (:class:`torch.dtype`, optional): the desired data type of the returned tensor. Default: ``None``.
+    masked_grad (bool, optional): If set to ``True`` (default) and
+      :attr:`self` has a sparse layout then the backward of
+      :meth:`to_dense` returns ``grad.sparse_mask(self)``.
+
+Example::
+
+    >>> s = torch.sparse_coo_tensor(
+    ...        torch.tensor([[1, 1],
+    ...                      [0, 2]]),
+    ...        torch.tensor([9, 10]),
+    ...        size=(3, 3))
+    >>> s.to_dense()
+    tensor([[ 0,  0,  0],
+            [ 9,  0, 10],
+            [ 0,  0,  0]])
+""",
+)
+
+add_docstr_all(
+    "to_sparse",
+    r"""
+to_sparse(sparseDims) -> Tensor
+
+Returns a sparse copy of the tensor.  PyTorch supports sparse tensors in
+:ref:`coordinate format <sparse-coo-docs>`.
+
+Args:
+    sparseDims (int, optional): the number of sparse dimensions to include in the new sparse tensor
+
+Example::
+
+    >>> d = torch.tensor([[0, 0, 0], [9, 0, 10], [0, 0, 0]])
+    >>> d
+    tensor([[ 0,  0,  0],
+            [ 9,  0, 10],
+            [ 0,  0,  0]])
+    >>> d.to_sparse()
+    tensor(indices=tensor([[1, 1],
+                           [0, 2]]),
+           values=tensor([ 9, 10]),
+           size=(3, 3), nnz=2, layout=torch.sparse_coo)
+    >>> d.to_sparse(1)
+    tensor(indices=tensor([[1]]),
+           values=tensor([[ 9,  0, 10]]),
+           size=(3, 3), nnz=1, layout=torch.sparse_coo)
+
+.. method:: to_sparse(*, layout=None, blocksize=None, dense_dim=None) -> Tensor
+   :noindex:
+
+Returns a sparse tensor with the specified layout and blocksize.  If
+the :attr:`self` is strided, the number of dense dimensions could be
+specified, and a hybrid sparse tensor will be created, with
+`dense_dim` dense dimensions and `self.dim() - 2 - dense_dim` batch
+dimension.
+
+.. note:: If the :attr:`self` layout and blocksize parameters match
+          with the specified layout and blocksize, return
+          :attr:`self`. Otherwise, return a sparse tensor copy of
+          :attr:`self`.
+
+Args:
+
+    layout (:class:`torch.layout`, optional): The desired sparse
+      layout. One of ``torch.sparse_coo``, ``torch.sparse_csr``,
+      ``torch.sparse_csc``, ``torch.sparse_bsr``, or
+      ``torch.sparse_bsc``. Default: if ``None``,
+      ``torch.sparse_coo``.
+
+    blocksize (list, tuple, :class:`torch.Size`, optional): Block size
+      of the resulting BSR or BSC tensor. For other layouts,
+      specifying the block size that is not ``None`` will result in a
+      RuntimeError exception.  A block size must be a tuple of length
+      two such that its items evenly divide the two sparse dimensions.
+
+    dense_dim (int, optional): Number of dense dimensions of the
+      resulting CSR, CSC, BSR or BSC tensor.  This argument should be
+      used only if :attr:`self` is a strided tensor, and must be a
+      value between 0 and dimension of :attr:`self` tensor minus two.
+
+Example::
+
+    >>> x = torch.tensor([[1, 0], [0, 0], [2, 3]])
+    >>> x.to_sparse(layout=torch.sparse_coo)
+    tensor(indices=tensor([[0, 2, 2],
+                           [0, 0, 1]]),
+           values=tensor([1, 2, 3]),
+           size=(3, 2), nnz=3, layout=torch.sparse_coo)
+    >>> x.to_sparse(layout=torch.sparse_bsr, blocksize=(1, 2))
+    tensor(crow_indices=tensor([0, 1, 1, 2]),
+           col_indices=tensor([0, 0]),
+           values=tensor([[[1, 0]],
+                          [[2, 3]]]), size=(3, 2), nnz=2, layout=torch.sparse_bsr)
+    >>> x.to_sparse(layout=torch.sparse_bsr, blocksize=(2, 1))
+    RuntimeError: Tensor size(-2) 3 needs to be divisible by blocksize[0] 2
+    >>> x.to_sparse(layout=torch.sparse_csr, blocksize=(3, 1))
+    RuntimeError: to_sparse for Strided to SparseCsr conversion does not use specified blocksize
+
+    >>> x = torch.tensor([[[1], [0]], [[0], [0]], [[2], [3]]])
+    >>> x.to_sparse(layout=torch.sparse_csr, dense_dim=1)
+    tensor(crow_indices=tensor([0, 1, 1, 3]),
+           col_indices=tensor([0, 0, 1]),
+           values=tensor([[1],
+                          [2],
+                          [3]]), size=(3, 2, 1), nnz=3, layout=torch.sparse_csr)
+
+""",
+)
+
+add_docstr_all(
+    "to_sparse_csr",
+    r"""
+to_sparse_csr(dense_dim=None) -> Tensor
+
+Convert a tensor to compressed row storage format (CSR).  Except for
+strided tensors, only works with 2D tensors.  If the :attr:`self` is
+strided, then the number of dense dimensions could be specified, and a
+hybrid CSR tensor will be created, with `dense_dim` dense dimensions
+and `self.dim() - 2 - dense_dim` batch dimension.
+
+Args:
+
+    dense_dim (int, optional): Number of dense dimensions of the
+      resulting CSR tensor.  This argument should be used only if
+      :attr:`self` is a strided tensor, and must be a value between 0
+      and dimension of :attr:`self` tensor minus two.
+
+Example::
+
+    >>> dense = torch.randn(5, 5)
+    >>> sparse = dense.to_sparse_csr()
+    >>> sparse._nnz()
+    25
+
+    >>> dense = torch.zeros(3, 3, 1, 1)
+    >>> dense[0, 0] = dense[1, 2] = dense[2, 1] = 1
+    >>> dense.to_sparse_csr(dense_dim=2)
+    tensor(crow_indices=tensor([0, 1, 2, 3]),
+           col_indices=tensor([0, 2, 1]),
+           values=tensor([[[1.]],
+
+                          [[1.]],
+
+                          [[1.]]]), size=(3, 3, 1, 1), nnz=3,
+           layout=torch.sparse_csr)
+
+""",
+)
+
+add_docstr_all(
+    "to_sparse_csc",
+    r"""
+to_sparse_csc(dense_dim=None) -> Tensor
+
+Convert a tensor to compressed column storage (CSC) format.  Except
+for strided tensors, only works with 2D tensors.  If the :attr:`self`
+is strided, then the number of dense dimensions could be specified,
+and a hybrid CSC tensor will be created, with `dense_dim` dense
+dimensions and `self.dim() - 2 - dense_dim` batch dimension.
+
+Args:
+
+    dense_dim (int, optional): Number of dense dimensions of the
+      resulting CSC tensor.  This argument should be used only if
+      :attr:`self` is a strided tensor, and must be a value between 0
+      and dimension of :attr:`self` tensor minus two.
+
+Example::
+
+    >>> dense = torch.randn(5, 5)
+    >>> sparse = dense.to_sparse_csc()
+    >>> sparse._nnz()
+    25
+
+    >>> dense = torch.zeros(3, 3, 1, 1)
+    >>> dense[0, 0] = dense[1, 2] = dense[2, 1] = 1
+    >>> dense.to_sparse_csc(dense_dim=2)
+    tensor(ccol_indices=tensor([0, 1, 2, 3]),
+           row_indices=tensor([0, 2, 1]),
+           values=tensor([[[1.]],
+
+                          [[1.]],
+
+                          [[1.]]]), size=(3, 3, 1, 1), nnz=3,
+           layout=torch.sparse_csc)
+
+""",
+)
+
+add_docstr_all(
+    "to_sparse_bsr",
+    r"""
+to_sparse_bsr(blocksize, dense_dim=None) -> Tensor
+
+Convert a tensor to a block sparse row (BSR) storage format of given
+blocksize.  If the :attr:`self` is strided, then the number of dense
+dimensions could be specified, and a hybrid BSR tensor will be
+created, with `dense_dim` dense dimensions and `self.dim() - 2 -
+dense_dim` batch dimension.
+
+Args:
+
+    blocksize (list, tuple, :class:`torch.Size`, optional): Block size
+      of the resulting BSR tensor. A block size must be a tuple of
+      length two such that its items evenly divide the two sparse
+      dimensions.
+
+    dense_dim (int, optional): Number of dense dimensions of the
+      resulting BSR tensor.  This argument should be used only if
+      :attr:`self` is a strided tensor, and must be a value between 0
+      and dimension of :attr:`self` tensor minus two.
+
+Example::
+
+    >>> dense = torch.randn(10, 10)
+    >>> sparse = dense.to_sparse_csr()
+    >>> sparse_bsr = sparse.to_sparse_bsr((5, 5))
+    >>> sparse_bsr.col_indices()
+    tensor([0, 1, 0, 1])
+
+    >>> dense = torch.zeros(4, 3, 1)
+    >>> dense[0:2, 0] = dense[0:2, 2] = dense[2:4, 1] = 1
+    >>> dense.to_sparse_bsr((2, 1), 1)
+    tensor(crow_indices=tensor([0, 2, 3]),
+           col_indices=tensor([0, 2, 1]),
+           values=tensor([[[[1.]],
+
+                           [[1.]]],
+
+
+                          [[[1.]],
+
+                           [[1.]]],
+
+
+                          [[[1.]],
+
+                           [[1.]]]]), size=(4, 3, 1), nnz=3,
+           layout=torch.sparse_bsr)
+
+""",
+)
+
+add_docstr_all(
+    "to_sparse_bsc",
+    r"""
+to_sparse_bsc(blocksize, dense_dim=None) -> Tensor
+
+Convert a tensor to a block sparse column (BSC) storage format of
+given blocksize.  If the :attr:`self` is strided, then the number of
+dense dimensions could be specified, and a hybrid BSC tensor will be
+created, with `dense_dim` dense dimensions and `self.dim() - 2 -
+dense_dim` batch dimension.
+
+Args:
+
+    blocksize (list, tuple, :class:`torch.Size`, optional): Block size
+      of the resulting BSC tensor. A block size must be a tuple of
+      length two such that its items evenly divide the two sparse
+      dimensions.
+
+    dense_dim (int, optional): Number of dense dimensions of the
+      resulting BSC tensor.  This argument should be used only if
+      :attr:`self` is a strided tensor, and must be a value between 0
+      and dimension of :attr:`self` tensor minus two.
+
+Example::
+
+    >>> dense = torch.randn(10, 10)
+    >>> sparse = dense.to_sparse_csr()
+    >>> sparse_bsc = sparse.to_sparse_bsc((5, 5))
+    >>> sparse_bsc.row_indices()
+    tensor([0, 1, 0, 1])
+
+    >>> dense = torch.zeros(4, 3, 1)
+    >>> dense[0:2, 0] = dense[0:2, 2] = dense[2:4, 1] = 1
+    >>> dense.to_sparse_bsc((2, 1), 1)
+    tensor(ccol_indices=tensor([0, 1, 2, 3]),
+           row_indices=tensor([0, 1, 0]),
+           values=tensor([[[[1.]],
+
+                           [[1.]]],
+
+
+                          [[[1.]],
+
+                           [[1.]]],
+
+
+                          [[[1.]],
+
+                           [[1.]]]]), size=(4, 3, 1), nnz=3,
+           layout=torch.sparse_bsc)
+
+""",
+)
+
+add_docstr_all(
+    "to_mkldnn",
+    r"""
+to_mkldnn() -> Tensor
+Returns a copy of the tensor in ``torch.mkldnn`` layout.
+
+""",
+)
+
+add_docstr_all(
+    "trace",
+    r"""
+trace() -> Tensor
+
+See :func:`torch.trace`
+""",
+)
+
+add_docstr_all(
+    "transpose",
+    r"""
+transpose(dim0, dim1) -> Tensor
+
+See :func:`torch.transpose`
+""",
+)
+
+add_docstr_all(
+    "transpose_",
+    r"""
+transpose_(dim0, dim1) -> Tensor
+
+In-place version of :meth:`~Tensor.transpose`
+""",
+)
+
+add_docstr_all(
+    "triangular_solve",
+    r"""
+triangular_solve(A, upper=True, transpose=False, unitriangular=False) -> (Tensor, Tensor)
+
+See :func:`torch.triangular_solve`
+""",
+)
+
+add_docstr_all(
+    "tril",
+    r"""
+tril(diagonal=0) -> Tensor
+
+See :func:`torch.tril`
+""",
+)
+
+add_docstr_all(
+    "tril_",
+    r"""
+tril_(diagonal=0) -> Tensor
+
+In-place version of :meth:`~Tensor.tril`
+""",
+)
+
+add_docstr_all(
+    "triu",
+    r"""
+triu(diagonal=0) -> Tensor
+
+See :func:`torch.triu`
+""",
+)
+
+add_docstr_all(
+    "triu_",
+    r"""
+triu_(diagonal=0) -> Tensor
+
+In-place version of :meth:`~Tensor.triu`
+""",
+)
+
+add_docstr_all(
+    "true_divide",
+    r"""
+true_divide(value) -> Tensor
+
+See :func:`torch.true_divide`
+""",
+)
+
+add_docstr_all(
+    "true_divide_",
+    r"""
+true_divide_(value) -> Tensor
+
+In-place version of :meth:`~Tensor.true_divide`
+""",
+)
+
+add_docstr_all(
+    "trunc",
+    r"""
+trunc() -> Tensor
+
+See :func:`torch.trunc`
+""",
+)
+
+add_docstr_all(
+    "fix",
+    r"""
+fix() -> Tensor
+
+See :func:`torch.fix`.
+""",
+)
+
+add_docstr_all(
+    "trunc_",
+    r"""
+trunc_() -> Tensor
+
+In-place version of :meth:`~Tensor.trunc`
+""",
+)
+
+add_docstr_all(
+    "fix_",
+    r"""
+fix_() -> Tensor
+
+In-place version of :meth:`~Tensor.fix`
+""",
+)
+
+add_docstr_all(
+    "type",
+    r"""
+type(dtype=None, non_blocking=False, **kwargs) -> str or Tensor
+Returns the type if `dtype` is not provided, else casts this object to
+the specified type.
+
+If this is already of the correct type, no copy is performed and the
+original object is returned.
+
+Args:
+    dtype (dtype or string): The desired type
+    non_blocking (bool): If ``True``, and the source is in pinned memory
+        and destination is on the GPU or vice versa, the copy is performed
+        asynchronously with respect to the host. Otherwise, the argument
+        has no effect.
+    **kwargs: For compatibility, may contain the key ``async`` in place of
+        the ``non_blocking`` argument. The ``async`` arg is deprecated.
+""",
+)
+
+add_docstr_all(
+    "type_as",
+    r"""
+type_as(tensor) -> Tensor
+
+Returns this tensor cast to the type of the given tensor.
+
+This is a no-op if the tensor is already of the correct type. This is
+equivalent to ``self.type(tensor.type())``
+
+Args:
+    tensor (Tensor): the tensor which has the desired type
+""",
+)
+
+add_docstr_all(
+    "unfold",
+    r"""
+unfold(dimension, size, step) -> Tensor
+
+Returns a view of the original tensor which contains all slices of size :attr:`size` from
+:attr:`self` tensor in the dimension :attr:`dimension`.
+
+Step between two slices is given by :attr:`step`.
+
+If `sizedim` is the size of dimension :attr:`dimension` for :attr:`self`, the size of
+dimension :attr:`dimension` in the returned tensor will be
+`(sizedim - size) / step + 1`.
+
+An additional dimension of size :attr:`size` is appended in the returned tensor.
+
+Args:
+    dimension (int): dimension in which unfolding happens
+    size (int): the size of each slice that is unfolded
+    step (int): the step between each slice
+
+Example::
+
+    >>> x = torch.arange(1., 8)
+    >>> x
+    tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.])
+    >>> x.unfold(0, 2, 1)
+    tensor([[ 1.,  2.],
+            [ 2.,  3.],
+            [ 3.,  4.],
+            [ 4.,  5.],
+            [ 5.,  6.],
+            [ 6.,  7.]])
+    >>> x.unfold(0, 2, 2)
+    tensor([[ 1.,  2.],
+            [ 3.,  4.],
+            [ 5.,  6.]])
+""",
+)
+
+add_docstr_all(
+    "uniform_",
+    r"""
+uniform_(from=0, to=1, *, generator=None) -> Tensor
+
+Fills :attr:`self` tensor with numbers sampled from the continuous uniform
+distribution:
+
+.. math::
+    f(x) = \dfrac{1}{\text{to} - \text{from}}
+""",
+)
+
+add_docstr_all(
+    "unsqueeze",
+    r"""
+unsqueeze(dim) -> Tensor
+
+See :func:`torch.unsqueeze`
+""",
+)
+
+add_docstr_all(
+    "unsqueeze_",
+    r"""
+unsqueeze_(dim) -> Tensor
+
+In-place version of :meth:`~Tensor.unsqueeze`
+""",
+)
+
+add_docstr_all(
+    "var",
+    r"""
+var(dim=None, *, correction=1, keepdim=False) -> Tensor
+
+See :func:`torch.var`
+""",
+)
+
+add_docstr_all(
+    "vdot",
+    r"""
+vdot(other) -> Tensor
+
+See :func:`torch.vdot`
+""",
+)
+
+add_docstr_all(
+    "view",
+    r"""
+view(*shape) -> Tensor
+
+Returns a new tensor with the same data as the :attr:`self` tensor but of a
+different :attr:`shape`.
+
+The returned tensor shares the same data and must have the same number
+of elements, but may have a different size. For a tensor to be viewed, the new
+view size must be compatible with its original size and stride, i.e., each new
+view dimension must either be a subspace of an original dimension, or only span
+across original dimensions :math:`d, d+1, \dots, d+k` that satisfy the following
+contiguity-like condition that :math:`\forall i = d, \dots, d+k-1`,
+
+.. math::
+
+  \text{stride}[i] = \text{stride}[i+1] \times \text{size}[i+1]
+
+Otherwise, it will not be possible to view :attr:`self` tensor as :attr:`shape`
+without copying it (e.g., via :meth:`contiguous`). When it is unclear whether a
+:meth:`view` can be performed, it is advisable to use :meth:`reshape`, which
+returns a view if the shapes are compatible, and copies (equivalent to calling
+:meth:`contiguous`) otherwise.
+
+Args:
+    shape (torch.Size or int...): the desired size
+
+Example::
+
+    >>> x = torch.randn(4, 4)
+    >>> x.size()
+    torch.Size([4, 4])
+    >>> y = x.view(16)
+    >>> y.size()
+    torch.Size([16])
+    >>> z = x.view(-1, 8)  # the size -1 is inferred from other dimensions
+    >>> z.size()
+    torch.Size([2, 8])
+
+    >>> a = torch.randn(1, 2, 3, 4)
+    >>> a.size()
+    torch.Size([1, 2, 3, 4])
+    >>> b = a.transpose(1, 2)  # Swaps 2nd and 3rd dimension
+    >>> b.size()
+    torch.Size([1, 3, 2, 4])
+    >>> c = a.view(1, 3, 2, 4)  # Does not change tensor layout in memory
+    >>> c.size()
+    torch.Size([1, 3, 2, 4])
+    >>> torch.equal(b, c)
+    False
+
+
+.. method:: view(dtype) -> Tensor
+   :noindex:
+
+Returns a new tensor with the same data as the :attr:`self` tensor but of a
+different :attr:`dtype`.
+
+If the element size of :attr:`dtype` is different than that of ``self.dtype``,
+then the size of the last dimension of the output will be scaled
+proportionally.  For instance, if :attr:`dtype` element size is twice that of
+``self.dtype``, then each pair of elements in the last dimension of
+:attr:`self` will be combined, and the size of the last dimension of the output
+will be half that of :attr:`self`. If :attr:`dtype` element size is half that
+of ``self.dtype``, then each element in the last dimension of :attr:`self` will
+be split in two, and the size of the last dimension of the output will be
+double that of :attr:`self`. For this to be possible, the following conditions
+must be true:
+
+    * ``self.dim()`` must be greater than 0.
+    * ``self.stride(-1)`` must be 1.
+
+Additionally, if the element size of :attr:`dtype` is greater than that of
+``self.dtype``, the following conditions must be true as well:
+
+    * ``self.size(-1)`` must be divisible by the ratio between the element
+      sizes of the dtypes.
+    * ``self.storage_offset()`` must be divisible by the ratio between the
+      element sizes of the dtypes.
+    * The strides of all dimensions, except the last dimension, must be
+      divisible by the ratio between the element sizes of the dtypes.
+
+If any of the above conditions are not met, an error is thrown.
+
+.. warning::
+
+    This overload is not supported by TorchScript, and using it in a TorchScript
+    program will cause undefined behavior.
+
+
+Args:
+    dtype (:class:`torch.dtype`): the desired dtype
+
+Example::
+
+    >>> x = torch.randn(4, 4)
+    >>> x
+    tensor([[ 0.9482, -0.0310,  1.4999, -0.5316],
+            [-0.1520,  0.7472,  0.5617, -0.8649],
+            [-2.4724, -0.0334, -0.2976, -0.8499],
+            [-0.2109,  1.9913, -0.9607, -0.6123]])
+    >>> x.dtype
+    torch.float32
+
+    >>> y = x.view(torch.int32)
+    >>> y
+    tensor([[ 1064483442, -1124191867,  1069546515, -1089989247],
+            [-1105482831,  1061112040,  1057999968, -1084397505],
+            [-1071760287, -1123489973, -1097310419, -1084649136],
+            [-1101533110,  1073668768, -1082790149, -1088634448]],
+        dtype=torch.int32)
+    >>> y[0, 0] = 1000000000
+    >>> x
+    tensor([[ 0.0047, -0.0310,  1.4999, -0.5316],
+            [-0.1520,  0.7472,  0.5617, -0.8649],
+            [-2.4724, -0.0334, -0.2976, -0.8499],
+            [-0.2109,  1.9913, -0.9607, -0.6123]])
+
+    >>> x.view(torch.cfloat)
+    tensor([[ 0.0047-0.0310j,  1.4999-0.5316j],
+            [-0.1520+0.7472j,  0.5617-0.8649j],
+            [-2.4724-0.0334j, -0.2976-0.8499j],
+            [-0.2109+1.9913j, -0.9607-0.6123j]])
+    >>> x.view(torch.cfloat).size()
+    torch.Size([4, 2])
+
+    >>> x.view(torch.uint8)
+    tensor([[  0, 202, 154,  59, 182, 243, 253, 188, 185, 252, 191,  63, 240,  22,
+               8, 191],
+            [227, 165,  27, 190, 128,  72,  63,  63, 146, 203,  15,  63,  22, 106,
+              93, 191],
+            [205,  59,  30, 192, 112, 206,   8, 189,   7,  95, 152, 190,  12, 147,
+              89, 191],
+            [ 43, 246,  87, 190, 235, 226, 254,  63, 111, 240, 117, 191, 177, 191,
+              28, 191]], dtype=torch.uint8)
+    >>> x.view(torch.uint8).size()
+    torch.Size([4, 16])
+""",
+)
+
+add_docstr_all(
+    "view_as",
+    r"""
+view_as(other) -> Tensor
+
+View this tensor as the same size as :attr:`other`.
+``self.view_as(other)`` is equivalent to ``self.view(other.size())``.
+
+Please see :meth:`~Tensor.view` for more information about ``view``.
+
+Args:
+    other (:class:`torch.Tensor`): The result tensor has the same size
+        as :attr:`other`.
+""",
+)
+
+add_docstr_all(
+    "expand",
+    r"""
+expand(*sizes) -> Tensor
+
+Returns a new view of the :attr:`self` tensor with singleton dimensions expanded
+to a larger size.
+
+Passing -1 as the size for a dimension means not changing the size of
+that dimension.
+
+A tensor can also be expanded to a larger number of dimensions, and the
+new ones will be added at the front. For the new dimensions, the
+size cannot be set to -1.
+
+Expanding a tensor does not allocate new memory, but only creates a
+new view on the existing tensor where a dimension of size one is
+expanded to a larger size by setting the ``stride`` to 0. Any dimension
+of size 1 can be expanded to an arbitrary value without allocating new
+memory.
+
+Args:
+    *sizes (torch.Size or int...): the desired expanded size
+
+.. warning::
+
+    More than one element of an expanded tensor may refer to a single
+    memory location. As a result, in-place operations (especially ones that
+    are vectorized) may result in incorrect behavior. If you need to write
+    to the tensors, please clone them first.
+
+Example::
+
+    >>> x = torch.tensor([[1], [2], [3]])
+    >>> x.size()
+    torch.Size([3, 1])
+    >>> x.expand(3, 4)
+    tensor([[ 1,  1,  1,  1],
+            [ 2,  2,  2,  2],
+            [ 3,  3,  3,  3]])
+    >>> x.expand(-1, 4)   # -1 means not changing the size of that dimension
+    tensor([[ 1,  1,  1,  1],
+            [ 2,  2,  2,  2],
+            [ 3,  3,  3,  3]])
+""",
+)
+
+add_docstr_all(
+    "expand_as",
+    r"""
+expand_as(other) -> Tensor
+
+Expand this tensor to the same size as :attr:`other`.
+``self.expand_as(other)`` is equivalent to ``self.expand(other.size())``.
+
+Please see :meth:`~Tensor.expand` for more information about ``expand``.
+
+Args:
+    other (:class:`torch.Tensor`): The result tensor has the same size
+        as :attr:`other`.
+""",
+)
+
+add_docstr_all(
+    "sum_to_size",
+    r"""
+sum_to_size(*size) -> Tensor
+
+Sum ``this`` tensor to :attr:`size`.
+:attr:`size` must be broadcastable to ``this`` tensor size.
+
+Args:
+    size (int...): a sequence of integers defining the shape of the output tensor.
+""",
+)
+
+
+add_docstr_all(
+    "zero_",
+    r"""
+zero_() -> Tensor
+
+Fills :attr:`self` tensor with zeros.
+""",
+)
+
+add_docstr_all(
+    "matmul",
+    r"""
+matmul(tensor2) -> Tensor
+
+See :func:`torch.matmul`
+""",
+)
+
+add_docstr_all(
+    "chunk",
+    r"""
+chunk(chunks, dim=0) -> List of Tensors
+
+See :func:`torch.chunk`
+""",
+)
+
+add_docstr_all(
+    "unsafe_chunk",
+    r"""
+unsafe_chunk(chunks, dim=0) -> List of Tensors
+
+See :func:`torch.unsafe_chunk`
+""",
+)
+
+add_docstr_all(
+    "unsafe_split",
+    r"""
+unsafe_split(split_size, dim=0) -> List of Tensors
+
+See :func:`torch.unsafe_split`
+""",
+)
+
+add_docstr_all(
+    "tensor_split",
+    r"""
+tensor_split(indices_or_sections, dim=0) -> List of Tensors
+
+See :func:`torch.tensor_split`
+""",
+)
+
+add_docstr_all(
+    "hsplit",
+    r"""
+hsplit(split_size_or_sections) -> List of Tensors
+
+See :func:`torch.hsplit`
+""",
+)
+
+add_docstr_all(
+    "vsplit",
+    r"""
+vsplit(split_size_or_sections) -> List of Tensors
+
+See :func:`torch.vsplit`
+""",
+)
+
+add_docstr_all(
+    "dsplit",
+    r"""
+dsplit(split_size_or_sections) -> List of Tensors
+
+See :func:`torch.dsplit`
+""",
+)
+
+add_docstr_all(
+    "stft",
+    r"""
+stft(frame_length, hop, fft_size=None, return_onesided=True, window=None, pad_end=0) -> Tensor
+
+See :func:`torch.stft`
+""",
+)
+
+add_docstr_all(
+    "istft",
+    r"""
+istft(n_fft, hop_length=None, win_length=None, window=None,
+ center=True, normalized=False, onesided=True, length=None) -> Tensor
+
+See :func:`torch.istft`
+""",
+)
+
+add_docstr_all(
+    "det",
+    r"""
+det() -> Tensor
+
+See :func:`torch.det`
+""",
+)
+
+add_docstr_all(
+    "where",
+    r"""
+where(condition, y) -> Tensor
+
+``self.where(condition, y)`` is equivalent to ``torch.where(condition, self, y)``.
+See :func:`torch.where`
+""",
+)
+
+add_docstr_all(
+    "logdet",
+    r"""
+logdet() -> Tensor
+
+See :func:`torch.logdet`
+""",
+)
+
+add_docstr_all(
+    "slogdet",
+    r"""
+slogdet() -> (Tensor, Tensor)
+
+See :func:`torch.slogdet`
+""",
+)
+
+add_docstr_all(
+    "unbind",
+    r"""
+unbind(dim=0) -> seq
+
+See :func:`torch.unbind`
+""",
+)
+
+add_docstr_all(
+    "pin_memory",
+    r"""
+pin_memory() -> Tensor
+
+Copies the tensor to pinned memory, if it's not already pinned.
+""",
+)
+
+add_docstr_all(
+    "pinverse",
+    r"""
+pinverse() -> Tensor
+
+See :func:`torch.pinverse`
+""",
+)
+
+add_docstr_all(
+    "index_add",
+    r"""
+index_add(dim, index, source, *, alpha=1) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.index_add_`.
+""",
+)
+
+add_docstr_all(
+    "index_copy",
+    r"""
+index_copy(dim, index, tensor2) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.index_copy_`.
+""",
+)
+
+add_docstr_all(
+    "index_fill",
+    r"""
+index_fill(dim, index, value) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.index_fill_`.
+""",
+)
+
+add_docstr_all(
+    "scatter",
+    r"""
+scatter(dim, index, src) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.scatter_`
+""",
+)
+
+add_docstr_all(
+    "scatter_add",
+    r"""
+scatter_add(dim, index, src) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.scatter_add_`
+""",
+)
+
+add_docstr_all(
+    "scatter_reduce",
+    r"""
+scatter_reduce(dim, index, src, reduce, *, include_self=True) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.scatter_reduce_`
+""",
+)
+
+add_docstr_all(
+    "masked_scatter",
+    r"""
+masked_scatter(mask, tensor) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.masked_scatter_`
+
+.. note::
+
+    The inputs :attr:`self` and :attr:`mask`
+    :ref:`broadcast <broadcasting-semantics>`.
+
+Example:
+
+    >>> self = torch.tensor([0, 0, 0, 0, 0])
+    >>> mask = torch.tensor([[0, 0, 0, 1, 1], [1, 1, 0, 1, 1]])
+    >>> source = torch.tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])
+    >>> self.masked_scatter(mask, source)
+    tensor([[0, 0, 0, 0, 1],
+            [2, 3, 0, 4, 5]])
+
+""",
+)
+
+add_docstr_all(
+    "xlogy",
+    r"""
+xlogy(other) -> Tensor
+
+See :func:`torch.xlogy`
+""",
+)
+
+add_docstr_all(
+    "xlogy_",
+    r"""
+xlogy_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.xlogy`
+""",
+)
+
+add_docstr_all(
+    "masked_fill",
+    r"""
+masked_fill(mask, value) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.masked_fill_`
+""",
+)
+
+add_docstr_all(
+    "grad",
+    r"""
+This attribute is ``None`` by default and becomes a Tensor the first time a call to
+:func:`backward` computes gradients for ``self``.
+The attribute will then contain the gradients computed and future calls to
+:func:`backward` will accumulate (add) gradients into it.
+""",
+)
+
+add_docstr_all(
+    "retain_grad",
+    r"""
+retain_grad() -> None
+
+Enables this Tensor to have its :attr:`grad` populated during
+:func:`backward`. This is a no-op for leaf tensors.
+""",
+)
+
+add_docstr_all(
+    "retains_grad",
+    r"""
+Is ``True`` if this Tensor is non-leaf and its :attr:`grad` is enabled to be
+populated during :func:`backward`, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "requires_grad",
+    r"""
+Is ``True`` if gradients need to be computed for this Tensor, ``False`` otherwise.
+
+.. note::
+
+    The fact that gradients need to be computed for a Tensor does not mean that the :attr:`grad`
+    attribute will be populated, see :attr:`is_leaf` for more details.
+
+""",
+)
+
+add_docstr_all(
+    "is_leaf",
+    r"""
+All Tensors that have :attr:`requires_grad` which is ``False`` will be leaf Tensors by convention.
+
+For Tensors that have :attr:`requires_grad` which is ``True``, they will be leaf Tensors if they were
+created by the user. This means that they are not the result of an operation and so
+:attr:`grad_fn` is None.
+
+Only leaf Tensors will have their :attr:`grad` populated during a call to :func:`backward`.
+To get :attr:`grad` populated for non-leaf Tensors, you can use :func:`retain_grad`.
+
+Example::
+
+    >>> a = torch.rand(10, requires_grad=True)
+    >>> a.is_leaf
+    True
+    >>> b = torch.rand(10, requires_grad=True).cuda()
+    >>> b.is_leaf
+    False
+    # b was created by the operation that cast a cpu Tensor into a cuda Tensor
+    >>> c = torch.rand(10, requires_grad=True) + 2
+    >>> c.is_leaf
+    False
+    # c was created by the addition operation
+    >>> d = torch.rand(10).cuda()
+    >>> d.is_leaf
+    True
+    # d does not require gradients and so has no operation creating it (that is tracked by the autograd engine)
+    >>> e = torch.rand(10).cuda().requires_grad_()
+    >>> e.is_leaf
+    True
+    # e requires gradients and has no operations creating it
+    >>> f = torch.rand(10, requires_grad=True, device="cuda")
+    >>> f.is_leaf
+    True
+    # f requires grad, has no operation creating it
+
+
+""",
+)
+
+add_docstr_all(
+    "names",
+    r"""
+Stores names for each of this tensor's dimensions.
+
+``names[idx]`` corresponds to the name of tensor dimension ``idx``.
+Names are either a string if the dimension is named or ``None`` if the
+dimension is unnamed.
+
+Dimension names may contain characters or underscore. Furthermore, a dimension
+name must be a valid Python variable name (i.e., does not start with underscore).
+
+Tensors may not have two named dimensions with the same name.
+
+.. warning::
+    The named tensor API is experimental and subject to change.
+
+""",
+)
+
+add_docstr_all(
+    "is_cuda",
+    r"""
+Is ``True`` if the Tensor is stored on the GPU, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_cpu",
+    r"""
+Is ``True`` if the Tensor is stored on the CPU, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_xla",
+    r"""
+Is ``True`` if the Tensor is stored on an XLA device, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_ipu",
+    r"""
+Is ``True`` if the Tensor is stored on the IPU, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_xpu",
+    r"""
+Is ``True`` if the Tensor is stored on the XPU, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_quantized",
+    r"""
+Is ``True`` if the Tensor is quantized, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_meta",
+    r"""
+Is ``True`` if the Tensor is a meta tensor, ``False`` otherwise.  Meta tensors
+are like normal tensors, but they carry no data.
+""",
+)
+
+add_docstr_all(
+    "is_mps",
+    r"""
+Is ``True`` if the Tensor is stored on the MPS device, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_sparse",
+    r"""
+Is ``True`` if the Tensor uses sparse COO storage layout, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_sparse_csr",
+    r"""
+Is ``True`` if the Tensor uses sparse CSR storage layout, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "device",
+    r"""
+Is the :class:`torch.device` where this Tensor is.
+""",
+)
+
+add_docstr_all(
+    "ndim",
+    r"""
+Alias for :meth:`~Tensor.dim()`
+""",
+)
+
+add_docstr_all(
+    "itemsize",
+    r"""
+Alias for :meth:`~Tensor.element_size()`
+""",
+)
+
+add_docstr_all(
+    "nbytes",
+    r"""
+Returns the number of bytes consumed by the "view" of elements of the Tensor
+if the Tensor does not use sparse storage layout.
+Defined to be :meth:`~Tensor.numel()` * :meth:`~Tensor.element_size()`
+""",
+)
+
+add_docstr_all(
+    "T",
+    r"""
+Returns a view of this tensor with its dimensions reversed.
+
+If ``n`` is the number of dimensions in ``x``,
+``x.T`` is equivalent to ``x.permute(n-1, n-2, ..., 0)``.
+
+.. warning::
+    The use of :func:`Tensor.T` on tensors of dimension other than 2 to reverse their shape
+    is deprecated and it will throw an error in a future release. Consider :attr:`~.Tensor.mT`
+    to transpose batches of matrices or `x.permute(*torch.arange(x.ndim - 1, -1, -1))` to reverse
+    the dimensions of a tensor.
+""",
+)
+
+add_docstr_all(
+    "H",
+    r"""
+Returns a view of a matrix (2-D tensor) conjugated and transposed.
+
+``x.H`` is equivalent to ``x.transpose(0, 1).conj()`` for complex matrices and
+``x.transpose(0, 1)`` for real matrices.
+
+.. seealso::
+
+        :attr:`~.Tensor.mH`: An attribute that also works on batches of matrices.
+""",
+)
+
+add_docstr_all(
+    "mT",
+    r"""
+Returns a view of this tensor with the last two dimensions transposed.
+
+``x.mT`` is equivalent to ``x.transpose(-2, -1)``.
+""",
+)
+
+add_docstr_all(
+    "mH",
+    r"""
+Accessing this property is equivalent to calling :func:`adjoint`.
+""",
+)
+
+add_docstr_all(
+    "adjoint",
+    r"""
+adjoint() -> Tensor
+
+Alias for :func:`torch.adjoint`
+""",
+)
+
+add_docstr_all(
+    "real",
+    r"""
+Returns a new tensor containing real values of the :attr:`self` tensor for a complex-valued input tensor.
+The returned tensor and :attr:`self` share the same underlying storage.
+
+Returns :attr:`self` if :attr:`self` is a real-valued tensor.
+
+Example::
+    >>> x=torch.randn(4, dtype=torch.cfloat)
+    >>> x
+    tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)])
+    >>> x.real
+    tensor([ 0.3100, -0.5445, -1.6492, -0.0638])
+
+""",
+)
+
+add_docstr_all(
+    "imag",
+    r"""
+Returns a new tensor containing imaginary values of the :attr:`self` tensor.
+The returned tensor and :attr:`self` share the same underlying storage.
+
+.. warning::
+    :func:`imag` is only supported for tensors with complex dtypes.
+
+Example::
+    >>> x=torch.randn(4, dtype=torch.cfloat)
+    >>> x
+    tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)])
+    >>> x.imag
+    tensor([ 0.3553, -0.7896, -0.0633, -0.8119])
+
+""",
+)
+
+add_docstr_all(
+    "as_subclass",
+    r"""
+as_subclass(cls) -> Tensor
+
+Makes a ``cls`` instance with the same data pointer as ``self``. Changes
+in the output mirror changes in ``self``, and the output stays attached
+to the autograd graph. ``cls`` must be a subclass of ``Tensor``.
+""",
+)
+
+add_docstr_all(
+    "crow_indices",
+    r"""
+crow_indices() -> IntTensor
+
+Returns the tensor containing the compressed row indices of the :attr:`self`
+tensor when :attr:`self` is a sparse CSR tensor of layout ``sparse_csr``.
+The ``crow_indices`` tensor is strictly of shape (:attr:`self`.size(0) + 1)
+and of type ``int32`` or ``int64``. When using MKL routines such as sparse
+matrix multiplication, it is necessary to use ``int32`` indexing in order
+to avoid downcasting and potentially losing information.
+
+Example::
+    >>> csr = torch.eye(5,5).to_sparse_csr()
+    >>> csr.crow_indices()
+    tensor([0, 1, 2, 3, 4, 5], dtype=torch.int32)
+
+""",
+)
+
+add_docstr_all(
+    "col_indices",
+    r"""
+col_indices() -> IntTensor
+
+Returns the tensor containing the column indices of the :attr:`self`
+tensor when :attr:`self` is a sparse CSR tensor of layout ``sparse_csr``.
+The ``col_indices`` tensor is strictly of shape (:attr:`self`.nnz())
+and of type ``int32`` or ``int64``.  When using MKL routines such as sparse
+matrix multiplication, it is necessary to use ``int32`` indexing in order
+to avoid downcasting and potentially losing information.
+
+Example::
+    >>> csr = torch.eye(5,5).to_sparse_csr()
+    >>> csr.col_indices()
+    tensor([0, 1, 2, 3, 4], dtype=torch.int32)
+
+""",
+)
+
+add_docstr_all(
+    "to_padded_tensor",
+    r"""
+to_padded_tensor(padding, output_size=None) -> Tensor
+See :func:`to_padded_tensor`
+""",
+)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..409dc8e42c59f53e637fb2e0e1cd19ac9e9163ef
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_utils.py
@@ -0,0 +1,937 @@
+import copyreg
+import functools
+import sys
+import traceback
+import warnings
+from collections import defaultdict
+from typing import Any, DefaultDict, List, Optional
+
+import torch
+
+
+def _type(self, dtype=None, non_blocking=False, **kwargs):
+    """Returns the type if `dtype` is not provided, else casts this object to
+    the specified type.
+
+    If this is already of the correct type, no copy is performed and the
+    original object is returned.
+
+    Args:
+        dtype (type or string): The desired type
+        non_blocking (bool): If ``True``, and the source is in pinned memory
+            and destination is on the GPU or vice versa, the copy is performed
+            asynchronously with respect to the host. Otherwise, the argument
+            has no effect.
+        **kwargs: For compatibility, may contain the key ``async`` in place of
+            the ``non_blocking`` argument. The ``async`` arg is deprecated.
+    """
+    non_blocking = _get_async_or_non_blocking("type", non_blocking, kwargs)
+    if dtype is None:
+        return self.__module__ + "." + self.__class__.__name__
+
+    if isinstance(dtype, str):
+        dtype = _import_dotted_name(dtype)
+    if dtype == type(self):
+        return self
+    if self.is_sparse:
+        if not dtype.is_sparse:
+            raise RuntimeError("Cannot cast sparse tensor to dense tensor")
+        new_module_name = dtype.__module__.replace(".sparse", "")
+        new_values_type_name = new_module_name + "." + dtype.__name__
+        new_values = torch.Tensor._values(self).type(new_values_type_name, non_blocking)
+        new_indices_type_name = new_module_name + ".LongTensor"
+        new_indices = torch.Tensor._indices(self).type(
+            new_indices_type_name, non_blocking
+        )
+        return dtype(new_indices, new_values, self.size())
+    if dtype.is_sparse:
+        raise RuntimeError("Cannot cast dense tensor to sparse tensor")
+    return dtype(self.size()).copy_(self, non_blocking)
+
+
+def _hpu(self, device=None, non_blocking=False, **kwargs):
+    """Returns a copy of this object in HPU memory.
+
+    If this object is already in HPU memory and on the correct device, then
+    no copy is performed and the original object is returned.
+
+    Args:
+        device (int): The destination HPU id. Defaults to the current device.
+        non_blocking (bool): If ``True`` and the source is in pinned memory,
+            the copy will be asynchronous with respect to the host. Otherwise,
+            the argument has no effect.
+        **kwargs: For compatibility, may contain the key ``async`` in place of
+            the ``non_blocking`` argument.
+    """
+    non_blocking = _get_async_or_non_blocking("hpu", non_blocking, kwargs)
+    hpu = getattr(torch, "hpu", None)
+    assert hpu is not None, "HPU device module is not loaded"
+    if self.is_hpu:
+        if device is None:
+            device = hpu.current_device()
+        if self.get_device() == device:
+            return self
+    else:
+        if device is None:
+            device = -1
+    with hpu.device(device):
+        assert not self.is_sparse, "sparse storage is not supported for HPU tensors"
+        untyped_storage = torch.UntypedStorage(self.size(), device=torch.device("hpu"))
+        untyped_storage.copy_(self, non_blocking)
+        return untyped_storage
+
+
+def _cuda(self, device=None, non_blocking=False, **kwargs):
+    """Returns a copy of this object in CUDA memory.
+
+    If this object is already in CUDA memory and on the correct device, then
+    no copy is performed and the original object is returned.
+
+    Args:
+        device (int): The destination GPU id. Defaults to the current device.
+        non_blocking (bool): If ``True`` and the source is in pinned memory,
+            the copy will be asynchronous with respect to the host. Otherwise,
+            the argument has no effect.
+        **kwargs: For compatibility, may contain the key ``async`` in place of
+            the ``non_blocking`` argument.
+    """
+    non_blocking = _get_async_or_non_blocking("cuda", non_blocking, kwargs)
+    if self.is_cuda:
+        if device is None:
+            device = torch.cuda.current_device()
+        if self.get_device() == device:
+            return self
+    else:
+        if device is None:
+            device = -1
+    with torch.cuda.device(device):
+        if self.is_sparse:
+            new_type = getattr(torch.cuda.sparse, self.__class__.__name__)
+            indices = torch.Tensor._indices(self).cuda(device, non_blocking)
+            values = torch.Tensor._values(self).cuda(device, non_blocking)
+            return new_type(indices, values, self.size())
+        else:
+            untyped_storage = torch.UntypedStorage(
+                self.size(), device=torch.device("cuda")
+            )
+            untyped_storage.copy_(self, non_blocking)
+            return untyped_storage
+
+
+def _get_async_or_non_blocking(function_name, non_blocking, kwargs):
+    """Return the non-blocking flag given the function name and kwargs.
+
+    Args:
+        function_name (str): the name of the function being used.
+        non_blocking (bool): the default value.
+        **kwargs (dict): the kwargs passed to the function.
+    """
+    if not kwargs:
+        return non_blocking
+    if len(kwargs) != 1 or "async" not in kwargs:
+        message = "{}() got an unexpected keyword argument '{}'"
+        argument = list(kwargs.keys()).pop()
+        raise TypeError(message.format(function_name, argument))
+    warnings.warn("'async' is deprecated; use 'non_blocking'")
+    return kwargs["async"]
+
+
+# Note [Don't serialize hooks]
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Since time immemorial, we have serialized the backward hooks associated with
+# variables.  This kind of half-worked--Python can pickle global functions
+# (but not closures!)--but there were problems.
+#
+#   - It's fragile.  If you serialize a backward hook into a saved
+#     model, and then you rename the function associated with the hook,
+#     now your saved model is broken and you can't load it anymore.
+#
+#   - It's not actually used.  The standard recommendation is to
+#     serialize the *state_dict* of a model, not the model itself
+#     (since this is more stable to code changes affecting the model
+#     serialization), and the state dict saves "data" only, thus
+#     stripping the backward hooks.  In some cases, hooks are
+#     essential to the well-functioning of a model (e.g., DDP),
+#     but DDP already manages re-adding the hooks!
+#
+#   - We didn't serialize them in many cases.  Prior to #10220, we
+#     were dropping backward hooks in ForkingPickler.  We "fixed" this
+#     to be convenient with other serialization sites, but lack of
+#     serializing backward hooks wasn't actually the root cause of
+#     the bug.
+#
+# With these cases in mind, we have decided that a better strategy
+# is to just NOT serialize hooks at all.
+#
+# Since this is a BC-breaking change, we should warn when we previously
+# serialized a hook, but no longer do so. This will be done by adding a special
+# sentinel property to hooks that will be used to suppress this warning. If a hook
+# has the property _torch_serialize_ignore, we will not emit a warning if we
+# attempt to serialize a Tensor with this hook attached to it.
+#
+# By the way, when _backward_hooks is skipped, we must give an EMPTY
+# OrderedDict(), if you pass a None you'll run afoul of #12219.
+
+
+# TODO: Once we decide to break serialization FC, `storage` no longer needs to
+# be a TypedStorage
+def _rebuild_tensor(storage, storage_offset, size, stride):
+    # first construct a tensor with the correct dtype/device
+    t = torch.empty((0,), dtype=storage.dtype, device=storage._untyped_storage.device)
+    return t.set_(storage._untyped_storage, storage_offset, size, stride)
+
+
+def get_tensor_metadata(tensor):
+    # Tensor's Metadata for serializing.
+    # Currently, this only returns a dict[string, bool] specifying whether
+    # `conj` or `neg` bit is set.
+    assert isinstance(tensor, torch.Tensor)
+    return torch._C._get_tensor_metadata(tensor)  # type: ignore[attr-defined]
+
+
+def set_tensor_metadata(tensor, metadata):
+    # See `get_tensor_metadata` above
+    assert isinstance(metadata, dict)
+    assert isinstance(tensor, torch.Tensor)
+    torch._C._set_tensor_metadata(tensor, metadata)  # type: ignore[attr-defined]
+
+
+def _rebuild_tensor_v2(
+    storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None
+):
+    tensor = _rebuild_tensor(storage, storage_offset, size, stride)
+    tensor.requires_grad = requires_grad
+    if metadata:
+        set_tensor_metadata(tensor, metadata)
+
+    # NB: This line exists only for backwards compatibility; the
+    # general expectation is that backward_hooks is an empty
+    # OrderedDict.  See Note [Don't serialize hooks]
+    tensor._backward_hooks = backward_hooks
+    return tensor
+
+
+def _rebuild_tensor_v3(
+    storage,
+    storage_offset,
+    size,
+    stride,
+    requires_grad,
+    backward_hooks,
+    dtype,
+    metadata=None,
+):
+    t = torch.empty(
+        (0,),
+        dtype=dtype,
+        device=storage._untyped_storage.device,
+        requires_grad=requires_grad,
+    )
+    t.set_(storage._untyped_storage, storage_offset, size, stride)
+    if metadata:
+        set_tensor_metadata(t, metadata)
+    t._backward_hooks = backward_hooks
+    return t
+
+
+_sparse_tensors_to_validate: List["torch.Tensor"] = []
+
+
+# In _legacy_load() in serialization.py we unpickle storages after the sparse
+# tensors have been already unpickled. Those storages contain data necessary for
+# validating sparse tensors: indices and values. That's why sparse tensors are
+# first unpickled without any validation, and then this function is called just
+# before _legacy_load() returns, so that all the sparse tensors can be validated
+# in bulk.
+#
+# The same procedure must be followed by _load() in serialization.py because due
+# to Pickler semantics, we have to use the same (non-validating) function for
+# unpickling sparse tensors, regardless of the caller.
+def _validate_loaded_sparse_tensors():
+    try:
+        for t in _sparse_tensors_to_validate:
+            if t.layout is torch.sparse_coo:
+                torch._validate_sparse_coo_tensor_args(
+                    t._indices(), t._values(), t.size(), t.is_coalesced()
+                )
+            elif t.layout in {
+                torch.sparse_csr,
+                torch.sparse_csc,
+                torch.sparse_bsr,
+                torch.sparse_bsc,
+            }:
+                # TODO: Validation currently involves an expensive traversal
+                # on CPU, which may include a device transfer.
+                if t.layout in {torch.sparse_csr, torch.sparse_bsr}:
+                    compressed_indices, plain_indices = (
+                        t.crow_indices(),
+                        t.col_indices(),
+                    )
+                else:
+                    compressed_indices, plain_indices = (
+                        t.ccol_indices(),
+                        t.row_indices(),
+                    )
+                torch._validate_sparse_compressed_tensor_args(
+                    compressed_indices, plain_indices, t.values(), t.size(), t.layout
+                )
+            else:
+                raise NotImplementedError(
+                    f"_validate_loaded_sparse_tensors for layout `{t.layout}`"
+                )
+
+    finally:
+        _sparse_tensors_to_validate.clear()
+
+
+def _rebuild_sparse_tensor(layout, data):
+    """
+    Rebuilds a sparse tensor from its sparse storage representation.
+
+    Args:
+        layout (str): The sparse storage layout of the tensor.
+        data (tuple): The tensor's sparse storage representation.
+    """
+    if layout == torch.sparse_coo:
+        if len(data) == 3:
+            # For BC:
+            indices, values, size = data
+            is_coalesced = None
+        else:
+            indices, values, size, is_coalesced = data
+        result = torch.sparse_coo_tensor(
+            indices, values, size, check_invariants=False, is_coalesced=is_coalesced
+        )
+        _sparse_tensors_to_validate.append(result)
+        return result
+
+    elif layout in {
+        torch.sparse_csr,
+        torch.sparse_csc,
+        torch.sparse_bsr,
+        torch.sparse_bsc,
+    }:
+        compressed_indices, plain_indices, values, size = data
+        result = torch.sparse_compressed_tensor(
+            compressed_indices,
+            plain_indices,
+            values,
+            size,
+            layout=layout,
+            check_invariants=False,
+        )
+        _sparse_tensors_to_validate.append(result)
+        return result
+
+    raise NotImplementedError(f"rebuilding sparse tensor for layout {layout}")
+
+
+def _rebuild_nested_tensor(buffer, sizes, strides, storage_offsets):
+    return torch._nested_view_from_buffer(buffer, sizes, strides, storage_offsets)
+
+
+def _rebuild_device_tensor_from_numpy(data, dtype, device, requires_grad):
+    tensor = torch.from_numpy(data).to(dtype=dtype, device=device)
+    tensor.requires_grad = requires_grad
+    return tensor
+
+
+# Should not be used, only here to be able to load Tensors serialized with older versions of pytorch
+_rebuild_xla_tensor = _rebuild_device_tensor_from_numpy
+
+
+def _rebuild_meta_tensor_no_storage(dtype, size, stride, requires_grad):
+    return torch.empty_strided(
+        size, stride, dtype=dtype, device="meta", requires_grad=requires_grad
+    )
+
+
+def _rebuild_wrapper_subclass(
+    cls, dtype, size, stride, storage_offset, layout, device, requires_grad
+):
+    return torch.Tensor._make_wrapper_subclass(  # type: ignore[attr-defined]
+        cls,
+        size,
+        strides=stride,
+        storage_offset=storage_offset,
+        layout=layout,
+        device=device,
+        requires_grad=requires_grad,
+    )
+
+
+# TODO: Once we decide to break serialization FC, `storage` no longer needs to
+# be a TypedStorage
+def _rebuild_qtensor(
+    storage,
+    storage_offset,
+    size,
+    stride,
+    quantizer_params,
+    requires_grad,
+    backward_hooks,
+):
+    qscheme = quantizer_params[0]
+    if qscheme == torch.per_tensor_affine:
+        _, scale, zero_point = quantizer_params
+        tensor = torch._empty_affine_quantized(
+            size,
+            scale=scale,
+            zero_point=zero_point,
+            dtype=storage.dtype,
+            device=storage.device,
+        )
+    elif qscheme in (torch.per_channel_affine, torch.per_channel_affine_float_qparams):
+        _, scales, zero_points, axis = quantizer_params
+        if type(scales) is list and type(zero_points) is list:
+            if qscheme == torch.per_channel_affine:
+                scales = torch.tensor(scales, dtype=torch.double, device=storage.device)
+                zero_points = torch.tensor(
+                    zero_points, dtype=torch.long, device=storage.device
+                )
+            else:
+                scales = torch.tensor(scales, dtype=torch.float, device=storage.device)
+                zero_points = torch.tensor(
+                    zero_points, dtype=torch.float, device=storage.device
+                )
+        tensor = torch._empty_per_channel_affine_quantized(
+            size,
+            scales=scales,
+            zero_points=zero_points,
+            axis=axis,
+            dtype=storage.dtype,
+            device=storage.device,
+        )
+    else:
+        raise RuntimeError(f"Can't deserialize quantized tensor with qscheme {qscheme}")
+    tensor.set_(storage, storage_offset, size, stride)
+    tensor.requires_grad = requires_grad
+    # NB: This line exists only for backwards compatibility; the
+    # general expectation is that backward_hooks is an empty
+    # OrderedDict.  See Note [Don't serialize hooks]
+    tensor._backward_hooks = backward_hooks
+    return tensor
+
+
+def _rebuild_parameter(data, requires_grad, backward_hooks):
+    param = torch.nn.Parameter(data, requires_grad)
+    # NB: This line exists only for backwards compatibility; the
+    # general expectation is that backward_hooks is an empty
+    # OrderedDict.  See Note [Don't serialize hooks]
+    param._backward_hooks = backward_hooks
+
+    return param
+
+
+def _rebuild_parameter_with_state(data, requires_grad, backward_hooks, state):
+    param = torch.nn.Parameter(data, requires_grad)
+    # NB: This line exists only for backwards compatibility; the
+    # general expectation is that backward_hooks is an empty
+    # OrderedDict.  See Note [Don't serialize hooks]
+    param._backward_hooks = backward_hooks
+
+    # Restore state on Parameter like python attr.
+    param = _set_obj_state(param, state)
+    return param
+
+
+def _get_obj_state(obj):
+    # Get the state of the python subclass
+    # This loosely mimics the function on the object class but since Tensor does not inherit
+    # from it, we cannot call that function directly
+    # https://github.com/python/cpython/blob/c83919bd635f4433f1c6ae8504996a9fe3c215e5/Objects/typeobject.c#L4891
+    # Note that starting with Python 3.11, this `__getstate__` is always defined and thus
+    # the else branch will never be taken.
+    getstate_fn = getattr(obj, "__getstate__", None)
+    if getstate_fn:
+        state = getstate_fn()
+    else:
+        slots_to_save = copyreg._slotnames(obj.__class__)  # type: ignore[attr-defined]
+        if slots_to_save:
+            state = (
+                obj.__dict__,
+                {
+                    name: getattr(obj, name)
+                    for name in slots_to_save
+                    if hasattr(obj, name)
+                },
+            )
+        else:
+            state = obj.__dict__
+
+    return state
+
+
+def _set_obj_state(obj, state):
+    if isinstance(state, tuple):
+        if not len(state) == 2:
+            raise RuntimeError(f"Invalid serialized state: {state}")
+        dict_state = state[0]
+        slots_state = state[1]
+    else:
+        dict_state = state
+        slots_state = None
+
+    # Starting with Python 3.11, the __dict__ attribute is lazily created
+    # and is serialized as None when not needed.
+    if dict_state:
+        for k, v in dict_state.items():
+            setattr(obj, k, v)
+
+    if slots_state:
+        for k, v in slots_state.items():
+            setattr(obj, k, v)
+    return obj
+
+
+def _import_dotted_name(name):
+    components = name.split(".")
+    obj = __import__(components[0])
+    for component in components[1:]:
+        obj = getattr(obj, component)
+    return obj
+
+
+def _flatten_dense_tensors(tensors):
+    """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
+    same dense type.
+
+    Since inputs are dense, the resulting tensor will be a concatenated 1D
+    buffer. Element-wise operation on this buffer will be equivalent to
+    operating individually.
+
+    Args:
+        tensors (Iterable[Tensor]): dense tensors to flatten.
+
+    Returns:
+        A contiguous 1D buffer containing input tensors.
+    """
+    return torch._C._nn.flatten_dense_tensors(tensors)
+
+
+def _flatten_sparse_tensors(tensors):
+    """Flatten sparse tensors into two contiguous 1D buffers, one of indices and
+    one of values. Assume tensors are of same sparse type.
+
+    Args:
+        tensors (Iterable[Tensor]): sparse tensors to flatten.
+
+    Returns:
+        A tuple of two contiguous 1D buffers, one containing input tensors'
+        indices and the other containing the values.
+    """
+    flat_indices = torch._C._nn.flatten_dense_tensors(
+        [torch.Tensor._indices(t) for t in tensors]
+    )
+    flat_values = torch._C._nn.flatten_dense_tensors(
+        [torch.Tensor._values(t) for t in tensors]
+    )
+    return flat_indices, flat_values
+
+
+def _unflatten_dense_tensors(flat, tensors):
+    """View a flat buffer using the sizes of tensors. Assume that tensors are of
+    same dense type, and that flat is given by _flatten_dense_tensors.
+
+    Args:
+        flat (Tensor): flattened dense tensors to unflatten.
+        tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
+          unflatten flat.
+
+    Returns:
+        Unflattened dense tensors with sizes same as tensors and values from
+        flat.
+    """
+    return torch._C._nn.unflatten_dense_tensors(flat, tensors)
+
+
+def _unflatten_sparse_tensors(flat, tensors):
+    """Rebuild sparse tensors from flat index/value buffers.
+
+    The buffers are expected to come from _flatten_sparse_tensors and the
+    tensors are assumed to be of the same sparse type.
+
+    Args:
+        flat (tuple(Tensor, Tensor)): flattened indices and values of sparse
+          tensors to unflatten.
+        tensors (Iterable[Tensor]): sparse tensors whose sizes will be used to
+          unflatten flat.
+
+    Returns:
+        Tuple of sparse tensors with sizes same as tensors and values from
+        flat.
+    """
+    index_buffer, value_buffer = flat
+    # Indices and values are unflattened separately with the dense routine.
+    unflatten = torch._C._nn.unflatten_dense_tensors
+    index_templates = [torch.Tensor._indices(ref) for ref in tensors]
+    per_tensor_indices = unflatten(index_buffer, index_templates)
+    value_templates = [torch.Tensor._values(ref) for ref in tensors]
+    per_tensor_values = unflatten(value_buffer, value_templates)
+    # Pair each reference tensor with its slice of indices and values.
+    triples = zip(tensors, per_tensor_indices, per_tensor_values)
+    return tuple(ref.new(idx, val, ref.size()) for ref, idx, val in triples)
+
+
+def _reorder_tensors_as(tensors, ordered_tensors):
+    """Rearrange tensors so that they follow the order of ordered_tensors.
+
+    Within each type, tensors must already be ordered like ordered_tensors.
+
+    Args:
+        tensors (Iterable[Tensor]): tensors to be reordered, e.g. the output of
+          _take_tensors. They should be of the same order as ordered_tensors
+          within their own types.
+        ordered_tensors (Iterable[Tensor]): tensors whose order is the reference.
+
+    Returns:
+        Ordered tuple of tensors with contents from tensors and order of
+        ordered_tensors.
+    """
+    by_type = defaultdict(list)
+    for candidate in tensors:
+        by_type[candidate.type()].append(candidate)
+    queues = {type_name: iter(group) for type_name, group in by_type.items()}
+    return tuple(next(queues[ref.type()]) for ref in ordered_tensors)
+
+
+def _take_tensors(tensors, size_limit):
+    """Lazily split tensors into per-type chunks bounded by a byte budget.
+
+    A tensor that would overflow the non-empty chunk of its type starts a new one.
+
+    Args:
+        tensors (Sequence): A sequence of tensors to be separated into chunks.
+        size_limit (int): The limit of each chunk in bytes.
+
+    Yields:
+        Blocks of tensors of same type and within size_limit. The yielded
+        tensors are only ordered as the original sequence within its types.
+    """
+    chunks: DefaultDict[str, List] = defaultdict(lambda: [[], 0])
+    for tensor in tensors:
+        type_name = tensor.type()
+        if not tensor.is_sparse:
+            nbytes = tensor.numel() * tensor.element_size()
+        else:
+            idx = torch.Tensor._indices(tensor)
+            val = torch.Tensor._values(tensor)
+            nbytes = idx.numel() * idx.element_size()
+            nbytes += val.numel() * val.element_size()
+        chunk = chunks[type_name]
+        if chunk[1] > 0 and chunk[1] + nbytes > size_limit:
+            yield chunk[0]
+            chunk = chunks[type_name] = [[], 0]
+        chunk[0].append(tensor)
+        chunk[1] += nbytes
+    # Flush whatever is still pending for each type.
+    for leftover, _ in chunks.values():
+        if leftover:
+            yield leftover
+
+
+# Decorator factory that attaches annotations to a function in a way that is
+# compatible with both Python 2 and 3 (no annotation syntax in the signature).
+def annotate(ret, **kwargs):
+    def dec(fun):
+        # A fresh dict per decorated function; "return" holds the return type.
+        fun.__annotations__ = {**kwargs, "return": ret}
+        return fun
+
+    return dec
+
+
+def render_call(fn, args, kwargs):
+    """Return a source-like string for the call fn(*args, **kwargs)."""
+    fn_name = torch.overrides.resolve_name(fn)
+    if fn_name is None:
+        fn_name = str(fn)
+
+    # Reprs are taken with threshold=0/edgeitems=0 tensor print options.
+    with torch._tensor_str.printoptions(threshold=0, edgeitems=0):
+        rendered: List[str] = [repr(arg) for arg in args]
+        rendered += [f"{key}={repr(val)}" for key, val in kwargs.items()]
+    return f"{fn_name}({', '.join(rendered)})"
+
+
+# NOTE [ Python Traceback Reference Cycle Problem ]
+#
+# When using sys.exc_info(), it is important to **not** store exc_info[2],
+# which is the traceback, because otherwise you will run into the traceback
+# reference cycle problem, i.e., the traceback holding a reference to the frame,
+# and the frame (which holds references to all the objects in its local scope)
+# holding a reference to the traceback.
+
+
+class KeyErrorMessage(str):
+    r"""str subclass whose repr() is the string itself, not a quoted and escaped form"""
+
+    def __repr__(self):
+        return self  # the instance is itself a str, so it serves as its own repr
+
+
+class ExceptionWrapper:
+    r"""Captures an exception type and formatted traceback to communicate across threads"""
+
+    def __init__(self, exc_info=None, where="in background"):
+        # Only the type and the formatted text are kept, never exc_info itself,
+        # see NOTE [ Python Traceback Reference Cycle Problem ]
+        exc_info = sys.exc_info() if exc_info is None else exc_info
+        self.exc_type = exc_info[0]
+        self.exc_msg = "".join(traceback.format_exception(*exc_info))
+        self.where = where
+
+    def reraise(self):
+        r"""Raises an exception of the wrapped type in the current thread"""
+        # The message reads e.g. "Caught ValueError in DataLoader worker
+        # process 2. Original Traceback:", followed by the traceback.
+        exc_type = self.exc_type
+        msg = f"Caught {exc_type.__name__} {self.where}.\nOriginal {self.exc_msg}"
+        if exc_type == KeyError:
+            # KeyError applies repr() to its argument (usually a dict key),
+            # which would make the traceback text unreadable. That will not
+            # change in Python (https://bugs.python.org/issue2651), so wrap it.
+            msg = KeyErrorMessage(msg)
+        elif getattr(exc_type, "message", None):
+            # Some exceptions take a non-str first argument but explicitly
+            # have a message field
+            raise exc_type(message=msg)
+        try:
+            exception = exc_type(msg)
+        except TypeError:
+            # E.g. the constructor takes multiple arguments; we don't know how
+            # to build it, so raise a RuntimeError carrying the same message
+            raise RuntimeError(msg) from None
+        raise exception
+
+
+def _get_available_device_type():
+    """Return the first available of "cuda", "xpu" or the privateuse1 backend name, else None."""
+    if torch.cuda.is_available():
+        return "cuda"
+    if hasattr(torch, "xpu") and torch.xpu.is_available():  # type: ignore[attr-defined]
+        return "xpu"
+    backend_name = torch._C._get_privateuse1_backend_name()
+    backend_module = getattr(torch, backend_name, None)
+    backend_usable = backend_module and backend_module.is_available()
+    # add more available device types here
+    return backend_name if backend_usable else None
+
+
+def _get_device_attr(get_member):
+    """Apply get_member to the torch module of the available device type, or return None."""
+    device_type = _get_available_device_type()
+    kind = device_type.lower() if device_type else None
+    if kind in ("cuda", "xpu"):
+        return get_member(getattr(torch, kind))
+    if device_type == torch._C._get_privateuse1_backend_name():
+        return get_member(getattr(torch, device_type))
+    # add more available device types here
+    return None
+
+
+def _get_current_device_index():
+    # current device index of the available device type (None if _get_device_attr finds none)
+    return _get_device_attr(lambda device_module: device_module.current_device())
+
+
+def _get_all_device_indices():
+    # list of indices 0 .. device_count() - 1 of the available device type
+    return _get_device_attr(lambda device_module: list(range(device_module.device_count())))
+
+
+def _get_devices_properties(device_ids):
+    # device properties for each requested id, in the order of device_ids
+    return [_get_device_attr(lambda mod: mod.get_device_properties(idx)) for idx in device_ids]
+
+
+def get_current_device_index() -> int:
+    r"""Returns the device index of the current default CUDA device.
+
+    Returns -1 when ``torch.cuda.device_count()`` reports no CUDA devices.
+    Takes no arguments.
+    """
+    if torch.cuda.device_count() > 0:
+        return torch.cuda.current_device()
+    return -1
+
+
+def _get_device_index(
+    device: Any, optional: bool = False, allow_cpu: bool = False
+) -> int:
+    r"""Gets the device index from :attr:`device`, which can be a torch.device
+    object, a string (converted with ``torch.device``), a Python integer, or
+    ``None``.
+
+    If :attr:`device` is a torch.device object, returns its index. A CPU device
+    yields ``-1`` when :attr:`allow_cpu` is ``True`` and raises ``ValueError``
+    otherwise.
+
+    If :attr:`device` is a Python integer, it is returned as is.
+
+    If no index was found (:attr:`device` is ``None`` or a torch.device without
+    an index), the current default device index of the supported runtime
+    platform is returned if :attr:`optional` is ``True``; otherwise
+    ``ValueError`` is raised.
+    """
+    if isinstance(device, str):
+        device = torch.device(device)
+    device_idx: Optional[int] = None
+    if isinstance(device, torch.device):
+        if not allow_cpu and device.type == "cpu":
+            raise ValueError(f"Expected a non cpu device, but got: {device}")
+        device_idx = -1 if device.type == "cpu" else device.index
+    if isinstance(device, int):
+        device_idx = device
+    if device_idx is None:
+        if optional:
+            # The eager API _get_current_device_index uses `lambda` functions which are
+            # not supported in JIT and hence not scriptable. The JIT equivalent API to get
+            # the current device index is `get_current_device_index()` which can
+            # be scripted. We use is_scripting to check the mode we are in and call the
+            # appropriate API.
+            if torch.jit.is_scripting():
+                device_idx = get_current_device_index()
+            else:
+                device_idx = _get_current_device_index()
+        else:
+            raise ValueError(
+                f"Expected a torch.device with a specified index or an integer, but got:{device}"
+            )
+    return device_idx
+
+
+def _handle_complex(tensor):
+    """
+    Returns a real view of a complex tensor; any other tensor is returned unchanged.
+    """
+    # UninitializedParameter (LazyModule) must be ruled out first, because
+    # checking is_complex on it is an error.
+    if isinstance(tensor, torch.nn.UninitializedParameter):
+        return tensor
+    if tensor.is_complex():
+        return torch.view_as_real(tensor)
+    return tensor
+
+
+def _element_size(dtype):
+    """
+    Returns the size in bytes of one element of the given torch.dtype.
+    Raises RuntimeError if dtype is not a torch.dtype.
+    """
+    if not isinstance(dtype, torch.dtype):
+        raise RuntimeError(f"expected torch.dtype, but got {type(dtype)}")
+    if dtype == torch.bool:
+        # NOTE: torch.bool is not supported in torch.iinfo()
+        return 1
+    if dtype.is_complex:
+        # >> 2 instead of >> 3: presumably finfo reports per-component bits here
+        return torch.finfo(dtype).bits >> 2
+    if dtype.is_floating_point:
+        return torch.finfo(dtype).bits >> 3
+    return torch.iinfo(dtype).bits >> 3
+
+
+class _ClassPropertyDescriptor:
+    def __init__(self, fget, fset=None):
+        self.fget = fget  # fset is accepted but not stored or used
+
+    def __get__(self, instance, owner=None):
+        # Resolve the wrapped getter through its own __get__ and call the result.
+        cls = type(instance) if owner is None else owner
+        return self.fget.__get__(instance, cls)()
+
+
+def classproperty(func):
+    # plain callables are wrapped as classmethods; class/static methods pass through as is
+    already_wrapped = isinstance(func, (classmethod, staticmethod))
+    return _ClassPropertyDescriptor(func if already_wrapped else classmethod(func))
+
+
+def is_compiling() -> bool:
+    """
+    Returns torch.compiler.is_compiling(), i.e. whether we are tracing/compiling with torch.compile() or torch.export().
+
+    TODO(khabinov): we should deprecate this function and use torch.compiler.is_compiling().
+    """
+    return torch.compiler.is_compiling()
+
+
+def _functionalize_sync(t):
+    # This code lives in python instead of C++ since conditioning on a certain python subclass
+    # is much more of a pain in C++.
+    from torch._subclasses.functional_tensor import FunctionalTensor
+
+    if isinstance(t, FunctionalTensor):
+        # If a FunctionalTensorMode is active while syncing, we don't want it to intercept any ops that get called
+        # when we sync our inner tensor.
+        # Why?
+        # (1) If there are input mutations in the graph, then they will be re-applied during
+        #     AOTAutograd when we call _sync() from inside of our functionalization kernels.
+        # (2) _sync() causes us to regenerate our updated tensor from the updated base,
+        #     which dispatches to a bunch of view ops
+        # (3) The input to these view ops is our inner FunctionalTensorWrapper
+        #     (since the sync was called from C++), not the python FunctionalTensor
+        # (4) if a python FunctionalTensorMode is active, it will complain when it intercepts
+        #     the view op, since it will see an input that is a C++ FunctionalTensorWrapper
+        #     (aka a normal torch.Tensor) instead of a python `FunctionalTensor`).
+        maybe_functional_mode = torch._C._unset_dispatch_mode(
+            torch._C._TorchDispatchModeKey.FUNCTIONAL
+        )
+        try:
+            torch._functionalize_sync(t.elem)  # type: ignore[attr-defined]
+        finally:
+            if maybe_functional_mode is not None:
+                torch._C._set_dispatch_mode(maybe_functional_mode)  # put back the mode unset above
+    else:
+        torch._functionalize_sync(t)  # type: ignore[attr-defined]
+
+
+@functools.lru_cache(2)
+def _get_device_module(device_type: str):
+    """Return the attribute ``torch.<device_type>``; raise RuntimeError if it is missing or None."""
+    module = getattr(torch, device_type, None)
+    if module is not None:
+        return module
+    message = f"Device '{device_type}' does not have a corresponding module registered as 'torch.{device_type}'."
+    raise RuntimeError(message)
+
+
+def _dummy_type(name: str) -> type:
+    """Create a placeholder class called name whose instantiation raises RuntimeError."""
+
+    def get_err_fn(is_init: bool):
+        def err_fn(obj, *args, **kwargs):
+            # as __init__ obj is an instance, as __new__ it is the class itself
+            owner = obj.__class__ if is_init else obj
+            raise RuntimeError(f"Tried to instantiate dummy base class {owner.__name__}")
+
+        return err_fn
+
+    # Both construction hooks raise.
+    namespace = {"__init__": get_err_fn(True), "__new__": get_err_fn(False)}
+    return type(name, (object,), namespace)
+
+
+class _LazySeedTracker:
+    # Seeding is memory-less, so only the most recent `manual_seed_all` and
+    # `manual_seed` requests are remembered. A `manual_seed` issued after
+    # `manual_seed_all` overwrites the seed on the current device, hence the
+    # relative order of the **latest** calls to the two APIs is tracked too.
+    def __init__(self):
+        self.manual_seed_all_cb = None
+        self.manual_seed_cb = None
+        self.call_order = []
+
+    def queue_seed_all(self, cb, traceback):
+        latest = (cb, traceback)
+        self.manual_seed_all_cb = latest
+        self.call_order = [self.manual_seed_cb, latest]  # seed_all is now the latest
+
+    def queue_seed(self, cb, traceback):
+        latest = (cb, traceback)
+        self.manual_seed_cb = latest
+        self.call_order = [self.manual_seed_all_cb, latest]  # seed is now the latest
+
+    def get_calls(self) -> List:
+        return self.call_order
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/torch_version.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/torch_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..f73a0b71c1a815be6b15d1972fed8350004d6721
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/torch_version.py
@@ -0,0 +1,58 @@
+# mypy: ignore-errors
+
+from typing import Any, Iterable
+from .version import __version__ as internal_version
+from ._vendor.packaging.version import Version, InvalidVersion
+
+__all__ = ['TorchVersion']
+
+
+class TorchVersion(str):
+    """A str subclass that also compares against Version objects and iterables.
+
+    Prior to 1.10.0 torch.__version__ was stored as a str and so many did
+    comparisons against torch.__version__ as if it were a str. In order to not
+    break them we have TorchVersion which masquerades as a str while also
+    having the ability to compare against both packaging.version.Version as
+    well as tuples of values, eg. (1, 2, 1)
+    Examples:
+        Comparing a TorchVersion object to a Version object
+            TorchVersion('1.10.0a') > Version('1.10.0a')
+        Comparing a TorchVersion object to a Tuple object
+            TorchVersion('1.10.0a') > (1, 2)    # 1.2
+            TorchVersion('1.10.0a') > (1, 2, 1) # 1.2.1
+        Comparing a TorchVersion object against a string
+            TorchVersion('1.10.0a') > '1.2'
+            TorchVersion('1.10.0a') > '1.2.1'
+    """
+    # fully qualified type names here to appease mypy
+    def _convert_to_version(self, inp: Any) -> Any:
+        """Coerce a Version, str or iterable of parts to a Version; anything else raises InvalidVersion."""
+        if isinstance(inp, Version):
+            return inp
+        if isinstance(inp, str):
+            return Version(inp)
+        if not isinstance(inp, Iterable):
+            raise InvalidVersion(inp)
+        # Ideally this should work for most cases by joining the parts of
+        # the version tuple, assuming the tuple looks (MAJOR, MINOR, ?PATCH)
+        # Examples:
+        #   * (1,)        -> Version("1")
+        #   * (1, 20)     -> Version("1.20")
+        #   * (1, 20, 1)  -> Version("1.20.1")
+        return Version('.'.join(str(item) for item in inp))
+
+    def _cmp_wrapper(self, cmp: Any, method: str) -> bool:
+        """Run the named rich comparison on Version objects, or on plain strs when InvalidVersion is raised."""
+        try:
+            return getattr(Version(self), method)(self._convert_to_version(cmp))
+        except InvalidVersion:
+            # Fall back to regular string comparison if dealing with an invalid
+            # version like 'parrot'
+            return getattr(super(), method)(cmp)
+
+
+# "__ne__" is wrapped too: str defines its own, which would otherwise contradict the Version-aware "__eq__".
+for cmp_method in ["__gt__", "__lt__", "__eq__", "__ne__", "__ge__", "__le__"]:
+    setattr(TorchVersion, cmp_method, lambda x, y, method=cmp_method: x._cmp_wrapper(y, method))
+__version__ = TorchVersion(internal_version)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/version.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3fa8b448f04f6bf16c695252e12dbc1bab046a1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/version.py
@@ -0,0 +1,8 @@
+from typing import Optional
+
+__all__ = ['__version__', 'debug', 'cuda', 'git_version', 'hip']
+__version__ = '2.3.0+cu118'  # version string carrying a "+cu118" local suffix
+debug = False  # NOTE(review): presumably marks a debug build -- confirm
+cuda: Optional[str] = '11.8'  # NOTE(review): presumably the CUDA version of this build (cf. "cu118") -- confirm
+git_version = '97ff6cfd9c86c5c09d7ce775ab64ec5c99230f5d'  # NOTE(review): presumably the source commit hash -- confirm
+hip: Optional[str] = None  # NOTE(review): presumably only set for HIP/ROCm builds -- confirm