diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cuda.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cuda.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48e0e51a793cd861c2346fa4682ef23554d6c3eb Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cuda.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f7c32fc9ef2d7970affae52c1e6723beb52e293 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/__pycache__/device_op_overrides.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/__pycache__/device_op_overrides.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4412a4d2ea0fc160a31f9d59ee0afebaff771bf4 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/__pycache__/device_op_overrides.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/cutlass_utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/cutlass_utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..203eaef35d3ac024c9c9cfc316a88d24b13107d5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/cuda/cutlass_utils.py @@ -0,0 +1,258 @@ +import functools +import logging +import os +import sys +from dataclasses import dataclass +from typing import Any, List, Optional + +import sympy + +import torch + +from ...codecache import cache_dir +from ...config import cuda as inductor_cuda_config +from ...ir import Layout +from .cuda_env import get_cuda_arch, get_cuda_version + +log = logging.getLogger(__name__) + + +def _rename_cutlass_import(content: str, cutlass_modules: List[str]) -> str: + for cutlass_module in cutlass_modules: + content = content.replace( + f"from {cutlass_module} import ", + f"from cutlass_library.{cutlass_module} import ", + ) + return content + + +def _gen_cutlass_file( + file_name: str, cutlass_modules: List[str], src_dir: str, dst_dir: str +) -> None: + orig_full_path = os.path.abspath(os.path.join(src_dir, file_name)) + text = "" + with open(orig_full_path) as f: + text = f.read() + text = _rename_cutlass_import(text, cutlass_modules) + dst_full_path = os.path.abspath( + os.path.join( + dst_dir, + file_name, + ) + ) + with open(dst_full_path, "w") as f: + f.write(text) + + +@functools.lru_cache(None) +def try_import_cutlass() -> bool: + # Copy CUTLASS python scripts to a temp dir and add the temp dir to Python search path. + # This is a temporary hack to avoid CUTLASS module naming conflicts. + # TODO(ipiszy): remove this hack when CUTLASS solves Python scripts packaging structure issues. 
+ + cutlass_py_full_path = os.path.abspath( + os.path.join(inductor_cuda_config.cutlass_dir, "python/cutlass_library") + ) + tmp_cutlass_py_full_path = os.path.abspath( + os.path.join(cache_dir(), "torch_cutlass_library") + ) + dst_link = os.path.join(tmp_cutlass_py_full_path, "cutlass_library") + + if os.path.isdir(cutlass_py_full_path): + if tmp_cutlass_py_full_path not in sys.path: + if os.path.exists(dst_link): + assert os.path.islink( + dst_link + ), f"{dst_link} is not a symlink. Try to remove {dst_link} manually and try again." + assert os.path.realpath(os.readlink(dst_link)) == os.path.realpath( + cutlass_py_full_path + ), f"Symlink at {dst_link} does not point to {cutlass_py_full_path}" + else: + os.makedirs(tmp_cutlass_py_full_path, exist_ok=True) + os.symlink(cutlass_py_full_path, dst_link) + sys.path.append(tmp_cutlass_py_full_path) + try: + import cutlass_library.generator # noqa: F401 + import cutlass_library.library # noqa: F401 + import cutlass_library.manifest # noqa: F401 + + return True + + except ImportError as e: + log.debug( + "Failed to import CUTLASS packages: %s, ignoring the CUTLASS backend.", + str(e), + ) + else: + log.debug( + "Failed to import CUTLASS packages: CUTLASS repo does not exist: %s", + cutlass_py_full_path, + ) + return False + + +def _normalize_cuda_arch(arch: str) -> str: + if int(arch) >= 90: + return "90" + elif int(arch) >= 80: + return "80" + elif int(arch) >= 75: + return "75" + elif int(arch) >= 70: + return "70" + else: + raise NotImplementedError(f"Unsupported cuda arch: {arch}") + + +@dataclass +class CUTLASSArgs: + """ + CUTLASS args used to initialize a CUTLASS Manifest. + """ + + architectures: Optional[str] = None + cuda_version: Optional[str] = None + + operations = "all" + build_dir = "" + curr_build_dir = "" + generator_target = "" + kernels = "all" + ignore_kernels = "" + # TODO: these three look dead? 
+ kernel_filter_file: None = None + selected_kernel_list: None = None + interface_dir: None = None + filter_by_cc = True + disable_full_archs_compilation = False + + def __post_init__(self): + if self.architectures is None or self.cuda_version is None: + raise RuntimeError( + f"{self.architectures=} or {self.cuda_version=} is None!" + ) + self.architectures = _normalize_cuda_arch(self.architectures) + + +@functools.lru_cache(None) +def _gen_ops_cached(arch, version) -> List[Any]: + # Note: Cache needs to be specific for cuda architecture and version + + # Import cutlass python scripts. + assert try_import_cutlass() + import cutlass_library.generator as cutlass_generator + import cutlass_library.manifest as cutlass_manifest + + if arch is None or version is None: + log.error( + "Cannot detect cuda arch %s or cuda version %s. " + "Will discard all cutlass ops. " + "Please consider setting _inductor.cuda.arch and _inductor.cuda.version configs.", + arch, + version, + ) + return list() + arch = _normalize_cuda_arch(arch) + args = CUTLASSArgs(architectures=arch, cuda_version=version) + manifest = cutlass_manifest.Manifest(args) + + if arch == "90": + cutlass_generator.GenerateSM90(manifest, args.cuda_version) + cutlass_generator.GenerateSM80(manifest, args.cuda_version) + else: + try: + func = getattr(cutlass_generator, "GenerateSM" + arch) + func(manifest, args.cuda_version) + except AttributeError as e: + raise NotImplementedError( + "Arch " + arch + " is not supported by current cutlass lib." + ) from e + return manifest.operations + + +def gen_ops() -> List[Any]: + """ + Generates all supported CUTLASS operations. + """ + arch = get_cuda_arch() + version = get_cuda_version() + return _gen_ops_cached(arch, version) + + +def dtype_match( + torch_dtype: Optional[torch.dtype], + cutlass_dtype: "cutlass_library.library.DataType", # type: ignore[name-defined] # noqa: F821 +) -> bool: + # Import cutlass python scripts. 
+ assert try_import_cutlass() + import cutlass_library + + if torch_dtype == torch.float: + return ( + cutlass_dtype == cutlass_library.library.DataType.f32 + or cutlass_dtype == cutlass_library.library.DataType.tf32 + ) + elif torch_dtype == torch.half: + return cutlass_dtype == cutlass_library.library.DataType.f16 + elif torch_dtype == torch.bfloat16: + return cutlass_dtype == cutlass_library.library.DataType.bf16 + else: + return False + + +def get_accumulator_dtype( + input_torch_dtypes: List[torch.dtype], +) -> Optional[torch.dtype]: + """ + Given a list of input torch dtypes, returns the inferred accumulator torch dtype. + """ + + if len(input_torch_dtypes) == 0: + return None + torch_dtype = input_torch_dtypes[0] + for dtype in input_torch_dtypes[1:]: + if torch_dtype != dtype: + raise RuntimeError(f"Unmatched input dtypes: {torch_dtype=}, {dtype=}") + if torch_dtype == torch.half: + if torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction: + return torch_dtype + else: + return torch.float + if torch_dtype in {torch.bfloat16, torch.float}: + return torch.float + raise NotImplementedError(f"Unsupported data type: {input_torch_dtypes=}") + + +def get_alignments(torch_dtype: torch.dtype) -> List[int]: + """ + Returns all possible valid CUTLASS alignments in terms of the number of elements for a given dtype. + CUTLASS gemm / conv SM80 APIs support 16 bytes max alignment, and 2 bytes min alignment. + """ + + if torch_dtype in (torch.half, torch.bfloat16): + return [8, 4, 2, 1] + elif torch_dtype == torch.float: + return [4, 2, 1] + else: + raise NotImplementedError(f"unsupported {torch_dtype=} for alignments") + + +def get_max_alignment(inductor_layout: Layout) -> int: + """ + Returns the max alignment (in terms of number of elements) for a given Inductor Layout. 
+ """ + + dtype = inductor_layout.dtype + size = inductor_layout.size + offset = inductor_layout.offset + + def is_static_int(number): + return isinstance(number, (int, sympy.Integer)) + + if is_static_int(size[-1]) and is_static_int(offset): + alignments = get_alignments(dtype) + for alignment in alignments: + if int(size[-1]) % alignment == 0 and int(offset) % alignment == 0: + return alignment + + return 1 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/triton.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/triton.py new file mode 100644 index 0000000000000000000000000000000000000000..e0306cf2d389f35cfca0d949b339ed629162ab2e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codegen/triton.py @@ -0,0 +1,3931 @@ +from __future__ import annotations + +import collections +import contextlib +import dataclasses +import functools +import itertools +import logging +import math +import operator +import os +import textwrap +from functools import lru_cache +from typing import ( + Any, + Callable, + cast, + Counter, + DefaultDict, + Dict, + Iterable, + List, + Optional, + Set, + Tuple, + Union, +) + +import sympy + +import torch +import torch._logging + +from torch._inductor.metrics import is_metric_table_enabled, log_kernel_metadata +from torch._prims_common import is_integer_dtype +from torch.utils._sympy.functions import FloorDiv, ModularIndexing +from torch.utils._sympy.value_ranges import ValueRanges +from torch.utils._triton import has_triton_package + +from ..._dynamo.utils import counters +from .. 
import config, ir, scheduler +from ..codecache import code_hash, get_path, PyCodeCache +from ..dependencies import Dep, MemoryDep, StarDep, WeakDep +from ..ir import IRNode, ReductionHint, TritonTemplateBuffer +from ..optimize_indexing import indexing_dtype_strength_reduction +from ..scheduler import BaseSchedulerNode, BaseScheduling, WhyNoFuse +from ..triton_heuristics import AutotuneHint +from ..utils import ( + cache_on_self, + do_bench, + get_dtype_size, + get_fused_kernel_name, + get_kernel_metadata, + get_max_y_grid, + green_text, + is_welford_reduction, + next_power_of_2, + Placeholder, + sympy_dot, + sympy_index_symbol, + sympy_product, + sympy_subs, + unique, + yellow_text, +) +from ..virtualized import _ops as ops, OpsHandler, ReductionType, StoreMode, V +from ..wrapper_benchmark import get_kernel_category_by_source_code +from .common import ( + CSE, + CSEVariable, + DeferredLine, + free_symbol_startswith, + IndentedBuffer, + index_prevent_reordering, + Kernel, + OpOverrides, + PythonPrinter, + SizeArg, + TensorArg, +) +from .multi_kernel import MultiKernel +from .triton_utils import config_of, signature_of, signature_to_meta + +log = logging.getLogger(__name__) +perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints") +schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") +fusion_log = torch._logging.getArtifactLogger(__name__, "fusion") + + +@lru_cache(None) +def gen_attr_descriptor_import(): + """ + import AttrsDescriptor if the triton version is new enough to have this + class defined. 
+ """ + if not has_triton_package(): + return "" + + import triton.compiler.compiler + + if hasattr(triton.compiler.compiler, "AttrsDescriptor"): + return "from triton.compiler.compiler import AttrsDescriptor" + else: + return "" + + +@lru_cache(None) +def gen_common_triton_imports(): + imports = IndentedBuffer() + imports.splice( + """ + import triton + import triton.language as tl + """ + ) + if attr_desc := gen_attr_descriptor_import(): + imports.writeline(attr_desc) + + imports.splice( + """ + from torch._inductor import triton_helpers, triton_heuristics + from torch._inductor.ir import ReductionHint, TileHint + from torch._inductor.triton_helpers import libdevice, math as tl_math + from torch._inductor.triton_heuristics import AutotuneHint + from torch._inductor.utils import instance_descriptor + """ + ) + return imports.getvalue() + + +@dataclasses.dataclass +class IndexingOptions: + index_str: str + mask_vars: Set[sympy.Symbol] + mask_str: str + expand_str: Optional[str] + _has_rindex: bool + + def has_mask(self): + return bool(self.mask_vars) + + def has_rindex(self): + return self._has_rindex + + def has_tmpmask(self): + return "tmp" in self.mask_str + + def has_rmask(self): + return "rmask" in self.mask_str + + +@dataclasses.dataclass +class BlockPtrOptions: + constant_offset: sympy.Expr + shape: List[sympy.Expr] + strides: List[sympy.Expr] + block_shape: List[str] + order: List[int] + offsets: List[str] + mask_vars: Set[sympy.Symbol] + reshape_suffix: List[str] + + @staticmethod + def create( + strides: List[sympy.Expr], + constant_offset: sympy.Expr, + range_trees: List[IterationRangesEntry], + mask_vars: Set[sympy.Symbol], + ) -> BlockPtrOptions: + """Helper to create a BlockPtrOptions instance""" + block_shape = [f"{t.prefix.upper()}BLOCK" for t in range_trees] + reshape_suffix = [*block_shape] + + broadcasting_dim = [s == 0 for s in strides] + for i, is_broadcasting in enumerate(broadcasting_dim): + if is_broadcasting: + # drop any stride==0 
dimensions for performance + reshape_suffix[i] = "1" + + if V.kernel.no_x_dim: + assert range_trees[0].prefix == "x" + reshape_suffix.pop(0) + + if ( + not V.kernel.inside_reduction + and len(strides) == len(V.kernel.numels) - 1 + and V.kernel.numels[-1] != 1 + ): + # Need to expand rank by 1 to match rank when self.inside_reduction=True + reshape_suffix.append("1") + + def filter(it): + """Removes any broadcasting dims from a given sequence""" + assert len(it) == len(broadcasting_dim) + return [ + item + for item, is_broadcasting in zip(it, broadcasting_dim) + if not is_broadcasting + ] + + return BlockPtrOptions( + constant_offset=V.graph.sizevars.lookup_precomputed_size(constant_offset), + shape=[ + V.graph.sizevars.lookup_precomputed_size(t.numel) + for t in filter(range_trees) + ], + strides=[*map(V.graph.sizevars.lookup_precomputed_size, filter(strides))], + block_shape=filter(block_shape), + order=V.graph.sizevars.guarded_order(filter(strides)), + offsets=filter([f"{t.prefix}offset" for t in range_trees]), + mask_vars=mask_vars, + reshape_suffix=reshape_suffix, + ) + + def format(self, name: str, roffset=True) -> str: + """ + Codegen a call to tl.make_block_ptr() + + Args: + name: variable name for pointer + roffset: should roffset be included in offsets=..., for use with tl.advance() + + Returns: + "tl.make_block_ptr(...)" + """ + f = V.kernel.index_to_str + offsets = [*self.offsets] + if not roffset: + offsets[offsets.index("roffset")] = "0" + args = [ + f"{name} + ({f(self.constant_offset)})" + if self.constant_offset != 0 + else name, + f"shape={f(self.shape)}", + f"strides={f(self.strides)}", + f"block_shape={f(self.block_shape)}", + f"order={f(self.order)}", + f"offsets={f(offsets)}", + ] + return f"tl.make_block_ptr({', '.join(args)})" + + @cache_on_self + def boundary_check(self) -> List[int]: + """List of indices to pass to tl.load(boundary_check=...)""" + check = [] + for i in range(len(self.shape)): + if ( + self.block_shape[i] != "1" + and not 
V.graph.sizevars.statically_known_equals(self.strides[i], 0) # type: ignore[arg-type] + and not V.graph.sizevars.statically_known_multiple_of( + self.shape[i], + config.triton.max_block[self.block_shape[i][0]], # type: ignore[arg-type] + ) + and not (V.kernel.no_x_dim and self.block_shape[i] == "XBLOCK") + ): + check.append(i) + return check + + def advance_roffset(self): + """Codegen string to pass to tl.advance(name, ...)""" + advance = ["0"] * len(self.shape) + advance[self.offsets.index("roffset")] = "RBLOCK" + return V.kernel.index_to_str(advance) + + def has_rindex(self): + return "RBLOCK" in self.block_shape + + def has_rmask(self): + return self.has_rindex() + + def has_tmpmask(self): + return False # block_ptr can't do indirect indexing + + def has_mask(self): + return bool(self.boundary_check()) + + +def triton_reshape(value: str, old_shape: List[str], new_shape: List[str]): + """Workaround https://github.com/openai/triton/issues/2836""" + assert isinstance(old_shape, list) and isinstance(new_shape, list) + if old_shape == new_shape: + return value + if [s for s in new_shape if s != "1"] != old_shape: + return f"tl.reshape({value}, [{', '.join(new_shape)}])" + # rewrite to [:, None] syntax, which is less buggy + idx = 0 + expand = [] + for size in new_shape: + if idx < len(old_shape) and size == old_shape[idx]: + expand.append(":") + idx += 1 + else: + assert size == "1" + expand.append("None") + assert idx == len(old_shape) + return f"{value}[{', '.join(expand)}]" + + +class TritonPrinter(PythonPrinter): + def _print_floor(self, expr): + assert len(expr.args) == 1 + return ( + f"libdevice.floor({self._print(expr.args[0])}).to({V.kernel.index_dtype})" + ) + + def _print_ceiling(self, expr): + assert len(expr.args) == 1 + return f"libdevice.ceil({self._print(expr.args[0])}).to({V.kernel.index_dtype})" + + def _helper_sqrt(self, expr): + return f"libdevice.sqrt({self._print(expr)}.to(tl.float32))" + + def _print_Where(self, expr): + c = 
self.doprint(expr.args[0]) + p = self.doprint(expr.args[1]) + q = self.doprint(expr.args[2]) + return f"tl.where({c}, {p}, {q})" + + def _print_Min(self, expr): + nargs = len(expr.args) + if len(expr.args) == 1: + return self._print(expr.args[0]) + + mid = len(expr.args) // 2 + a = self._print(sympy.Min(*expr.args[:mid])) + b = self._print(sympy.Min(*expr.args[mid:])) + return f"tl.minimum({a}, {b})" + + def _print_Max(self, expr): + nargs = len(expr.args) + if len(expr.args) == 1: + return self._print(expr.args[0]) + + mid = len(expr.args) // 2 + a = self._print(sympy.Max(*expr.args[:mid])) + b = self._print(sympy.Max(*expr.args[mid:])) + + return f"tl.maximum({a}, {b})" + + def _print_Abs(self, expr): + assert len(expr.args) == 1 + return f"tl_math.abs({self._print(expr.args[0])})" + + def _print_cos(self, expr): + assert len(expr.args) == 1 + return f"libdevice.cos(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_cosh(self, expr): + assert len(expr.args) == 1 + return f"libdevice.cosh(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_acos(self, expr): + assert len(expr.args) == 1 + return f"libdevice.acos(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_sin(self, expr): + assert len(expr.args) == 1 + return f"libdevice.sin(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_sinh(self, expr): + assert len(expr.args) == 1 + return f"libdevice.sinh(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_asin(self, expr): + assert len(expr.args) == 1 + return f"libdevice.asin(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_tan(self, expr): + assert len(expr.args) == 1 + return f"libdevice.tan(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_tanh(self, expr): + assert len(expr.args) == 1 + return f"libdevice.tanh(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_atan(self, expr): + assert len(expr.args) == 1 + return 
f"libdevice.atan(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_FloorDiv(self, expr): + if expr.is_integer: + return super()._print_FloorDiv(expr) + + x, div = expr.args + x = self.paren(self.doprint(x)) + div = self.paren(self.doprint(div)) + return f"libdevice.floor({x} / {div}).to({V.kernel.index_dtype})" + + def _print_Round(self, expr): + assert len(expr.args) == 1 + return ( + f"libdevice.llrint({self._print(expr.args[0])}).to({V.kernel.index_dtype})" + ) + + def _print_RoundDecimal(self, expr): + assert len(expr.args) == 2 + number, ndigits = expr.args + if number.is_integer: + # ndigits < 0 should have been filtered by the sympy function + assert ndigits < 0 + raise ValueError( + f"For integer inputs, only non-negative ndigits are currently supported, but got {ndigits}." + ) + return f"libdevice.nearbyint(1e{ndigits} * {self.paren(self._print(number))}) * 1e{-ndigits}" + + +texpr = TritonPrinter().doprint +pexpr = PythonPrinter().doprint + + +def triton_compute_type(dtype): + triton_type_name = str(dtype).split(".")[-1] + if triton_type_name == "bool": + triton_type_name = "int1" + elif triton_type_name in ("float16", "bfloat16"): + # float16 math is done in float32 inside the kernel + triton_type_name = "float32" + elif triton_type_name == "float8_e4m3fn": + triton_type_name = "float8e4nv" + elif triton_type_name == "float8_e5m2": + triton_type_name = "float8e5" + elif triton_type_name == "float8_e4m3fnuz": + triton_type_name = "float8e4b8" + elif triton_type_name == "float8_e5m2": + triton_type_name = "float8e5b16" + return f"tl.{triton_type_name}" + + +def triton_store_type(dtype): + triton_type_name = str(dtype).split(".")[-1] + if triton_type_name == "bool": + triton_type_name = "int8" + elif triton_type_name == "float8_e4m3fn": + triton_type_name = "float8e4nv" + elif triton_type_name == "float8_e5m2": + triton_type_name = "float8e5" + return f"tl.{triton_type_name}" + + +def triton_acc_type(dtype): + if is_integer_dtype(dtype) and 
dtype.is_signed: + nbits = 64 if dtype == torch.int64 else 32 + return f"tl.int{nbits}" + return triton_compute_type(dtype) + + +def triton_constant(value): + if value == float("inf"): + return 'float("inf")' + elif value == float("-inf"): + return 'float("-inf")' + elif math.isnan(value): + return 'float("nan")' + return repr(value) + + +class TritonCSEVariable(CSEVariable): + def __init__(self, name, bounds: ValueRanges[Any]): + super().__init__(name, bounds) + # We'll use this to track which masks the variable needs when used for indirect indexing + self.mask_vars: Set[str] = set() + + def update_on_args(self, name, args, kwargs): + # When making a variable that is going to be used in indirect indexing + # if a where clause is used it should mean that the result is always a + # valid index, so you shouldn't include any of the dependent variables + # in the resulting load mask + if name == "where": + return + for arg in args: + if isinstance(arg, TritonCSEVariable): + self.mask_vars.update(arg.mask_vars) + elif isinstance(arg, sympy.Symbol) and arg.name[0] in "xyr": + # most of the time index vars don't need masks associated with them + # however, when index vars are used to compute indices for indirect reads + # those reads should subsequently be masked, + self.mask_vars.update({f"{arg.name[0]}mask"}) + + def __repr__(self): + return f"TritonCSEVariable(name={self.name})" + + +class TritonOverrides(OpOverrides): + """Map element-wise ops to Triton""" + + @staticmethod + def to_dtype(x, dtype: torch.dtype, src_dtype: Optional[torch.dtype] = None): + def _get_min_elements_per_thread( + src_dtype: torch.dtype, dst_dtype: torch.dtype + ) -> int: + if src_dtype == dst_dtype: + # No data type conversion is needed. No requirements on min_elem_per_thread. + return 0 + + # fp8 data type conversions has min_elem_per_thread requirements. 
+ # Refer to Triton implementations here: + # https://github.com/openai/triton/blob/10f59d8ce04052521c1bc0cb3a3f8b98918fc7e3/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp#L10. + fp8_dtypes = { + torch.float8_e4m3fn, + torch.float8_e5m2, + } + # Triton doesn't support type conversions between fp8_e4m3 and fp8_e5m2. + assert not ( + src_dtype in fp8_dtypes + and dst_dtype in fp8_dtypes + and src_dtype != dst_dtype + ), "Conversions between float8_e5m2 and float8_e4m3fn is not supported!" + if src_dtype == torch.float8_e5m2 or dst_dtype == torch.float8_e5m2: + return 4 + if src_dtype == torch.float8_e4m3fn or dst_dtype == torch.float8_e4m3fn: + return 2 + # No requirements on min_elem_per_thread. + return 0 + + if src_dtype is not None: + # Both dtype and src_dtype are set. This is used by torch to(dtype=dtype). + # It takes the maximum min_elem_per_thread if there are multiple fp8 conversions + # in the same kernel. + V.kernel.min_elem_per_thread = max( + _get_min_elements_per_thread(src_dtype, dtype), + V.kernel.min_elem_per_thread, + ) + + if dtype == torch.bool: + return f"({x} != 0)" + elif dtype == torch.uint8: + # to work around llvm uint conversion semantics + # that produces 0's for negative values + return f"{x}.to(tl.int8).to(tl.uint8)" + return f"{x}.to({triton_compute_type(dtype)})" + + @staticmethod + def to_dtype_bitcast(x, dtype: torch.dtype, src_dtype: torch.dtype): + triton_dtype = triton_compute_type(dtype) + # We may promote float16 or bfloat16 to float32 and cause the + # bitwidth of dtype to be different from the input tensor (i.e. float32). + # In such as case, we will have to convert the input tensor to + # its src_type, perform bitcast, and then convert the bit-casted + # tensor back to float to ensure we use values with the right precision. 
+ if src_dtype in (torch.float16, torch.bfloat16): + triton_src_dtype = str(src_dtype).split(".")[-1] + cast_x = f"{x}.to(tl.{triton_src_dtype})" + cast_x = f"{cast_x}.to({triton_dtype}, bitcast=True)" + return f"{cast_x}.to(tl.float32)" + else: + return f"{x}.to({triton_dtype}, bitcast=True)" + + @staticmethod + def _shaped_constant(value, dtype, shape): + type_ = torch._prims_common.dtype_to_type(dtype) + triton_val = triton_constant(type_(value)) + triton_type = triton_compute_type(dtype) + + if triton_type == "tl.float32": + # Float constants are always f32 in triton + return triton_val + + # NOTE: We use a tensor here in order to get the expected type. + # Otherwise, e.g. float64 constants would be trunctated to float32. + return f"tl.full({shape}, {triton_val}, {triton_type})" + + @classmethod + def constant(cls, value, dtype): + return cls._shaped_constant(value, dtype, shape=[]) + + @staticmethod + def abs(x): + return f"tl_math.abs({x})" + + @staticmethod + def libdevice_abs(x): + return f"libdevice.abs({x})" + + @staticmethod + def exp(x): + return f"tl_math.exp({x})" + + @staticmethod + def libdevice_exp(x): + return f"libdevice.exp({x})" + + @staticmethod + def exp2(x): + return f"libdevice.exp2({x})" + + @staticmethod + def expm1(x): + return f"libdevice.expm1({x})" + + @staticmethod + def sqrt(x): + return f"libdevice.sqrt({x})" + + @staticmethod + def libdevice_sqrt(x): + return f"libdevice.sqrt({x})" + + @staticmethod + def relu(x): + bug = config.triton.inject_relu_bug_TESTING_ONLY + if bug == "compile_error": + return "compile error!" 
+ elif bug == "runtime_error": + # NB: this only triggers runtime error as long as input + # is not all zero + return f'triton_helpers.device_assert_then({x} == 0, "injected assert fail", {x})' + elif bug == "accuracy": + return f"{x} + 1" + elif bug is None: + return ops.maximum("0", x) + else: + raise AssertionError( + f"unrecognized config triton.inject_relu_bug_TESTING_ONLY = {bug!r}" + ) + + @staticmethod + def minimum(a, b): + return f"triton_helpers.minimum({a}, {b})" + + @staticmethod + def maximum(a, b): + return f"triton_helpers.maximum({a}, {b})" + + @staticmethod + def where(a, b, c): + return f"tl.where({a}, {b}, {c})" + + @staticmethod + def cos(x): + return f"tl_math.cos({x})" + + @staticmethod + def libdevice_cos(x): + return f"libdevice.cos({x})" + + @staticmethod + def sin(x): + return f"tl_math.sin({x})" + + @staticmethod + def libdevice_sin(x): + return f"libdevice.sin({x})" + + @classmethod + def index_expr(cls, expr, dtype): + raise NotImplementedError("ops.index_expr not implemented outside a kernel") + + @staticmethod + def masked(mask, body, other): + raise NotImplementedError("ops.masked not implemented outside a kernel") + + @staticmethod + def lgamma(x): + return f"libdevice.lgamma({x})" + + @staticmethod + def erf(x): + return f"libdevice.erf({x})" + + @staticmethod + def cosh(x): + return f"libdevice.cosh({x})" + + @staticmethod + def sinh(x): + return f"libdevice.sinh({x})" + + @staticmethod + def acos(x): + return f"libdevice.acos({x})" + + @staticmethod + def acosh(x): + return f"libdevice.acosh({x})" + + @staticmethod + def asin(x): + return f"libdevice.asin({x})" + + @staticmethod + def asinh(x): + return f"libdevice.asinh({x})" + + @staticmethod + def atan2(x, y): + return f"libdevice.atan2({x}, {y})" + + @staticmethod + def atan(x): + return f"libdevice.atan({x})" + + @staticmethod + def atanh(x): + return f"libdevice.atanh({x})" + + @staticmethod + def copysign(x, y): + return f"libdevice.copysign({x}, {y})" + + @staticmethod + 
def erfc(x): + return f"libdevice.erfc({x})" + + @staticmethod + def erfinv(x): + return f"libdevice.erfinv({x})" + + @staticmethod + def hypot(x, y): + return f"libdevice.hypot({x}, {y})" + + @staticmethod + def log10(x): + return f"libdevice.log10({x})" + + @staticmethod + def nextafter(x, y): + return f"libdevice.nextafter({x}, {y})" + + @staticmethod + def logical_and(a, b): + return f"{a} & {b}" + + @staticmethod + def logical_not(a): + return f"{a} == 0" + + @staticmethod + def logical_or(a, b): + return f"{a} | {b}" + + @staticmethod + def logical_xor(a, b): + return f"({a} ^ {b})" + + @staticmethod + def bitwise_and(a, b): + return f"{a} & {b}" + + @staticmethod + def bitwise_not(a): + return f"~{a}" + + @staticmethod + def bitwise_or(a, b): + return f"{a} | {b}" + + @staticmethod + def bitwise_xor(a, b): + return f"{a} ^ {b}" + + @staticmethod + def bitwise_left_shift(a, b): + return f"{a} << {b}" + + @staticmethod + def bitwise_right_shift(a, b): + return f"{a} >> {b}" + + @staticmethod + def rand(seed, offset): + offset = f"({offset}).to(tl.uint32)" + return f"tl.rand({seed}, {offset})" + + @staticmethod + def randn(seed, offset): + offset = f"({offset}).to(tl.uint32)" + return f"tl.randn({seed}, {offset})" + + @staticmethod + def randint64(seed, offset, low, high): + offset = f"({offset}).to(tl.uint32)" + return f"triton_helpers.randint64({seed}, {offset}, {low}, {high})" + + @staticmethod + def load_seed(name, offset): + raise NotImplementedError("ops.load_seed not implemented outside a kernel") + + @staticmethod + def rsqrt(x): + return f"libdevice.rsqrt({x})" + + @staticmethod + def log1p(x): + return f"libdevice.log1p({x})" + + @staticmethod + def tan(x): + return f"libdevice.tan({x})" + + @staticmethod + def tanh(x): + return f"libdevice.tanh({x})" + + @staticmethod + def sigmoid(x): + return f"tl.sigmoid({x})" + + @staticmethod + def libdevice_sigmoid(x): + return f"1/(1 + libdevice.exp(-({x})))" + + @staticmethod + def signbit(x): + # XX: This is 
wrong for the value -0.0 in floating point + return f"libdevice.signbit({x}) if ({x}).dtype is tl.float32 else {x} < 0" + + @staticmethod + def fmod(a, b): + return f"libdevice.fmod({a}, {b})" + + @staticmethod + def pow(a, b): + return f"libdevice.pow({a}, {b})" + + @staticmethod + def log(x): + return f"tl_math.log({x})" + + @staticmethod + def libdevice_log(x): + return f"libdevice.log({x})" + + @staticmethod + def isinf(x): + return f"libdevice.isinf({x}).to(tl.int1)" + + @staticmethod + def isnan(x): + return f"libdevice.isnan({x}).to(tl.int1)" + + @staticmethod + def round(x): + return f"libdevice.nearbyint({x})" + + @staticmethod + def floor(x): + return f"libdevice.floor({x})" + + @staticmethod + def floordiv(a, b): + # See the comment in lowering.div_mode. a and b are integer type. + # Similar to div_floor_kernel_cuda in pytorch core. + # Notice that // in triton behaves as truncdiv instead of floordiv + quot = f"{a} // {b}" + rem = f"{a} % {b}" + return f"tl.where(({a} < 0) != ({b} < 0), tl.where({rem} != 0, {quot} - 1, {quot}), {quot})" + + @staticmethod + def sign(x): + def to_int(s): + return f"{s}.to(tl.int8)" + + left = to_int(ops.lt("0", x)) + right = to_int(ops.lt(x, "0")) + sub = ops.sub(left, right) + return f"{sub}.to({x}.dtype)" + + @staticmethod + def trunc(x): + return f"libdevice.trunc({x})" + + @staticmethod + def truncdiv(a, b): + # See the comment in lowering.div_mode. a and b are integer type. 
+ # Notice that // in triton behaves as truncdiv instead of floordiv + return f"{a} // {b}" + + @staticmethod + def ceil(x): + return f"libdevice.ceil({x})" + + +TritonOverrides._initialize_pointwise_overrides("triton") + + +# Use mypy to check protocol implemented correctly +def _typecheck_TritonOverrides(h: TritonOverrides) -> OpsHandler[str]: + return h + + +class TritonKernelOverrides(TritonOverrides): + """Map element-wise ops to Triton within a TritonKernel + + Unlike TritonOverrides, these assume the code is going to be inserted into + the body of the main triton kernel and so it may use indexing and mask + variables which are assumed to already be defined in the current scope. + """ + + @classmethod + def constant(cls, value, dtype): + # NOTE: Cannot use shape=[] as it's not supported by triton-rocm + # We could use shape=[1] instead but starting with the correct + # ndim avoids extra `tt.expand_dim` ops appearing in the triton IR. + ndim = V.kernel.triton_tensor_ndim() + shape = [1] * ndim + return cls._shaped_constant(value, dtype, shape=shape) + + @classmethod + def index_expr(cls, expr, dtype): + indexing = V.kernel.indexing(expr, block_ptr=False) + assert isinstance(indexing, IndexingOptions) + # This is called from CSEProxy.__getattr__, so we'll set the bounds there + var = V.kernel.cse.generate(V.kernel.compute, indexing.index_str) + + if dtype not in {torch.int32, torch.int64}: + var = V.kernel.cse.generate(V.kernel.compute, cls.to_dtype(var, dtype)) + var.mask_vars = indexing.mask_vars + return var + + @staticmethod + def masked(mask, body, other): + with V.kernel.mask_loads(mask) as new_mask: + result = body() + + # Take dtype from result to prevent accidental promotion + other = V.kernel.cse.generate( + V.kernel.compute, + f"tl.full({result}.shape, {triton_constant(other)}, {result}.dtype)", + ) + return ops.where(new_mask, result, other) + + @staticmethod + def load_seed(name, offset): + var = V.kernel.args.input(name) + return ( + 
f"tl.load({var} + {V.kernel.args.seed_offset('load_seed_offset', offset)})" + ) + + @staticmethod + def frexp(x): + cache_key = f"frexp({x})" + if cache_key in V.kernel.cse.cache: + return V.kernel.cse.cache[cache_key] + + mantissa = V.kernel.cse.newvar() + exponent = V.kernel.cse.newvar() + V.kernel.compute.writeline( + f"{mantissa}, {exponent} = triton_helpers.frexp({x})" + ) + V.kernel.cse.cache[cache_key] = (mantissa, exponent) + return (mantissa, exponent) + + +# Use mypy to check protocol implemented correctly +def _typecheck_TritonKernelOverrides(h: TritonKernelOverrides) -> OpsHandler[str]: + return h + + +@dataclasses.dataclass +class IterationRanges: + """ + Each range tree represents multiple sets of iteration indexing + in a single tiled dimension in the output kernel. + + If you have two loops ranges one (4, 3, 2) and another (4, 6), + then the range tree will be: + 4 (i0) + 3 (i1) 6 (i3) + 2 (i2) + Where i0 is shared between both loops, but then the split into + different indexing vars. All loop ranges must iterate over + the same number of elements. 
+ """ + + def __init__( + self, + name: str, + var_list: List[sympy.Symbol], + var_ranges: Dict[sympy.Symbol, sympy.Expr], + numel: sympy.Expr, + prefix: str, + *, + kernel: TritonKernel, + divisor=sympy.Integer(1), + length=sympy.Integer(1), + root: IterationRangesRoot, + ): + super().__init__() + self.name = name + self.var_list = var_list + self.var_ranges = var_ranges + self.numel = numel + self.prefix = prefix + self.divisor = divisor + self.length = length + self.kernel = kernel + self.root = root + + def symbol(self): + return sympy_index_symbol(self.name) + + +class IterationRangesRoot(IterationRanges): + def __init__( + self, + name: str, + numel: sympy.Expr, + prefix: str, + index: int, + kernel: TritonKernel, + pid_cache=None, + *, + is_loop: bool, + tensor_dim: Optional[int], + grid_dim: Optional[int], + ): + if pid_cache is None: + pid_cache = {} + super().__init__( + name=name, + var_list=[], + var_ranges={}, + numel=numel, + prefix=prefix, + kernel=kernel, + root=self, + ) + self.index = index + # Store all the nodes in one flat list + self.nodes: Dict[sympy.Expr, IterationRangesEntry] = {} + # This is for re-ordering program ID in triton mm template + # pid_cache["tl.program_id(0)"] = pid_m + self.pid_cache: Dict[str, str] = pid_cache + + # True if the dimension is implemented as a single program looping over + # the full dimension (currently only used for non-persistent reduction) + assert not is_loop or (prefix == "r" and grid_dim is None) + self.is_loop = is_loop + # Index of corresponding dimension on triton tensors + self.tensor_dim = tensor_dim + # Index of corresponding dimension in the triton grid + self.grid_dim = grid_dim + + def __repr__(self): + return f"IterationRangesRoot({self.name!r}, {self.numel}, ...)" + + def cache_clear(self): + for node in self.nodes.values(): + node.cache_clear() + + def lookup(self, divisor, length): + """ + Lookup a given RangeTreeEntry, creating it if needed + """ + if 
V.graph.sizevars.statically_known_equals(divisor * length, self.numel): + expr = FloorDiv(sympy_index_symbol(f"{self.prefix}index"), divisor) + else: + expr = ModularIndexing( + sympy_index_symbol(f"{self.prefix}index"), divisor, length + ) + + if expr not in self.nodes: + node = IterationRangesEntry( + f"{self.prefix}{next(V.kernel.iter_vars_count)}", + divisor, + length, + expr, + self, + ) + V.kernel.range_tree_nodes[node.symbol()] = node + self.var_list.append(node.symbol()) + self.var_ranges[node.symbol()] = length + self.nodes[expr] = node + return self.nodes[expr] + + def construct_entries(self, lengths: List[sympy.Expr]): + divisor = sympy.Integer(1) + itervars = [] + for length in reversed(lengths): + itervars.append(self.lookup(divisor, length)) + divisor = divisor * length + return list(reversed(itervars)) + + def construct(self, lengths: List[sympy.Expr]): + return [e.symbol() for e in self.construct_entries(lengths)] + + def vars_and_sizes(self, index: sympy.Expr): + """Figure out vars from this tree used in index""" + nodes = [V.kernel.range_tree_nodes.get(s) for s in index.free_symbols] + nodes = [n for n in nodes if n and n.prefix == self.prefix] + nodes.sort(key=lambda x: V.graph.sizevars.size_hint(x.divisor)) + divisor = sympy.Integer(1) + index_vars = [] + sizes = [] + + def add(node): + nonlocal divisor + index_vars.append(node.symbol()) + sizes.append(node.length) + divisor = divisor * node.length + + for node in nodes: + if not V.graph.sizevars.statically_known_equals(node.divisor, divisor): + # fill in unused index var + add(self.lookup(divisor, FloorDiv(node.divisor, divisor))) + divisor = node.divisor + add(node) + if not V.graph.sizevars.statically_known_equals(self.numel, divisor): + # fill in unused index var + add(self.lookup(divisor, FloorDiv(self.numel, divisor))) + + return list(reversed(index_vars)), list(reversed(sizes)) + + def ranges_code(self): + assert self.tensor_dim is not None + size = 
self.kernel.indexing_size_str(self.tensor_dim) + index_dtype = self.kernel.index_dtype + convert = f".to({index_dtype})" if index_dtype != "tl.int32" else "" + return f"tl.arange(0, {self.prefix.upper()}BLOCK){size}{convert}" + + def scalar_code(self, value): + index_dtype = self.kernel.index_dtype + ndim = self.kernel.triton_tensor_ndim() + size = [1] * ndim + return f"tl.full({size}, {value}, {index_dtype})" + + def get_pid(self): + assert self.grid_dim is not None + key = f"tl.program_id({self.grid_dim})" + # y_grid has a limit, so express it in terms of y and z in case of overflow. + # z grid is only exercised when max_tiles == 3 (off by default). + if ( + self.grid_dim == 1 + and config.triton.max_tiles <= 2 + and not (isinstance(self.numel, int) and self.numel <= get_max_y_grid()) + ): + key = f"{key} * (tl.program_id({self.grid_dim + 1}) + 1)" + pid = self.pid_cache.get(key, key) + if self.kernel.index_dtype != "tl.int32": + return f"{pid}.to({self.kernel.index_dtype})" + return pid + + def codegen_header(self, code): + x = self.prefix + if self.is_loop: + code.writeline(f"{self.name} = {x}offset + {x}base") + elif self.grid_dim is None: + # no need to "{x}offset = " + code.writeline(f"{self.name} = {self.ranges_code()}") + code.writeline(f"{x}offset = 0") + else: + if self.tensor_dim is not None: + line = f"{x}offset + {self.ranges_code()}" + else: + line = self.scalar_code(f"{x}offset") + code.writelines( + [ + f"{x}offset = {self.get_pid()} * {x.upper()}BLOCK", + f"{self.name} = {line}", + ] + ) + code.writeline(f"{x}mask = {self.name} < {x}numel") + + +class IterationRangesEntry(IterationRanges): + def __init__( + self, + name: str, + divisor: sympy.Expr, + length: sympy.Expr, + expr: sympy.Expr, + parent: IterationRanges, + ): + super().__init__( + name=name, + numel=parent.numel / length, + var_list=parent.var_list, + var_ranges=parent.var_ranges, + prefix=parent.prefix, + divisor=divisor, + length=length, + kernel=parent.kernel, + root=parent.root, + 
) + self.parent = parent + self.codegen = functools.lru_cache(None)(self._codegen) + self.expr = expr + + def __repr__(self): + return f"IterationRangesEntry({self.name}, {self.divisor}, {self.length}, {self.expr}, {self.var_ranges})" + + def set_name(self, name): + self.codegen = lambda: name # type: ignore[assignment] + self.codegen.cache_clear = lambda: None # type: ignore[method-assign] + self.name = name + + def cache_clear(self): + self.codegen.cache_clear() + + def writeline(self, line): + if self.root.is_loop: + V.kernel.indexing_code.writeline(line) + else: + # lift non-reduction stores outside loop + V.kernel.body.writeline(line) + + def _codegen(self): + self.writeline(f"{self.name} = " + texpr(V.kernel.rename_indexing(self.expr))) + return self.name + + def precomputed_args(self): + # for dynamic shapes, find parts of indexing expressions that have to be precomputed + precomputed_args: List[sympy.Expr] = [] + if isinstance(self.expr, sympy.Symbol): + return precomputed_args + assert isinstance(self.expr, (FloorDiv, ModularIndexing)), type(self.expr) + for arg in self.expr.args[1:]: + if not isinstance(arg, (sympy.Integer, sympy.Symbol)): + symbols = arg.free_symbols + if len(symbols) > 0 and all(s.name.startswith("s") for s in symbols): + precomputed_args.append(arg) + return precomputed_args + + def __hash__(self): + return hash(self.name) + + def __eq__(self, other): + return self.name == other.name + + +class HelperFunctions: + """An ordered set of helper functions.""" + + _templates_seen: Dict[str, str] # Template code to function name + finalized_helpers: List[str] + + def __init__(self): + self._templates_seen = {} + self.finalized_helpers = [] + + def add(self, template_code: str) -> str: + """This accepts a function definition with the function name + left as a format specifier e.g. + + @triton.jit + def {name}(arg0, arg1): + return arg0 + arg1 + + We add the templated code to the function set and return the name + assigned to that function. 
+ + """ + existing_name = self._templates_seen.get(template_code) + if existing_name is not None: + # Don't duplicate existing helpers + return existing_name + + name = f"_triton_helper_fn{len(self.finalized_helpers)}" + self._templates_seen[template_code] = name + self.finalized_helpers.append(template_code.format(name=name)) + return name + + def __iter__(self): + return iter(self.finalized_helpers) + + def __getitem__(self, idx): + return self.finalized_helpers[idx] + + +class TritonKernel(Kernel): + overrides = TritonKernelOverrides # type: ignore[assignment] + sexpr = pexpr + + helper_functions: HelperFunctions + + def __init__( + self, + *groups, + index_dtype: str, + mutations: Optional[Set[str]] = None, + pid_cache=None, + reduction_hint=ReductionHint.DEFAULT, + min_elem_per_thread=0, + disable_persistent_reduction=False, + ): + if pid_cache is None: + pid_cache = {} + super().__init__() + self.numels = [V.graph.sizevars.simplify(s) for s in groups] + self.mutations: Set[str] = mutations if mutations is not None else set() + self.range_trees: List[IterationRangesRoot] = [] + self.range_tree_nodes: Dict[sympy.Symbol, IterationRangesEntry] = {} + self.iter_vars_count = itertools.count() + self.inside_reduction = self.numels[-1] != 1 + self.body = IndentedBuffer() + self.indexing_code = IndentedBuffer() + self.suffix: IndentedBuffer = IndentedBuffer() # type: ignore[assignment] + self.outside_loop_vars: Set[Any] = set() + self.reduction_hint = reduction_hint + self.index_dtype: str = index_dtype + self.min_elem_per_thread = min_elem_per_thread + self.last_usage: Set[str] = set() + self.block_ptr_id = itertools.count() + # buffer accesses in the kernel + self.buf_accesses: DefaultDict[str, List[Dep]] = collections.defaultdict(list) + + self.persistent_reduction: bool = ( + not disable_persistent_reduction + ) and self.should_use_persistent_reduction() + self.no_x_dim = ( + self.reduction_hint == ReductionHint.INNER + and self.persistent_reduction + and 
len(self.numels) == 2 + and self.numels[-1] >= 256 + ) + self.initialize_range_tree(pid_cache) + + self.helper_functions = HelperFunctions() + + # A set of autotuning hints to pass as part of triton_meta + self.autotune_hints: Set[AutotuneHint] = set() + + # define this in a closure to make cache local to object + @functools.lru_cache(None) + def simplify_indexing(index: sympy.Expr): + index = V.graph.sizevars.simplify_with_ranges(index, self.var_ranges()) + for tree in self.range_trees: + index = self.combine_contiguous_dims(index, tree) + return index + + self.simplify_indexing = simplify_indexing + self.code_hash = None + self.triton_meta: Optional[Dict[str, object]] = None + + def need_numel_args(self): + r""" + Indicate whether we need provide numel as arguments for the generated + kernel calls in the benchmark. + + Should be true for pointwise/reduction kernels but false for triton + matmul kernels. + """ + return True + + def should_use_persistent_reduction(self) -> bool: + """ + Heuristic to set self.persistent_reduction and add guards + if needed. + """ + if not (self.inside_reduction and config.triton.persistent_reductions): + return False + threshold = { + ReductionHint.INNER: 1024, + }.get(self.reduction_hint, 64) + + # If multi_kernel is enabled, we do more aggressive persistent reduction. + # This may result in some persisent reductions slower than the + # corresponding non-persistent reductions. MultiKernel will do benchmarking + # to pick the faster one. 
+ if config.triton.multi_kernel: + threshold *= 16 + last_numel = self.numels[-1] + if not isinstance(last_numel, (int, sympy.Integer)): + # Not static + return False + hint = V.graph.sizevars.size_hint(last_numel) + if hint > threshold: + return False + # will need to recompile if we cross a larger power of 2 boundary + V.graph.sizevars.guard_leq(self.numels[-1], next_power_of_2(hint)) # type: ignore[arg-type] + return True + + def set_last_usage(self, nodes): + if not self.inside_reduction or self.persistent_reduction: + return + self.last_usage = set( + itertools.chain.from_iterable( + n.last_usage for n in nodes if n is not EnableReduction + ) + ) + + def initialize_range_tree(self, pid_cache): + no_r_dim = not self.inside_reduction or self.numels[-1] == 1 + + prefixes = "zyxr" + active_prefixes = prefixes[-len(self.numels) :] + + grid_dims = "xyz" + if self.no_x_dim: + tensor_dims = "r" + elif no_r_dim: + tensor_dims = "xyz" + else: + tensor_dims = "xyzr" + + tensor_dims = "".join(p for p in tensor_dims if p in active_prefixes) + + for i, prefix in enumerate(active_prefixes): + is_reduction = prefix == "r" + tensor_dim = tensor_dims.find(prefix) if prefix in tensor_dims else None + grid_dim = None if is_reduction else grid_dims.find(prefix) + index = i if grid_dim is None else grid_dim + self.range_trees.append( + IterationRangesRoot( + f"{prefix}index", + self.numels[i], + prefix, + index, + self, + pid_cache=pid_cache, + is_loop=is_reduction and not self.persistent_reduction, + tensor_dim=tensor_dim, + grid_dim=grid_dim, + ) + ) + for tree in self.range_trees: + # reduction indexing goes inside a loop + if not tree.is_loop: + tree.codegen_header(self.body) + if self.inside_reduction and self.range_trees[-1].is_loop: + # workaround for this issue: + # https://gist.github.com/jansel/6527126f781559095c5531f98a4235a7 + self.body.writeline(f"rbase = {self.range_trees[-1].ranges_code()}") + + def disable_reduction(self): + should_flush = 
self.range_trees[-1].is_loop + + @contextlib.contextmanager + def ctx(): + if self.numels[-1] == 1: + assert not self.inside_reduction + yield + return + if should_flush: + # calling codegen_body() will flush all the pending buffers + # and write out a reduction loop + self.codegen_body() + self.inside_reduction = False + try: + yield + if should_flush: + # flush out any code before opening the next loop + self.codegen_body() + finally: + self.inside_reduction = True + + return ctx() + + def set_ranges(self, *lengths): + assert len(lengths) == len(self.range_trees) + return [ + ranges.construct(length) + for length, ranges in zip(lengths, self.range_trees) + ] + + @staticmethod + def _split_iteration_ranges( + groups: Iterable[sympy.Expr], lengths: List[List[sympy.Expr]] + ): + sv = V.graph.sizevars + new_ranges: List[List[sympy.Expr]] = [[] for _ in groups] + remaining = [sv.simplify(g) for g in groups] + var_count = itertools.count() + + def add_range(i, expr): + expr = sv.simplify(expr) + if not sv.statically_known_multiple_of(remaining[i], expr): + raise CantSplit() + # guard on the last item out + remaining[i] = FloorDiv(remaining[i], expr) + new_ranges[i].append(expr) + return next(var_count) + + def make_combined(size, idx1, idx2): + def getter(flat_vars): + return size * flat_vars[idx1] + flat_vars[idx2] + + return getter + + return_getters_groups = [] + current_group = 0 + for length_group in lengths: + return_getters = [] + for size in length_group: + if sv.statically_known_equals(size, 1): # type: ignore[arg-type] + return_getters.append(lambda _: sympy.Integer(0)) + continue + + while ( + current_group < len(remaining) + and sv.size_hint(remaining[current_group]) == 1 + ): + # scroll to next group with remaining elements + current_group += 1 + + if sv.size_hint(size) > sv.size_hint(remaining[current_group]): + # need to break size in two + if not sv.statically_known_multiple_of( + size, remaining[current_group] + ): + raise CantSplit() + size1 = 
remaining[current_group] + size2 = FloorDiv(size, remaining[current_group]) + return_getters.append( + make_combined( + size2, + add_range(current_group, size1), + add_range(current_group + 1, size2), + ) + ) + else: + return_getters.append( + operator.itemgetter(add_range(current_group, size)) + ) + return_getters_groups.append(return_getters) + + assert all( + V.graph.sizevars.size_hint(s) == 1 for s in remaining + ), f"failed to set ranges {remaining} {lengths}" + + return new_ranges, return_getters_groups + + @classmethod + def is_compatible( + cls, groups: Iterable[sympy.Expr], lengths: List[List[sympy.Expr]] + ): + try: + cls._split_iteration_ranges(groups, lengths) + return True + except CantSplit: + return False + + def split_and_set_ranges(self, lengths: List[List[sympy.Expr]]): + """ + We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1). + + To do this we need to split up the iteration space of i0 into something like: + for i1 in s0: + for i2 in s1: + i0 = i1*s1 + i2 + .... + + This function matches and resplits lengths to the groups of + this kernel to enable tiled + non-tiled fusions. + """ + groups = [rt.numel for rt in self.range_trees] + if not self.inside_reduction: + groups[-1] = sympy.Integer(1) + + if len(lengths) == len(self.range_trees) and all( + V.graph.sizevars.simplify(sympy_product(x) - g) == 0 + for x, g in zip(lengths, groups) + ): + return self.set_ranges(*lengths) + + new_ranges, return_getters_groups = self._split_iteration_ranges( + groups, lengths + ) + itervars = list(itertools.chain.from_iterable(self.set_ranges(*new_ranges))) + return [[fn(itervars) for fn in fns] for fns in return_getters_groups] + + def is_indirect_indexing(self, index: sympy.Expr): + # tmpX means indirect indexing + return free_symbol_startswith(index, "tmp") + + def is_broadcasted(self, index: sympy.Expr): + # Note. 
This may not be correct when there is indirect indexing + if self.is_indirect_indexing(index): + return False + + index_numels = [1] * len(self.numels) + for symbol in index.free_symbols: + if symbol not in self.range_tree_nodes: + # Non-iterated variables, e.g. strides + continue + entry = self.range_tree_nodes[symbol] # type: ignore[index] + assert isinstance(entry.parent, IterationRangesRoot) + index_numels[entry.parent.index] *= entry.length + + # If the index variables only iterate over a subset of the kernel + # numels, then it must be broadcasted. + simplify = V.graph.sizevars.simplify + return any( + simplify(idx_range) != simplify(iter_range) # type: ignore[arg-type] + for idx_range, iter_range in zip(index_numels, self.numels) + ) + + def combine_contiguous_dims(self, index: sympy.Expr, tree: IterationRangesRoot): + """ + More aggressive simplification to merge contiguous dims + """ + if isinstance(index, (sympy.Integer, sympy.Symbol)): + return index + index_vars, sizes = tree.vars_and_sizes(index) + if len(sizes) <= 1: + return index + new_sizes, reindex, prune = V.graph.sizevars._simplify_loops( + index_vars, sizes, index_prevent_reordering([index], index_vars, sizes) + ) + if new_sizes == sizes: + return index + new_index_vars = tree.construct(new_sizes) + new_index = sympy_subs(index, dict(zip(index_vars, reindex(new_index_vars)))) + return new_index + + def index_to_str(self, index: sympy.Expr) -> str: + """ + Convert an index expr to a string that can be used in triton code. + e.g. a sympy expression "s2" may actually appear as "ks1" in the triton kernel. + + Index expressions often need to be passed in as arguments to the triton kernel. + Rename_indexing and codegen_indexing keep track of the needed indices and add + new parameters to the function signature. 
+ """ + if isinstance(index, list): + return f"[{', '.join(map(self.index_to_str, index))}]" + return texpr(self.rename_indexing(self.codegen_indexing(index))) + + def indexing( + self, + index: sympy.Expr, + *, + copy_shape=None, + dense_indexing=False, + override_mask=None, + block_ptr=False, + ) -> Union[IndexingOptions, BlockPtrOptions]: + """ + Compute the index and mask to pass to tl.load() or tl.store() + """ + index = self.simplify_indexing(index) + index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) + # if simple replacements didn't get rid of floor/ceil, try full subs + if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): + index = index.subs(V.graph.sizevars.precomputed_replacements) + # last resort, if no range vars are in the expr, hoist it + # TODO instead of trying to blindly find complicated exprs, we should hoist the + # inputs/outputs sizes and strides, but at the time indexing is generated + # kernel inputs and outputs are not set yet, we'd need a deeper refactor + # to do it this way + + if len(index.atoms(sympy.ceiling)): + for a in index.atoms(sympy.ceiling): + # for nested exprs, atoms yields top level first (?) 
+ # so if everything goes fine, lower level replacements will come up empty + symbols = a.free_symbols + if len(symbols) > 0 and all( + s.name.startswith("s") or s.name.startswith("ps") for s in symbols + ): + replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} + index = sympy_subs(index, replacements) + + index = self.simplify_indexing(index) + index_vars = index.free_symbols + has_rindex = False + + mask_vars: Set[str] = set() + for var in index_vars: + assert isinstance(var, sympy.Symbol) + has_rindex = has_rindex or var.name.startswith("r") + if override_mask: + pass + elif var.name.startswith("tmp"): + # indirect indexing + cse_var = self.cse.varname_map[var.name] + mask_vars.update(cse_var.mask_vars) + elif var.name.startswith(("s", "ps", "i", "u")): + pass + else: + # var is one of xN, yN or rN + assert var.name[0] in "xyr", var.name + mask_vars.add(f"{var.name[0]}mask") + + need_dense = ( + config.triton.dense_indexing + or dense_indexing + or self._load_mask is not None + ) and index != 0 + + have_dense = True + have_loop_vars = False + dense_mask_vars = set() + + for tree in self.active_range_trees(): + if index_vars.intersection(tree.var_list): + have_loop_vars = True + else: + have_dense = False + dense_mask_vars.add(f"{tree.prefix}mask") + + if ( + block_ptr + and config.triton.use_block_ptr + and not override_mask + and not self._load_mask + and len(mask_vars - dense_mask_vars) == 0 + and not self.is_indirect_indexing(index) + and have_loop_vars + # workaround https://github.com/openai/triton/issues/2821 + and self.index_dtype == "tl.int32" + ): + index_relative_to_xyr_index = sympy_subs( + index, {v: t.expr for v, t in self.range_tree_nodes.items()} + ) + range_trees = self.active_range_trees(reorder=True) + symbols = [t.symbol() for t in range_trees] + strides = [sympy.Wild(f"stride_{s}", exclude=symbols) for s in symbols] + offset = sympy.Wild("_offset", exclude=symbols) + m = index_relative_to_xyr_index.match(sympy_dot(symbols, 
strides) + offset) + # TODO(jansel): it is sometimes possible to do higher dimensional block_ptrs with + # a tl.reshape the correct block. We will miss these cases today. + if m: + self.filter_masks(mask_vars) + return BlockPtrOptions.create( + [m[s] for s in strides], + m[offset], + range_trees, + mask_vars, # type: ignore[arg-type] + ) + + expand_str = None + index_str = self.index_to_str(index) + if isinstance(index, sympy.Integer): + expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str() + index_str = f"tl.full({expand_str}, {index_str}, tl.int32)" + return IndexingOptions(index_str, set(), "None", expand_str, has_rindex) + + if need_dense and not have_dense: + expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str() + index_str = f"tl.broadcast_to({index_str}, {expand_str})" + mask_vars = dense_mask_vars + elif not have_loop_vars and copy_shape: + index_str = f"tl.broadcast_to({index_str}, {copy_shape}.shape)" + mask_vars = dense_mask_vars + + if override_mask: + mask_vars = {override_mask} + + if self._load_mask: + mask_vars.add(self._load_mask) + + self.filter_masks(mask_vars) + + mask_str = " & ".join(sorted(map(str, mask_vars))) if mask_vars else "None" + return IndexingOptions(index_str, mask_vars, mask_str, expand_str, has_rindex) # type: ignore[arg-type] + + def active_range_trees(self, reorder=False): + trees = [ + t for t in self.range_trees if t.prefix != "r" or self.inside_reduction + ] + if reorder and len(trees) > 1: + count = sum(t.prefix in "xyz" for t in trees) + assert "".join(t.prefix for t in trees[:count]) == "zyx"[-count:], [ + t.prefix for t in trees[:count] + ] + trees[:count] = reversed(trees[:count]) + return trees + + def filter_masks(self, mask_vars): + for tree in self.range_trees: + # Masks are superfluous if we only have one element + if V.graph.sizevars.statically_known_equals(tree.numel, 1): # type: ignore[arg-type] + mask_vars.discard(f"{tree.prefix}mask") + continue + # Masks are 
superfluous if numel is a multiple of BLOCK + # (We use the fact that BLOCK is required by triton to be a power of 2) + if tree.prefix.upper() not in config.triton.max_block: + continue + max_block = config.triton.max_block[tree.prefix.upper()] + # Optional optimization: if block divides numel exactly, we will + # never need to do a masked load to handle stragglers at the end. + # It's faster to avoid masking at all. But it is sound to always + # mask. + if V.graph.sizevars.statically_known_multiple_of(tree.numel, max_block): # type: ignore[arg-type] + mask_vars.discard(f"{tree.prefix}mask") + + def var_ranges(self): + return dict( + itertools.chain.from_iterable( + tree.var_ranges.items() for tree in self.range_trees + ) + ) + + def codegen_indexing(self, expr: sympy.Expr): + expr = V.graph.sizevars.simplify_with_ranges(expr, self.var_ranges()) + for sym in sorted(expr.free_symbols, key=str): + if sym in self.range_tree_nodes: + # if indexing expression is complicated, we precompute it on the host side + # and send the result as a kernel argument + replacements = {} + for ps in self.range_tree_nodes[sym].precomputed_args(): # type: ignore[index] + replacements[ps] = V.graph.sizevars.lookup_precomputed_size(ps) + if len(replacements) > 0: + self.range_tree_nodes[sym].expr = sympy_subs( # type: ignore[index] + self.range_tree_nodes[sym].expr, replacements # type: ignore[index] + ) + self.range_tree_nodes[sym].codegen() # type: ignore[index] + return expr + + @contextlib.contextmanager + def mask_loads(self, mask): + """Context manager to add an additional mask to tl.load/store""" + prior = self._load_mask + if prior: + mask = self.cse.generate(self.compute, f"{mask} & {prior}") + + self._load_mask = mask + try: + # TODO(jansel): do we need a reshape here? 
+ yield mask + finally: + self._load_mask = prior + + def generate_assert(self, check): + return torch.version.hip is None and super().generate_assert(check) + + def load_mask(self, var): + mask = "" + mask_vars = set(var.mask_vars) + if self._load_mask: + mask_vars.add(self._load_mask) + + if mask_vars: + mask = ( + f"{next(iter(mask_vars))}" + if len(mask_vars) == 1 + else f"({' & '.join(str(v) for v in mask_vars)})" + ) + return mask + + @property + def assert_function(self) -> str: + return "tl.device_assert" + + def get_strides_of_load(self, index: sympy.Expr): + """ + This gets the stride of the index for each of the tiling variables + (technically, it does it at index 0) + + For example, if + xindex = x0 + 512*x1 + 1024*r0 + x0 = (xindex//512) + x1 = (xindex % 512) + r0 = rindex // 1024 + + this function would return + {xindex: 512, rindex: 1024} + """ + index_to_tile_indexes = {k: v.expr for k, v in self.range_tree_nodes.items()} + index_in_tile_vars = sympy_subs(index, index_to_tile_indexes) # type: ignore[arg-type] + strides = {} + for range_tree in self.range_trees: + s = sympy_index_symbol(range_tree.name) + strides[s] = sympy_subs(index_in_tile_vars, {s: 1}) - sympy_subs( + index_in_tile_vars, {s: 0} + ) + return strides + + def codegen_block_ptr( + self, name: str, var: str, indexing: BlockPtrOptions, other="" + ) -> Tuple[str, Optional[DeferredLine], str]: + advance_block_ptr = None + check = indexing.boundary_check() + if not check: + # workaround https://github.com/openai/triton/issues/2813 + other = "" + elif other: + assert other == ", other=0.0" + other = f", boundary_check={check!r}, padding_option='zero'" + else: + other = f", boundary_check={check!r}" + if ( + self.inside_reduction + and self.range_trees[-1].is_loop + and indexing.has_rindex() + ): + block_ptr = f"block_ptr{next(self.block_ptr_id)}" + self.body.writeline( + DeferredLine( + name, f"{block_ptr} = {indexing.format(var, roffset=False)}" + ) + ) + advance_block_ptr = DeferredLine( 
+ name, + f"{block_ptr} = tl.advance({block_ptr}, {indexing.advance_roffset()})", + ) + else: + block_ptr = indexing.format(var) + return block_ptr, advance_block_ptr, other + + def codegen_block_ptr_store_line(self, name, indexing, block_ptr, value, other=""): + # broadcasting is not implicit for block_ptrs + value = ( + f"tl.broadcast_to({value}, {self.index_to_str(indexing.reshape_suffix)})" + ) + # drop any extra size=1 dimensions + value = triton_reshape(value, indexing.reshape_suffix, indexing.block_shape) + # workaround https://github.com/openai/triton/issues/2814 + value = f"{value}.to({triton_store_type(V.graph.get_dtype(name))})" + return f"tl.store({block_ptr}, {value}{other})" + + def load(self, name: str, index: sympy.Expr): + var = self.args.input(name) + indirect_indexing = self.is_indirect_indexing(index) + original_index = index + indexing = self.indexing(index, block_ptr=True) + has_rindex = indexing.has_rindex() + has_tmpmask = indexing.has_tmpmask() + + # Keep the variable in cache if were going to reuse it. Equiv., if any of the following hold + # 1) We are doing broadcasting + # 2) It is a non-coalesced load. The intuition is that if it's + # non-coalesced, we will likely load each element multiple times in + # practice. + # 3) It will be used later and it won't be CSE'd. 
Equiv., if all the following hold + # 3.1) We are in a reduction loop + # 3.2) Its not its last use + # 3.3) This load will not be lifted to the body + # + is_coalesced = any( + i == 1 for i in self.get_strides_of_load(original_index).values() + ) + if self.is_broadcasted(original_index): + ep = ", eviction_policy='evict_last'" + elif not is_coalesced: + ep = ", eviction_policy='evict_last'" + elif self.inside_reduction and self.range_trees[-1].is_loop: + if name in self.args.inplace_buffers: + names = set(self.args.inplace_buffers[name].other_names) + else: + names = {name} + last_use = len(names & self.last_usage) > 0 + evict_last = not last_use and (has_rindex or indirect_indexing) + if evict_last: + ep = ", eviction_policy='evict_last'" + else: + ep = ", eviction_policy='evict_first'" + else: + ep = "" + # "other" below is a workaround for https://github.com/openai/triton/issues/737 + # for bool, even though it's likely subject to the same bug, setting `other` leads + # to LLVM errors so we are skipping it for now + if ( + (has_tmpmask or has_rindex) + and V.graph.get_dtype(name) != torch.bool + and indexing.has_mask() + ): + other = ", other=0.0" + else: + other = "" + + advance_block_ptr = None + append_broadcast = None + if V.graph.is_unspec_arg(name): + line = var + else: + if isinstance(indexing, BlockPtrOptions): + block_ptr, advance_block_ptr, other = self.codegen_block_ptr( + name, var, indexing, other + ) + line = f"tl.load({block_ptr}{other}{ep})" + # add needed size=1 dimensions + line = triton_reshape( + line, indexing.block_shape, indexing.reshape_suffix + ) + elif isinstance(original_index, sympy.Integer): + line = f"tl.load({var} + ({original_index}))" + append_broadcast = indexing.expand_str + else: + line = f"tl.load({var} + ({indexing.index_str}), {indexing.mask_str}{ep}{other})" + + dtype = V.graph.get_dtype(name) + if dtype in (torch.float16, torch.bfloat16): + line += ".to(tl.float32)" + if dtype == torch.bool and torch.version.hip is None: 
+ # Workaround for https://github.com/openai/triton/issues/2151 + # tl.load returns int8 when loading from pointer to int1 + # NOTE: Currently causes hangs on bool UTs for ROCm + line += ".to(tl.int1)" + + if has_tmpmask: + # Masked loads must come after the mask is computed + load_buffer = self.compute + elif ( + self.inside_reduction + and self.range_trees[-1].is_loop + and not indirect_indexing + and not has_rindex + ): + # can lift a common load outside of reduction loop + # One exception is when this is an indirect_load. + load_buffer = self.body + else: + load_buffer = self.loads + + result_var = self.cse.generate(load_buffer, line) + assert isinstance(result_var, TritonCSEVariable) + result_var.mask_vars = indexing.mask_vars # type: ignore[assignment] + + if append_broadcast: + line = f"tl.broadcast_to({result_var}, {append_broadcast})" + result_var = self.cse.generate(load_buffer, line) + + if advance_block_ptr: + load_buffer.writeline(advance_block_ptr) + + if not self.inside_reduction or (not indexing.has_rmask() and not has_rindex): + self.outside_loop_vars.add(result_var) + + return result_var + + def store( + self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None + ) -> None: + var = self.args.output(name) + original_index = index + indexing = self.indexing(index, dense_indexing=True, block_ptr=mode is None) + + # Guard against write-after-read corruption in triton. + # See # https://github.com/openai/triton/issues/1615 + # This triton bug means that a load which is broadcasted over multiple + # warps may see the result of a store that happens later in the triton + # program. The workaround is to add a barrier before storing, which + # enforces that all warps have already read the data. 
+ is_inplace = name in self.args.inplace_buffers + is_broadcasted = self.is_broadcasted(original_index) + if is_inplace and is_broadcasted: + self.stores.writeline(DeferredLine(name, "tl.debug_barrier()")) + + advance_block_ptr = None + if isinstance(indexing, BlockPtrOptions): + block_ptr, advance_block_ptr, other = self.codegen_block_ptr( + name, var, indexing + ) + # block_ptr stores don't do implicit casting + line = self.codegen_block_ptr_store_line( + name, indexing, block_ptr, value, other + ) + elif mode is None: + line = f"tl.store({var} + ({indexing.index_str}), {value}, {indexing.mask_str})" + elif mode == "atomic_add": + line = f"tl.atomic_add({var} + ({indexing.index_str}), {value}, {indexing.mask_str})" + else: + raise NotImplementedError(f"store mode={mode}") + self.stores.writeline(DeferredLine(name, line)) + if advance_block_ptr: + self.stores.writeline(advance_block_ptr) + + if not self.inside_reduction: + self.outside_loop_vars.add(value) + + def bucketize( + self, + values: CSEVariable, + offsets_name: str, + offsets_size: sympy.Expr, + indexing_dtype: torch.dtype, + right: bool, + ) -> CSEVariable: + """ + See [Note: Inductor bucketize op] + """ + + # Triton performance for bucketize_binary_search is much better when the number + # of threads equals the number of elements. + # If we're trying to use a bucketize kernel, we should make sure that an + # autotuning config with num_elements_per_warp=32 exists. 
+ self.autotune_hints.add(AutotuneHint.ELEMENTS_PER_WARP_32) + + offsets_ptr = self.args.input(offsets_name) + block_size = self.dense_size_str() + offsets_size_str = self.index_to_str(offsets_size) + + if indexing_dtype == torch.int32: + triton_dtype = "tl.int32" + elif indexing_dtype == torch.int64: + triton_dtype = "tl.int64" + else: + raise NotImplementedError( + "Bucketize only supports indexing with int32 and int64" + ) + + result = self.cse.generate( + self.compute, + f"triton_helpers.bucketize_binary_search({values}, {offsets_ptr}, {triton_dtype}, {right}, {offsets_size_str}, {block_size})", # noqa: B950 line too long + ) + + return result + + def reduction_resize(self, value): + ndims = self.triton_tensor_ndim() + if ndims == 1: + return f"triton_helpers.promote_to_tensor({value})" + + sizes = [":"] * ndims + sizes[-1] = "None" + return f"{value}[{', '.join(sizes)}]" + + @staticmethod + def _map_tuple_or_scalar(fn, value): + if isinstance(value, tuple): + return tuple(map(fn, value)) + return fn(value) + + def reduction( + self, + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: Union[CSEVariable, Tuple[CSEVariable, ...]], + ) -> Union[CSEVariable, Tuple[CSEVariable, ...]]: + assert self.inside_reduction + masks = {f"{tree.prefix}mask" for tree in self.range_trees} + self.filter_masks(masks) + masks = sorted(masks) + if self._load_mask: + masks.append(self._load_mask) + reduction_range_prefix = self.range_trees[-1].prefix + + # Say we have + # tmp0 = ops.constant(1, torch.int64) + # tmp1 = ops.reduction(torch.int64, torch.int64, "sum", tmp0) + # tmp0 in the triton code is either a scalar, or single-element tensor + # so if we emit tl.sum directly, it will only give 1 instead of RBLOCK * 1 + # To avoid this, we broadcast to the expected shape first. 
+ dense_size_str = self.dense_size_str() + value = self._map_tuple_or_scalar( + lambda v: self.cse.generate( + self.compute, f"tl.broadcast_to({v}, {dense_size_str})" + ), + value, + ) + + dim: int + root_op: str + + def final_reduction(value): + use_helper = reduction_type in {"any", "max", "min", "prod"} + module = "triton_helpers" if use_helper else "tl" + if reduction_type in {"max", "min"}: + return self.reduction_resize( + f"{module}.{reduction_type}2({value}, {dim})" + ) + return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})") + + def final_argreduce(buffer, result_var, value, index): + buffer.splice( + f"""\ + _, {result_var}_tmp = triton_helpers.{root_op}_with_index({value}, {index}, {dim}) + {result_var} = {self.reduction_resize(f'{result_var}_tmp')} + """ + ) + + cache_key = (src_dtype, reduction_type, value) + if cache_key in self.cse.reduction_cache: + return self.cse.reduction_cache[cache_key] + + dim = self.triton_tensor_ndim() - 1 + acc_type = triton_acc_type(src_dtype) + result_var: Any = self.cse.newvar() + result_var.mask_vars = {var for var in masks if var[0] != "r"} + cond = " & ".join(masks) + + def where_cond(tval, fval): + if not cond: + return tval + return TritonKernelOverrides.where(cond, tval, fval) + + if self.persistent_reduction: + default = ir.Reduction.default_value(reduction_type, src_dtype) + default = self._map_tuple_or_scalar(triton_constant, default) + + def _mask_value(value, default): + return self.cse.generate(self.compute, where_cond(value, default)) + + if isinstance(value, tuple): + masked_value = [_mask_value(v, d) for v, d in zip(value, default)] + else: + masked_value = _mask_value(value, default) + + if reduction_type in {"argmax", "argmin"}: + accumulator_index = str( + self.cse.generate( + self.compute, + f"tl.broadcast_to({reduction_range_prefix}index, {masked_value}.shape)", + ) + ) + root_op = {"argmax": "max", "argmin": "min"}[reduction_type] + final_argreduce( + self.compute, result_var, 
masked_value, accumulator_index + ) + elif reduction_type == "welford_reduce": + # For persistent reductions, don't bother with + # welford's algorithm since it uses more registers, and + # taking two reductions doesn't increase memory usage. + sum_ = ops.reduction(dtype, dtype, "sum", value) + self.inside_reduction = False + rnumel = ops.index_expr(self.numels[-1], dtype) + mean = ops.truediv(sum_, rnumel) + + self.inside_reduction = True + dx = ops.sub(value, mean) + dx2 = ops.mul(dx, dx) + m2 = ops.reduction(dtype, dtype, "sum", dx2) + result_var = (mean, m2, rnumel) + elif reduction_type == "welford_combine": + mean, m2, weight = masked_value + welford = f"triton_helpers.welford({mean}, {m2}, {weight}, {dim})" + mean, m2, weight = (self.cse.newvar() for _ in range(3)) + self.compute.writeline(f"{mean}, {m2}, {weight} = {welford}") + + result_var = tuple( + self.cse.generate(self.compute, self.reduction_resize(var_name)) + for var_name in (mean, m2, weight) + ) + else: + result_var = self.cse.generate( + self.compute, final_reduction(masked_value) + ) + else: + accumulator = f"_{result_var}" + default = ir.Reduction.default_accumulator(reduction_type, src_dtype) + default = self._map_tuple_or_scalar(triton_constant, default) + if not isinstance(default, tuple): + self.body.writeline( + f"{accumulator} = tl.full({self.dense_size_str()}, {default}, {acc_type})" + ) + + if reduction_type in {"argmax", "argmin"}: + accumulator_index = f"_{result_var}_index" + long_max = torch.iinfo(torch.int64).max + self.body.writeline( + f"{accumulator_index} = tl.full({self.dense_size_str()}, {long_max}, tl.int64)" + ) + root_op = {"argmax": "max", "argmin": "min"}[reduction_type] + + self.compute.splice( + f"""\ + {accumulator}_next, {accumulator_index}_next = triton_helpers.{root_op}imum_with_index( + {accumulator}, {accumulator_index}, {value}, {reduction_range_prefix}index + ) + {accumulator} = {where_cond(f'{accumulator}_next', accumulator)} + {accumulator_index} = 
{where_cond(f'{accumulator_index}_next', accumulator_index)} + """ + ) + final_argreduce(self.suffix, result_var, accumulator, accumulator_index) + elif is_welford_reduction(reduction_type): + accumulator = f"{result_var}_mean" + accumulator_m2 = f"{result_var}_m2" + accumulator_weight = f"{result_var}_weight" + self.body.writeline( + f"{accumulator} = tl.zeros({self.dense_size_str()}, {acc_type})" + ) + self.body.writeline( + f"{accumulator_m2} = tl.zeros({self.dense_size_str()}, {acc_type})" + ) + self.body.writeline( + f"{accumulator_weight} = tl.zeros({self.dense_size_str()}, {acc_type})" + ) + + if reduction_type == "welford_combine": + mean, m2, weight = value + self.compute.splice( + f"""\ + {accumulator}_next, {accumulator_m2}_next, {accumulator_weight}_next = triton_helpers.welford_combine( + {accumulator}, {accumulator_m2}, {accumulator_weight}, + {mean}, {m2}, {weight} + ) + """ + ) + else: + assert reduction_type == "welford_reduce" + self.compute.splice( + f"""\ + {accumulator}_next, {accumulator_m2}_next, {accumulator_weight}_next = triton_helpers.welford_reduce( + {value}, {accumulator}, {accumulator_m2}, {accumulator_weight}, roffset == 0 + ) + """ + ) + + self.compute.splice( + f"""\ + {accumulator} = {where_cond(f'{accumulator}_next', accumulator)} + {accumulator_m2} = {where_cond(f'{accumulator_m2}_next', accumulator_m2)} + {accumulator_weight} = {where_cond(f'{accumulator_weight}_next', accumulator_weight)} + """ + ) + + result_mean = result_var + result_m2 = self.cse.newvar() + result_weight = self.cse.newvar() + self.suffix.splice( + f"""\ + {result_mean}_tmp, {result_m2}_tmp, {result_weight}_tmp = triton_helpers.welford( + {accumulator}, {accumulator_m2}, {accumulator_weight}, {dim} + ) + {result_mean} = {self.reduction_resize(f'{result_mean}_tmp')} + {result_m2} = {self.reduction_resize(f'{result_m2}_tmp')} + {result_weight} = {self.reduction_resize(f'{result_weight}_tmp')} + """ + ) + result_var = result_mean, result_m2, result_weight + 
else: + combine_fn = ir.get_reduction_combine_fn(reduction_type, src_dtype) + updated = combine_fn(accumulator, value) + self.compute.writeline( + f"{accumulator} = {where_cond(updated, accumulator)}" + ) + + if src_dtype == torch.bool: + # This is only really used for aten.any. It changes the + # final reduction of a non-persistent reduction from + # tmp5 = triton_helpers.max(_tmp5, 1)[:, None] + # to + # tmp5 = triton_helpers.max(_tmp5.to(tl.int8), 1)[:, None].to(tl.int1) + # which is needed because tl.reduce doesn't support tl.int1 + accumulator = f"{accumulator}.to(tl.int8)" + result_type = triton_compute_type(dtype) + self.suffix.writeline( + f"{result_var} = {final_reduction(accumulator)}.to({result_type})" + ) + else: + self.suffix.writeline( + f"{result_var} = {final_reduction(accumulator)}" + ) + + self.cse.reduction_cache[cache_key] = result_var + + if isinstance(result_var, tuple): + self.outside_loop_vars |= set(result_var) + else: + self.outside_loop_vars.add(result_var) + + return result_var + + def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): + assert self.inside_reduction + self.inside_reduction = False + indexing = self.indexing(index, block_ptr=True) + self.inside_reduction = True + var = self.args.output(name) + + if isinstance(indexing, BlockPtrOptions): + self.suffix.writeline( + DeferredLine( + name, + self.codegen_block_ptr_store_line( + name, + indexing, + indexing.format(var), + value, + f", boundary_check={indexing.boundary_check()!r}", + ), + ) + ) + else: + assert isinstance(indexing, IndexingOptions) + self.suffix.writeline( + DeferredLine( + name, + f"tl.store({var} + ({indexing.index_str}), {value}, {indexing.mask_str})", + ) + ) + + def _lift_helper(self, fn, num_args) -> str: + # Lift IR function into a triton function in the global namespace + helper = IndentedBuffer() + helper.writeline("@triton.jit") + args = [f"arg{n}" for n in range(num_args)] + signature = ", ".join(args) + helper.writeline(f"def 
{{name}}({signature}):") + + cse = CSE(prefix="", suffix="") + overrides = TritonOverrides(V.MockHandler()) + + class CSEProxy: + def __getattr__(self, name: str) -> Callable[..., CSEVariable]: + def inner(*args, **kwargs): + return cse.generate( + helper, + getattr(overrides, name)(*args, **kwargs), + ) + + return inner + + with helper.indent(), V.set_ops_handler(CSEProxy()): + outputs = fn(*args) + helper.writeline(f"return {outputs}") + + return self.helper_functions.add(helper.getvalue()) + + def scan( + self, + dtype: torch.dtype, + combine_fn: Callable[[CSEVariable, CSEVariable], CSEVariable], + value: CSEVariable, + init: int, + ) -> CSEVariable: + assert self.inside_reduction + masks = {f"{tree.prefix}mask" for tree in self.range_trees} + self.filter_masks(masks) + masks = sorted(masks) + if self._load_mask: + masks.append(self._load_mask) + reduction_range_prefix = self.range_trees[-1].prefix + + value = self.cse.generate( + self.compute, f"tl.broadcast_to({value}, {self.dense_size_str()})" + ) + + default = triton_constant(init) + dim = self.triton_tensor_ndim() - 1 + acc_type = triton_acc_type(dtype) + cond = " & ".join(masks) + + combine_helper_fn = self._lift_helper(combine_fn, 2) + + def where_cond(value): + if not cond: + return value + default_tensor = self.cse.generate( + self.body, + f"tl.full({[1] * self.triton_tensor_ndim()}, {default}, {triton_compute_type(dtype)})", + ) + return self.cse.generate( + self.compute, f"tl.where({cond}, {value}, {default_tensor})" + ) + + if self.persistent_reduction: + masked_value = where_cond(value) + result_var = self.cse.generate( + self.compute, + f"tl.associative_scan({masked_value}, {dim}, {combine_helper_fn})", + ) + else: + accumulator = self.cse.newvar() + reduced_size = self.dense_size_list() + reduced_size[-1] = "1" + reduced_size = f"[{', '.join(reduced_size)}]" + + self.body.writeline( + f"{accumulator} = tl.full({reduced_size}, {default}, {acc_type})" + ) + + masked_value = where_cond(value) + 
partial_reduce = self.cse.generate( + self.compute, + self.reduction_resize( + f"tl.reduce({value}, {dim}, {combine_helper_fn})" + ), + ) + acc_next = combine_fn(accumulator, partial_reduce) + partial_scan = self.cse.generate( + self.compute, + f"tl.associative_scan({masked_value}, {dim}, {combine_helper_fn})", + ) + result_var = self.cse.generate( + self.compute, combine_fn(accumulator, partial_scan) + ) + self.compute.writeline(f"{accumulator} = {acc_next}") + + result_var.mask_vars = masks # type: ignore[attr-defined] + return result_var + + def codegen_body(self): + """ + Concat output code from index_code, loads, compute, stores, + suffix into self.body. + + For pointwise kernels, this is called just once at the end. + + For reduction kernels, this generates a loop over the reduction + axis. + """ + if not ( + self.indexing_code + or self.loads + or self.stores + or self.compute + or self.suffix + ): + return + + if self.inside_reduction and self.range_trees[-1].is_loop: + self.body.writeline("for roffset in range(0, rnumel, RBLOCK):") + with self.body.indent(): + # last range tree is always reduction + self.range_trees[-1].codegen_header(self.body) + self.body.splice(self.indexing_code) + self.body.splice(self.loads) + self.body.splice(self.compute) + self.body.splice(self.stores) + + # invalidate any caches that came from inside the reduction loop + self.cse.invalidate(self.outside_loop_vars) + self.range_trees[-1].cache_clear() + else: + self.body.splice(self.indexing_code) + self.body.splice(self.loads) + self.body.splice(self.compute) + self.body.splice(self.stores) + self.body.splice(self.suffix) + self.indexing_code.clear() + self.loads.clear() + self.compute.clear() + self.stores.clear() + self.suffix.clear() + + def codegen_kernel_benchmark(self, num_gb, grid=None): + result = IndentedBuffer() + argdefs, call_args, signature = self.args.python_argdefs() + + result.writelines(["", "", "def get_args():"]) + with result.indent(): + name_cnt = 
itertools.count() + var_names = [] + for arg_name, arg_sig in zip(call_args, signature): + var_name = f"arg_{next(name_cnt)}" + buf = V.graph.get_buffer(arg_name) + if buf: + result.writeline( + f"{var_name} = rand_strided({V.graph.sizevars.size_hints(buf.get_size())}, {V.graph.sizevars.size_hints(buf.get_stride())}, device='{buf.get_device()}', dtype={buf.get_dtype()})" # noqa: B950 line too long + ) + elif arg_name in V.graph.constants: + # note that random seed is put in V.graph.constants + const_tensor = V.graph.constants[arg_name] + result.writeline( + f"{var_name} = rand_strided({V.graph.sizevars.size_hints(const_tensor.size())}, {V.graph.sizevars.size_hints(const_tensor.stride())}, device='{const_tensor.device}', dtype={const_tensor.dtype})" # type: ignore[arg-type] # noqa: B950 line too long + ) + elif isinstance(arg_sig, SizeArg): + symval_hint = V.graph.sizevars.size_hint(arg_sig.expr) + + # Force the seed_offset to be 0 so calls to the same kernel + # using different seed offset will have the same benchmark harness. + # We can dedup kernel definitions in this case. 
+ if "seed_offset" in arg_sig.name: + symval_hint = 0 + result.writeline(f"{var_name} = {symval_hint}") + else: + raise KeyError( + f"Don't find the buffer or const tensor for {arg_name}" + ) + var_names.append(var_name) + result.writeline(f"return {', '.join(var_names)},") + + result.writelines(["\n", "\n", "def call(args):"]) + if grid is None: + grid = [] + extra_args = [] + extra_args_str = None + for tree in self.active_range_trees(): + expr = pexpr(V.graph.sizevars.size_hint(tree.numel)) + extra_args.append(expr) + if tree.prefix != "r": + grid.append(expr) + if self.need_numel_args(): + extra_args_str = ", ".join(map(str, extra_args)) + ", " + else: + extra_args_str = "" + grid_arg = f"{extra_args_str}grid=grid({', '.join(grid)})" + else: + grid_arg = f"grid={grid}" + index = V.graph.scheduler.current_device.index + with result.indent(): + result.writeline(f"with {V.graph.device_ops.device_guard(index)}:") + with result.indent(): + result.writeline( + V.graph.device_ops.set_device(index) + ) # no-op to ensure context + stream_name = f"stream{index}" + result.writeline(f"{stream_name} = get_raw_stream({index})") + result.writeline( + f"{str(Placeholder.KERNEL_NAME)}.run(*args, {grid_arg}, stream={stream_name})" + ) + + # benchmark all configs + result.writelines(["\n", "\n", "def benchmark_all_configs(args):"]) + with result.indent(): + result.writeline(f"with {V.graph.device_ops.device_guard(index)}:") + with result.indent(): + result.writeline( + V.graph.device_ops.set_device(index) + ) # no-op to ensure context + result.writeline( + f"return {str(Placeholder.KERNEL_NAME)}.benchmark_all_configs(*args, {grid_arg})" + ) + + result.writelines(["\n", "\n", "if __name__ == '__main__':"]) + with result.indent(): + result.writeline("from triton.testing import do_bench") + result.writeline("") + + result.writeline("args = get_args()") + result.writeline( + "ms = do_bench(lambda: call(args), rep=40, fast_flush=True)" + ) + result.writeline(f"num_gb = {num_gb}") + 
result.writeline("gb_per_s = num_gb / (ms / 1e3)") + result.writeline( + 'print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")' + ) + + return result + + def imports_for_benchmark_kernel(self): + return textwrap.dedent( + """ + from torch._dynamo.testing import rand_strided + {} + import torch + from torch._inductor.triton_heuristics import grid, split_scan_grid + """.format( + V.graph.device_ops.import_get_raw_stream_as("get_raw_stream") + ) + ) + + def estimate_kernel_num_bytes(self): + """ + Try the best to estimate the total size (in bytes) of the + kernel's inputs and outputs, which is used for estimating the memory + throughput of this kernel. This information is used for checking how + far we are from the peak memory bandwidth. It's important that + we want to avoid overestimating the sizes of the inputs and outputs, + because it can wrongfully give us a very large memory traffic value, + which may be even larger than the theoretical bandwidth and thus + become very misleading. This is particularly problematic for cases + where we slice some inputs. In those cases, we should only count + the size of the "slices" instead of the original inputs, because + only the slices contribute to the real memory traffic. + """ + nbytes = [] + ninplace_args = len(unique(self.args.inplace_buffers.values())) + _, call_args, _ = self.args.python_argdefs() + + # For pointwise and reduction kernels, this is the upper-bound numels + # for the output buffer. + # FIXME: This is not exactly right for cases like below: + # def foo(tensor0, tensor1): + # x0 = narrow(tensor0) + # return cat(x0, tensor1) + # For this example, we will end up overestimate the size for the + # slice s0. Potentially, we could have precise inputs information + # if we maintained the original inputs of the Pointwise kernel created + # for the "cat". However, I think it might be a bit overwhelming that + # we add such complexity only for handling some particular cases for + # benchmarking. 
+ out_numel = V.graph.sizevars.size_hint(sympy_product(self.numels)) + for i, arg in enumerate(call_args): + # "buf" may be narrowed. In this case, the number of memory accesses + # should be estimated based on the reinterpreted layout. + # On the other hand, buf may be broadcasted. In this case, + # counting the size of the underline storage would give us + # a better estimation in terms of memory accesses. + if arg not in self.buf_accesses: + nbytes.append(0) + continue + arg_numel = V.graph.get_numel(arg) + buf_size = V.graph.sizevars.size_hint(arg_numel) + if buf_size > out_numel: + # This arg points to a buf that has been sliced. + # We need to count each individual slice to have + # a better estimation. + indices: Set[Any] = set() + no_index_dep_count = 0 + for dep in self.buf_accesses[arg]: + if isinstance(dep, (StarDep, WeakDep)): + indices.add(f"no_index_dep_{no_index_dep_count}") + no_index_dep_count += 1 + else: + indices.add(dep.index) + numel = len(indices) * out_numel + else: + numel = buf_size + dtype = V.graph.get_dtype(arg) + dtype_size = get_dtype_size(dtype) + nbytes.append(numel * dtype_size * (1 + int(i < ninplace_args))) + return sum(nbytes) + + def _get_heuristic(self): + if self.persistent_reduction: + assert self.inside_reduction + return "persistent_reduction" + elif self.inside_reduction: + return "reduction" + return "pointwise" + + def codegen_kernel(self, name=None): + code = IndentedBuffer() + + size_hints = [] + for numel in self.numels: + numel_hint = V.graph.sizevars.symbolic_hint(numel) + if not isinstance(numel_hint, (int, sympy.Integer)): + # This default heuristic hint was picked carefully: it is + # large, to ensure that we don't shrink the block size (since + # if you don't have many elements, it'd be wasteful to pick a + # large block size). Since we don't know how many elements we + # might have, we should be OK with some inefficiency to make + # sure we handle the large case well. 
8192 is the largest + # block size we support, so we pick that. + # + # If we have a better hint for unbacked SymInts (e.g., because + # a user told us, or we are tracking upper bounds) we could + # use that here. + size_hint = 8192 + else: + size_hint = next_power_of_2(int(numel_hint)) + size_hints.append(size_hint) + + if not self.inside_reduction: + size_hints.pop() + + heuristics = self._get_heuristic() + + if name is None: + code.splice(gen_common_triton_imports()) + + if config.benchmark_kernel: + code.splice(self.imports_for_benchmark_kernel()) + + argdefs, _, signature = self.args.python_argdefs() + # maps actual expression to SizeArg if it is in sizevars replacements + for i, arg in enumerate(signature): + if isinstance(arg, SizeArg): + # mypy is unhappy about the sympy.Expr + # type for the key of the dict below + symbol = cast(sympy.Symbol, arg.expr) + if symbol in V.graph.sizevars.inv_precomputed_replacements: + signature[i] = SizeArg( + arg.name, V.graph.sizevars.inv_precomputed_replacements[symbol] + ) + + mutated_args = set() + for mutation in self.mutations: + if mutation in self.args.input_buffers: + mutated_args.add(self.args.input_buffers[mutation]) + if ( + mutation in self.args.inplace_buffers + and mutation not in V.graph.removed_buffers + and mutation not in self.removed_buffers + ): + mutated_args.add(self.args.inplace_buffers[mutation].inner_name) + if mutation in self.args.output_buffers: + mutated_args.add(self.args.output_buffers[mutation]) + mutated_args = sorted(mutated_args) + + triton_meta_signature = signature_to_meta( + signature, size_dtype=self.index_dtype + ) + triton_meta = { + "signature": triton_meta_signature, + "device": V.graph.scheduler.current_device.index, + "device_type": V.graph.scheduler.current_device.type, + "constants": {}, + } + + inductor_meta = { + "autotune_hints": set(self.autotune_hints), + "kernel_name": str(Placeholder.DESCRIPTIVE_NAME), + "mutated_arg_names": mutated_args, + "no_x_dim": self.no_x_dim, + 
"backend_hash": torch.utils._triton.triton_hash_with_backend(), + } + num_gb = None + if config.benchmark_kernel or config.profile_bandwidth: + num_gb = self.estimate_kernel_num_bytes() / 1e9 + inductor_meta["kernel_num_gb"] = num_gb + + for tree in self.active_range_trees(): + sizearg = SizeArg(f"{tree.prefix}numel", tree.numel) + signature.append(sizearg) + triton_meta_signature[len(argdefs)] = signature_of( + sizearg, size_dtype=self.index_dtype + ) + argdefs.append(f"{tree.prefix}numel") + # constexpr version causes issues, see + # https://github.com/pytorch/torchdynamo/pull/1362 + # triton_meta["constants"][len(argdefs)] = V.graph.sizevars.size_hint( + # tree.numel + # ) + # argdefs.append(f"{tree.prefix}numel: tl.constexpr") + triton_meta["configs"] = [config_of(signature)] + + # Triton compiler includes equal_to_1 args into constants even + # when they are not constexpr. otherwise there may be a segfault + # during launching the Inductor-compiled Triton kernel. + # https://github.com/pytorch/pytorch/issues/120478#issuecomment-1962822307 + # https://github.com/openai/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384 + for arg_num in triton_meta["configs"][0].equal_to_1: # type: ignore[index] + triton_meta["constants"][arg_num] = 1 # type: ignore[index] + + self.triton_meta = triton_meta + + for tree in self.range_trees: + if tree.prefix == "r" and self.persistent_reduction: + # RBLOCK for persistent_reduction is defined in codegen_static_numels + continue + if tree.tensor_dim is None: + continue + argdefs.append(f"{tree.prefix.upper()}BLOCK : tl.constexpr") + + self.codegen_body() + + for helper in self.helper_functions: + code.writeline("") + code.splice(helper) + + if self.inside_reduction: + reduction_hint = self.reduction_hint + heuristics_line = f""" + @triton_heuristics.{heuristics}( + size_hints={size_hints!r}, + reduction_hint={reduction_hint}, + filename=__file__, + triton_meta={triton_meta!r}, + 
def codegen_static_numels(self, code):
    """
    Hard-code numel values that are statically known.

    For a kernel like
        def KERNEL_NAME(in_ptr0, ..., xnumel, rnumel, XBLOCK: tl.constexpr, ...):
    we emit e.g. ``xnumel = 4096`` immediately after the signature,
    shadowing the launch-time argument with a literal. Triton then sees a
    constant, which is a better signal for unrolling and static indexing.

    Also pins ``RBLOCK`` (next power of two of the reduction numel) for
    persistent reductions, and ``XBLOCK = 1`` when the kernel has no x dim.
    """
    for tree in self.range_trees:
        numel_expr = V.graph.sizevars.simplify(tree.numel)
        is_static = isinstance(numel_expr, (sympy.Integer, int))

        # r-numel is only an argument while we are inside the reduction.
        if self.inside_reduction or tree.prefix != "r":
            if is_static:
                code.writeline(f"{tree.prefix}numel = {int(numel_expr)}")

        if tree.prefix == "r" and self.persistent_reduction:
            if not is_static:
                continue
            rblock = next_power_of_2(int(numel_expr))
            code.writeline(f"RBLOCK: tl.constexpr = {rblock}")

        if tree.prefix == "x" and self.no_x_dim:
            code.writeline("XBLOCK: tl.constexpr = 1")
range(len(call_args)): + if V.graph.is_unspec_arg(call_args[i]): + call_args[i] = call_args[i] + ".item()" + + return call_args + + def call_kernel(self, name: str, node: Optional[IRNode] = None): + wrapper = V.graph.wrapper_code + call_args = self.get_call_args() + grid: List[Any] = [] + self.add_numel_to_call_args_and_grid(name, call_args, grid) + current_device = V.graph.scheduler.current_device + + if self.args.workspace_arg is not None: + ws = self.args.workspace_arg + wrapper.generate_workspace_allocation( + ws.nbytes, current_device, ws.zero_fill + ) + + grid = wrapper.generate_default_grid(name, grid) + wrapper.generate_kernel_call( + name, + call_args, + grid, + current_device.index, + cuda=True, + triton=True, + grid_fn=self._get_grid_fn(), + triton_meta=self.triton_meta, + ) + + if self.args.workspace_arg is not None: + wrapper.writeline(wrapper.make_free_by_names(["workspace"])) + + def codegen_nan_check(self): + wrapper = V.graph.wrapper_code + _, call_args, arg_types = self.args.python_argdefs() + for arg, arg_type in zip(call_args, arg_types): + if isinstance(arg_type, TensorArg): + line = f"assert not {arg}.isnan().any().item()" + wrapper.writeline(line) + line = f"assert not {arg}.isinf().any().item()" + wrapper.writeline(line) + + def warn_mix_layout(self, kernel_name): + """ + Print message if the kernel have mixed layout inputs. + Only care about 4D tensor for now. + """ + if ( + len(self.args.input_buffers) == 1 + and len(self.args.output_buffers) == 1 + and len(self.args.inplace_buffers) == 0 + ): + # even if input buffer and output buffer have different layout, + # this can be a layout conversion kernel. No need to warn for + # the mix layouts. 
+ return + + argdefs, call_args, signature = self.args.python_argdefs() + uniform_stride_order = None + for arg_name in call_args: + buf = V.graph.get_buffer(arg_name) + if buf and len(buf.layout.size) == 4: + # ignore the tensor if only 1 dimension is non-zero + if len([x for x in buf.layout.size if x == 1]) == 3: + continue + stride_order = ir.get_stride_order(buf.layout.stride) + if uniform_stride_order is None: + uniform_stride_order = stride_order + elif uniform_stride_order != stride_order: + msg = yellow_text( + f"Expected stride order {uniform_stride_order}, but found stride order" + + f" {stride_order} for kernel {kernel_name}" + ) + log.warning(msg) + + stride_order_list = [ + ir.get_stride_order(V.graph.get_buffer(name).layout.stride) + if V.graph.get_buffer(name) + else None + for name in call_args + ] + size_list = [ + V.graph.get_buffer(name).layout.size + if V.graph.get_buffer(name) + else None + for name in call_args + ] + source_list = [ + "GraphInput" + if name in V.graph.graph_inputs + else "IntermediateBuffer" + if name in V.graph.name_to_buffer + else None + for name in call_args + ] + + msg = yellow_text( + f" param names {argdefs}\n buf names {call_args}\n strides {stride_order_list}" + + f"\n sizes {size_list}\n sources {source_list}\n" + ) + log.warning(msg) + return + msg = green_text( + f"All the inputs for the triton kernel {kernel_name} have uniform layout" + ) + log.warning(msg) + + def create_cse_var(self, *args, **kwargs): + return TritonCSEVariable(*args, **kwargs) + + +class TritonScheduling(BaseScheduling): + def __init__(self, scheduler): + self.scheduler = scheduler + + def group_fn(self, sizes): + return tuple(V.graph.sizevars.simplify(sympy_product(s)) for s in sizes) + + def can_fuse(self, node1, node2): + """ + Hook called by Scheduler to determine if the Triton backend + can fuse node1 and node2. These nodes might already be + FusedSchedulerNodes. 
+ """ + if isinstance(node1, scheduler.ForeachKernelSchedulerNode) or isinstance( + node2, scheduler.ForeachKernelSchedulerNode + ): + return scheduler.ForeachKernelSchedulerNode.can_fuse(node1, node2) + + _, (numel1, rnumel1) = node1.group + _, (numel2, rnumel2) = node2.group + why = WhyNoFuse(node1, node2) + + if node1.is_split_scan() and not node2.is_split_scan(): + if node2.is_reduction(): + why("Split scan cannot fuse with reductions") + elif node2.is_split_scan() and not node1.is_split_scan(): + if node1.is_reduction(): + why("Split scan cannot fuse with reductions") + + if node1.is_reduction() and node2.is_reduction(): + reduction_can_fuse = numel1 == numel2 and rnumel1 == rnumel2 + if not reduction_can_fuse: + why( + "numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)", + numel1, + numel2, + rnumel1, + rnumel2, + ) + return reduction_can_fuse + + if not node1.is_reduction() and not node2.is_reduction(): + if not (numel1 == numel2 and rnumel1 == rnumel2): + why( + "numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)", + numel1, + numel2, + rnumel1, + rnumel2, + ) + return False + + if node1.is_template(): + # Only allow fusion for TritonTemplates for now. + # Fusion for CUDATemplates are not supported. 
def generate_node_schedule(self, nodes, numel, rnumel):
    """
    Linearize ``nodes`` into a schedule for a single (numel, rnumel) kernel.

    The returned list contains scheduler nodes interleaved with the
    DisableReduction / EnableReduction markers that delimit code emitted
    outside of the reduction loop.

    Args:
        nodes: pre-fused scheduler nodes, in dependency order.
        numel: pointwise iteration size of the kernel.
        rnumel: reduction iteration size (1 for pure pointwise kernels).

    Returns:
        List of nodes and reduction markers.
    """
    node_schedule: List[Any] = []

    # Writes with a reduced shape, meaning they are only present once the
    # reduction loop has ended
    current_loop_reduced_writes = set()
    current_loop_has_writes = False
    done = set()

    def fits_in_main_body(n):
        # Node matches the kernel's (numel, rnumel) shape, or is a pointwise
        # node over the flattened numel * rnumel space.
        _, (node_numel, node_rnumel) = n.group
        return (node_numel == numel and node_rnumel == rnumel) or (
            node_numel == numel * rnumel and node_rnumel == 1
        )

    def fits_outside_reduction(n):
        # Pointwise over numel only: must run after the reduction loop ends.
        _, (node_numel, node_rnumel) = n.group
        return node_numel == numel and node_rnumel == 1 and rnumel != 1

    def schedule_node_in_loop(n):
        nonlocal current_loop_has_writes
        done.add(n)
        node_schedule.append(n)
        current_loop_has_writes = True
        # A scan is modelled as a reduction in the scheduler but has a
        # full sized output that can be used inside the loop body
        if (
            n.is_reduction()
            and isinstance(n, scheduler.SchedulerNode)
            and isinstance(n.node, ir.ComputedBuffer)
            and not isinstance(n.node.data, ir.Scan)
        ):
            current_loop_reduced_writes.add(n.get_name())

    @contextlib.contextmanager
    def end_current_reduction_loop():
        nonlocal current_loop_has_writes
        if current_loop_has_writes:
            # flush out any other runnable nodes to reduce number of loops
            for other_node in nodes[index + 1 :]:
                # Bug fix: the original tested/scheduled the *outer* ``node``
                # (already in ``done``), so this flush never scheduled
                # anything; it must operate on ``other_node``.
                if (
                    other_node not in done
                    and fits_in_main_body(other_node)
                    and not (current_loop_reduced_writes & other_node.ancestors)
                ):
                    schedule_node_in_loop(other_node)

        if node_schedule and node_schedule[-1] is EnableReduction:
            node_schedule.pop()
        else:
            node_schedule.append(DisableReduction)
        yield
        node_schedule.append(EnableReduction)
        current_loop_reduced_writes.clear()
        current_loop_has_writes = False

    for index, node in enumerate(nodes):
        if node in done:
            continue
        done.add(node)

        def requires_closing_previous_reduction(node, node_schedule):
            # A node that reads a reduced write cannot live inside the same
            # reduction loop that produces it.
            if rnumel == 1:
                return False
            if not current_loop_reduced_writes & node.ancestors:
                return False
            assert node_schedule and not isinstance(
                node_schedule[-1], (EnableReduction, DisableReduction)
            )
            return bool(current_loop_reduced_writes)

        if fits_in_main_body(node):
            if requires_closing_previous_reduction(node, node_schedule):
                with end_current_reduction_loop():
                    pass  # need to start a new reduction loop

            schedule_node_in_loop(node)
        elif fits_outside_reduction(node):
            with end_current_reduction_loop():
                node_schedule.append(node)
        else:
            raise NotImplementedError(
                f"unexpected group: ({numel}, {rnumel}) != {node.group[1]}"
            )

    return node_schedule
accounted for + buf_sizes = [ + buf.get_layout().storage_size() + for buf in buffers + if not isinstance(buf.get_layout(), ir.MultiOutputLayout) + ] + + if not all(within_32bit(size) for size in buf_sizes): + return False + + # Only install guards for 32-bit indexing as there is no correctness + # issue with using 64-bit for everything + V.graph.sizevars.guard_leq(numel, int_max) # type: ignore[arg-type] + for size in buf_sizes: + V.graph.sizevars.guard_leq(size, int_max) # type: ignore[arg-type] + return True + + @staticmethod + def select_index_dtype(node_schedule, numel, reduction_numel): + # Gather all used buffer names + buffer_names = set() + for node in node_schedule: + if not isinstance(node, scheduler.BaseSchedulerNode): + continue + + buffer_names.update(node.get_names()) + buffer_names.update(node.used_buffer_names()) + + # Get buffers objects + def _get_buffer(name: str) -> Union[ir.Buffer, ir.TensorBox]: + if name in V.graph.name_to_buffer: + return V.graph.name_to_buffer[name] + elif name in V.graph.graph_inputs: + return V.graph.graph_inputs[name] + elif name in V.graph.constants: + data = V.graph.constants[name] + return ir.ConstantBuffer( + name, + ir.FixedLayout( + data.device, data.dtype, *V.graph.static_sizes_strides(data) + ), + ) + raise RuntimeError(f"Failed to find buffer matching name {name}") + + buffers = [_get_buffer(name) for name in buffer_names] + + # In theory we can separately check xnumel and rnumel are <= int_max + # but some indexers do use the full linear index so we need to be + # conservative here. 
+ total_numel = numel * reduction_numel + + if TritonScheduling.can_use_32bit_indexing(total_numel, buffers): + return "tl.int32" + return "tl.int64" + + def get_kernel_args(self, node_schedule, numel, reduction_numel): + reductions = list( + filter( + lambda n: n not in (EnableReduction, DisableReduction) + and n.is_reduction(), + node_schedule, + ) + ) + if len(reductions) > 0: + hints = [self.reduction_hint(n) for n in reductions] + if hints.count(hints[0]) == len(hints): + reduction_hint_val = hints[0] + else: + reduction_hint_val = ReductionHint.DEFAULT + else: + reduction_hint_val = ReductionHint.DEFAULT + + mutations = set() + for node in node_schedule: + if hasattr(node, "get_mutations"): + mutations.update(node.get_mutations()) + + index_dtype = self.select_index_dtype(node_schedule, numel, reduction_numel) + + return reduction_hint_val, mutations, index_dtype + + def codegen_comment(self, node_schedule): + wrapper = V.graph.wrapper_code + origins, detailed_origins = get_kernel_metadata(node_schedule, wrapper) + if origins: + wrapper.writeline(origins) + + if config.debug_fusion: + from torch._inductor.scheduler import ( + BaseSchedulerNode, + ForeachKernelSchedulerNode, + ) + + if not any( + isinstance(n, ForeachKernelSchedulerNode) for n in node_schedule + ): + # We probably should look what are the nodes inside a foreach + # schedule node + node_names = [ + n.get_name() + for n in node_schedule + if isinstance(n, BaseSchedulerNode) + ] + wrapper.writeline( + f"{wrapper.comment} Fused node name list: {', '.join(node_names)}" + ) + + def codegen_node_schedule( + self, node_schedule, buf_accesses, numel, reduction_numel + ): + from torch._inductor.codegen.triton_split_scan import TritonSplitScanKernel + + tiled_groups = self.select_tiling(node_schedule, numel, reduction_numel) + reduction_hint_val, mutations, index_dtype = self.get_kernel_args( + node_schedule, numel, reduction_numel + ) + + is_split_scan = any( + isinstance(node, BaseSchedulerNode) and 
node.is_split_scan() + for node in node_schedule + ) + kernel_type = TritonSplitScanKernel if is_split_scan else TritonKernel + kernel_args = tiled_groups + kernel_kwargs = { + "reduction_hint": reduction_hint_val, + "mutations": mutations, + "index_dtype": index_dtype, + } + kernel = kernel_type( + *kernel_args, + **kernel_kwargs, + ) + kernel.buf_accesses = buf_accesses + + self.codegen_node_schedule_with_kernel(node_schedule, kernel) + + with V.set_kernel_handler(kernel): + src_code = kernel.codegen_kernel() + + kernel_name = self.define_kernel(src_code, node_schedule) + log.debug("Generating kernel code with kernel_name: %s", kernel_name) + kernel.kernel_name = kernel_name + kernel.code_hash = code_hash(src_code) + + if kernel.persistent_reduction and config.triton.multi_kernel: + kernel2 = TritonKernel( + *kernel_args, + **kernel_kwargs, + disable_persistent_reduction=True, + ) + self.codegen_node_schedule_with_kernel(node_schedule, kernel2) + with V.set_kernel_handler(kernel2): + src_code2 = kernel2.codegen_kernel() + kernel_name2 = self.define_kernel(src_code2, node_schedule) + kernel2.kernel_name = kernel_name2 + kernel2.code_hash = code_hash(src_code2) + + final_kernel = MultiKernel([kernel, kernel2]) + else: + final_kernel = kernel # type: ignore[assignment] + + with V.set_kernel_handler(final_kernel): + for node in node_schedule: + if node not in (EnableReduction, DisableReduction): + node.mark_run() + + self.codegen_comment(node_schedule) + final_kernel.call_kernel(final_kernel.kernel_name) + if config.nan_asserts: + final_kernel.codegen_nan_check() + if config.warn_mix_layout: + final_kernel.warn_mix_layout(kernel_name) + + V.graph.removed_buffers |= final_kernel.removed_buffers + V.graph.inplaced_to_remove |= final_kernel.inplaced_to_remove + + if ( + V.graph.wrapper_code.supports_intermediate_hooks + and config.generate_intermediate_hooks + ): + # Not every node in the schedule will actually be live on output; + # we can't check dead buffers. 
def codegen_node_schedule_with_kernel(self, node_schedule, kernel):
    """Emit code for every node of ``node_schedule`` into ``kernel``."""

    def nodes_until_reduction_disabled(nodes):
        # Prefix of ``nodes`` up to (not including) the next DisableReduction.
        return itertools.takewhile(lambda n: n is not DisableReduction, nodes)

    with kernel:
        ctx_stack = contextlib.ExitStack()
        kernel.set_last_usage(nodes_until_reduction_disabled(node_schedule))

        # First pass: decide in-place updates before any code is emitted.
        for entry in node_schedule:
            if entry not in (EnableReduction, DisableReduction):
                entry.decide_inplace_update()

        # Second pass: emit code, toggling reduction mode at the markers.
        for pos, entry in enumerate(node_schedule):
            if entry is DisableReduction:
                ctx_stack.enter_context(kernel.disable_reduction())
            elif entry is EnableReduction:
                ctx_stack.close()
                kernel.set_last_usage(
                    nodes_until_reduction_disabled(node_schedule[pos:])
                )
            else:
                # TODO - use split ranges ?
                indexing_dtype_strength_reduction(entry._body)
                entry.codegen(kernel.split_and_set_ranges(entry.get_ranges()))
+ src_code = src_code.replace("#pragma CMT", "#") + + basename, _, kernel_path = get_path(code_hash(src_code.strip()), "py") + + compile_wrapper = IndentedBuffer() + compile_wrapper.writeline(f"async_compile.triton({subs_name!r}, '''") + compile_wrapper.splice(src_code, strip=True) + compile_wrapper.writeline( + f"''', device_str='{V.graph.scheduler.current_device.type}')" + ) + + metadata_comment = f"# kernel path: {kernel_path}" + origins, detailed_origins = get_kernel_metadata(node_schedule, wrapper) + metadata_comment += "\n" + origins + "\n" + detailed_origins + wrapper.define_kernel( + kernel_name, compile_wrapper.getvalue(), metadata_comment + ) + + # log kernel metadata for offline analysis. + # E.g. one can find all unaligned inner reduction and check if + # padding helps with the perf kernel by kernel. + if is_metric_table_enabled("kernel_metadata"): + log_kernel_metadata(kernel_name, kernel_path, src_code) + + return kernel_name + + def codegen_template( + self, template_node, epilogue_nodes, only_gen_src_code=False + ) -> Optional[str]: + """ + Codegen a triton template + + If `only_gen_src_code` the src code will be returned instead of codegen'd into the wrapper + """ + _, (numel, rnumel) = template_node.group + assert rnumel == 1 + kernel, render = template_node.node.make_kernel_render(template_node.node) + with kernel: + if not only_gen_src_code: + for node in [template_node, *epilogue_nodes]: + node.mark_run() + partial_code = render() + for node in epilogue_nodes: + node.codegen(kernel.split_and_set_ranges(node.get_ranges())) + + # finalize must be called after adding epilogue above + with V.set_kernel_handler(kernel): + # TODO: Maybe unify CUDATemplateKernel to also use PartialRender for flexible epilogue fusion. 
+ src_code = ( + partial_code + if isinstance(partial_code, str) + else partial_code.finalize() + ) + node_schedule = [template_node, *epilogue_nodes] + + if config.benchmark_kernel: + num_gb = kernel.estimate_kernel_num_bytes() / 1e9 + grid_args = V.graph.sizevars.size_hints(kernel.call_sizes) + assert kernel.meta is not None, "meta is None" + grid = kernel.grid_fn(*grid_args, kernel.meta) + src_code = ( + f"{kernel.imports_for_benchmark_kernel()}\n" + f"{src_code}\n" + f"{kernel.codegen_kernel_benchmark(num_gb, grid).getvalue()}" + ) + + if only_gen_src_code: + return src_code + + kernel_name = self.define_kernel(src_code, node_schedule) + + self.codegen_comment(node_schedule) + kernel.call_kernel(kernel_name, template_node.node) + V.graph.removed_buffers |= kernel.removed_buffers + V.graph.inplaced_to_remove |= kernel.inplaced_to_remove + self.scheduler.free_buffers() + return None + + def codegen_sync(self): + V.graph.wrapper_code.writeline(V.graph.device_ops.synchronize()) + + def codegen_foreach(self, foreach_node): + from .triton_foreach import ForeachKernel + + for partitions_with_metadata in ForeachKernel.horizontal_partition( + foreach_node.get_subkernel_nodes(), self + ): + kernel = ForeachKernel() + for nodes, tiled_groups, numel, rnumel in partitions_with_metadata: + node_schedule = self.generate_node_schedule(nodes, numel, rnumel) + ( + reduction_hint_val, + mutations, + index_dtype, + ) = self.get_kernel_args(node_schedule, numel, rnumel) + + subkernel = kernel.create_sub_kernel( + *tiled_groups, + reduction_hint=reduction_hint_val, + mutations=mutations, + index_dtype=index_dtype, + ) + + self.codegen_node_schedule_with_kernel( + node_schedule, + subkernel, + ) + + with V.set_kernel_handler(subkernel): + for node in node_schedule: + if node not in (EnableReduction, DisableReduction): + node.mark_run() + V.graph.removed_buffers |= subkernel.removed_buffers + V.graph.inplaced_to_remove |= subkernel.inplaced_to_remove + + src_code = 
kernel.codegen_kernel() + kernel_name = self.define_kernel(src_code, [foreach_node]) + self.codegen_comment([foreach_node]) + kernel.call_kernel(V.graph.wrapper_code, kernel_name) + + self.scheduler.free_buffers() + + @staticmethod + @functools.lru_cache(32) + def candidate_tilings(node): + ranges, reduction_ranges = node.get_ranges() + if len(ranges) <= 1: + return () + + rw = node.pointwise_read_writes() + assert len(rw.range_vars) == len(ranges) + + # isinstance(dep, MemoryDep): this filters out StarDeps. StarDeps refer to reads + # that need to access the entire tensor; they don't contribute read indexing + # information (and practically, they don't have dep.index so they can't be used + # for stride_hints below + dep_sources = [rw.reads, rw.writes] + assert all( + isinstance(dep, (MemoryDep, StarDep)) + for dep in itertools.chain.from_iterable(dep_sources) + ) + deps = [ + dep + for dep in itertools.chain.from_iterable(dep_sources) + if dep.name not in V.graph.removed_buffers and isinstance(dep, MemoryDep) + ] + write_names = {dep.name for dep in rw.writes} + + tilings: List[CandidateTiling] = [] + + for dep in deps: + strides = V.graph.sizevars.stride_hints(dep.index, rw.range_vars) + assert len(strides) == len(ranges) + try: + split = strides.index(1) + 1 + if split == len(ranges): + continue + if all(s == 0 for s in strides[split:]): + # if this is a broadcasted tensor and all dimensions after split are broadcast, + # this is not a real split + continue + + except ValueError: + continue + tiled_groups = ( + V.graph.sizevars.simplify(sympy_product(ranges[:split])), + V.graph.sizevars.simplify(sympy_product(ranges[split:])), + ) + # score by number of elements + score = V.graph.sizevars.size_hint( + sympy_product( + size for size, stride in zip(ranges, strides) if stride != 0 + ) + ) + if dep.name in write_names: + # ngimel said contiguous writes is more important than reads + score *= 2 + if CandidateTiling.is_good_size(tiled_groups[0]): + score *= 2 + if 
@classmethod
def select_tiling(cls, node_schedule, numel, reduction_numel=sympy.Integer(1)):
    """
    Heuristics to decide how to tile kernels.
    Currently, we tile based on stride-1 dimensions.

    Returns:
        `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

    """
    if reduction_numel != 1 or config.triton.max_tiles <= 1:
        # TODO(jansel): should we tile reductions?
        # do perf hint here if stride-1 dim is not being reduced
        if perf_hint_log.level <= logging.WARNING:
            for node in EnableReduction.filter(node_schedule):
                if len(cls.candidate_tilings(node)) > 0:
                    perf_hint_log.info("reduction over non-contiguous dims")
                    break
        return (numel, reduction_numel)

    # Accumulate scores per distinct tiling, counting each buffer once.
    seen_buffers = set()
    tile_scores: Counter[Any] = collections.Counter()
    for node in EnableReduction.filter(node_schedule):
        for candidate in cls.candidate_tilings(node):
            if candidate.name in seen_buffers:
                continue
            seen_buffers.add(candidate.name)
            tile_scores[candidate.tiling] += candidate.score

    ranked = [tiling for tiling, _score in tile_scores.most_common()]

    if config.triton.max_tiles >= 3:
        # Consider adding a third dimension of tiling, but only
        # when a1 is a multiple of b1; otherwise, you have a lot
        # of stragglers which is annoying to generate code for.
        #
        # NB: More than three max tiles is not enabled by default.

        # Add one 3D tiling choice
        for i in range(1, len(ranked)):
            a0, a1 = ranked[0]
            b0, b1 = ranked[i]
            if V.graph.sizevars.size_hint(a1 - b1) == 0:
                continue
            if V.graph.sizevars.size_hint(a1 - b1) < 0:
                # swap so a0 is bigger
                a0, a1 = ranked[i]
                b0, b1 = ranked[0]
            assert V.graph.sizevars.size_hint(a1 - b1) > 0
            if V.graph.sizevars.statically_known_multiple_of(a1, b1):
                ranked = [(a0, FloorDiv(a1, b1), b1)] + ranked
                break  # only 1 choice for now

    if len(ranked) > 1:
        perf_hint_log.info("possibly bad tiling: %s", ranked)

    for tiled_groups in ranked:
        new_groups = (*tiled_groups, reduction_numel)
        if all(
            TritonKernel.is_compatible(new_groups, node.get_ranges())
            for node in node_schedule
            if isinstance(node, scheduler.SchedulerNode)
        ):
            return new_groups

    return (numel, reduction_numel)
+ for n in nodes: + n.last_usage = set() + + if not nodes[0].is_template(): + _, (numel, rnumel) = max(nodes, key=lambda x: int(x.is_reduction())).group + node_schedule = self.generate_node_schedule(nodes, numel, rnumel) + + tiled_groups = self.select_tiling(node_schedule, numel, rnumel) + reduction_hint_val, mutations, index_dtype = self.get_kernel_args( + node_schedule, numel, rnumel + ) + + kernel = TritonKernel( + *tiled_groups, + reduction_hint=reduction_hint_val, + mutations=mutations, + index_dtype=index_dtype, + ) + + self.codegen_node_schedule_with_kernel(node_schedule, kernel) + with config.patch("benchmark_kernel", True), V.set_kernel_handler(kernel): + src_code = kernel.codegen_kernel() + else: + template_node = nodes[0] + epilogue_nodes = nodes[1:] + + with config.patch("benchmark_kernel", True): + src_code = self.codegen_template( + template_node, epilogue_nodes, only_gen_src_code=True + ) + + src_code = src_code.replace(str(Placeholder.KERNEL_NAME), "triton_") + mod = PyCodeCache.load(src_code) + + def cache_file_path(): + assert mod.__file__ is not None + return os.path.splitext(mod.__file__)[0] + ".kernel_perf" + + def load_cache(): + path = cache_file_path() + if os.path.exists(path): + with open(path) as fd: + return float(fd.read()) + return None + + def store_cache(): + path = cache_file_path() + with open(path, "w") as fd: + fd.write(str(ms)) + + log.debug( + "kernel src code for %s written to: %s", + {n.get_name() for n in nodes}, + mod.__file__, + ) + ms = load_cache() + if ms is not None: + return ms, mod.__file__ + + args = mod.get_args() + call = mod.call + wrapped_jit_function = mod.triton_ + + # call once to trigger the compilation + call(wrapped_jit_function.clone_args(*args)[0]) + + launchers = wrapped_jit_function.launchers + assert len(launchers) == 1 + if launchers[0].n_spills > 0: + # skip benchmarking the kernel if there are register spills + ms = float("inf") + else: + # We have to clone the inplace updated arguments to avoid 
earlier calls + # generating out of range indices for later calls. + ms = do_bench(lambda: call(wrapped_jit_function.clone_args(*args)[0])) + + log.debug( + "The fused kernel for %s took %.3f ms to run", + {n.get_name() for n in nodes}, + ms, + ) + store_cache() + return ms, mod.__file__ + + +@dataclasses.dataclass +class CandidateTiling: + tiling: Tuple[sympy.Expr, sympy.Expr] + score: int # higher is better + name: Optional[str] = None + + @staticmethod + def is_good_size(s): + """Somewhat arbitrary heuristic used to boost scores for some sizes""" + s = V.graph.sizevars.size_hint(s) + return s >= 32 and (s % 32 == 0) + + +class DisableReduction: + """ + Marker to invoke `kernel.disable_reduction()`. This closes a + reduction loop and allows for pointwise ops to occur on the output + of a reduction. + """ + + +class EnableReduction: + """ + Marker to end a DisableReduction block. + """ + + @staticmethod + def filter(node_schedule): + """ + Get the nodes from node_schedule skipping those in a + DisableReduction block. 
+ """ + disabled = False + for node in node_schedule: + if node in (EnableReduction, DisableReduction): + # Don't tile stuff outside the main reduction loop + disabled = node is DisableReduction + elif disabled: + pass + else: + yield node + + +class CantSplit(Exception): + pass diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e153527cbf4a19a5b12dca1bdd60aa44c663df5a Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17ae46cd458107f8aba65c20073bcd1e304c8ea5 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a6ae6f2c585d3cf76b0cdaefe65859cb7abdf04 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc differ diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d60d520dae0946fbb2f57c45f87fd7e3257a7197 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/split_cat.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/split_cat.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8defda6cab2941861c8f65692de158d395b20787 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/split_cat.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_1.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_1.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b11aa9b0c8f12d918a28a63c7c9ab2a8a7c7172 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_1.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_11.cpython-311.pyc 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_11.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4a472904d764b7befe3e9e1c14e8189988bd056 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_11.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_15.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_15.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9ba536fc968a397f2b2158201e629a57c488322 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_15.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_16.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_16.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d04bc3a87a8f98568c0cf2e3d06c0959d5de0c7f Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_16.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_2.cpython-311.pyc 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95da17d3331c721746fed9ab37f8e4041cfb41f8 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_2.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_4.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_4.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81c2a1d20391c204decfdc6ac591f70e2f57f5d4 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_4.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_5.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_5.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c677c770db9f5006a36013799dd7370a8c49b034 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_5.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_7.cpython-311.pyc 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_7.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19d77fddf7a509aecd317166ea57ff04845cd1ef Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_7.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_9.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_9.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..39345b119bff177b95a46b1fd3fe26b3331cebfa Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_9.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_14.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_14.py new file mode 100644 index 0000000000000000000000000000000000000000..b4271e0c5ee50c533d86b50be243a8efa41c78a4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_14.py @@ -0,0 +1,218 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = 
CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_7, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = 
CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_14_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = 
CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_14_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, 
memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = 
CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, convert_element_type_default_2, convert_element_type_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_4, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = 
CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_14_half_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, 
sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_14_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_7.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_7.py new file mode 100644 index 0000000000000000000000000000000000000000..e921a07d46774fcd28e416b1043f12138727abd7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_7.py @@ -0,0 +1,233 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) 
+sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, mul_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, bmm_default_2, Ignored()) +view_default_7 = CallFunction(aten.view.default, convert_element_type_default_1, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, mul_Tensor_2) 
+clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default_3, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, Ignored()) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_7_training = 
MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default_2 = CallFunction(aten.clone.default, div_Tensor_1) +convert_element_type_default = CallFunction(prims.convert_element_type.default, clone_default_2, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, 
permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_7_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) 
+sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, mul_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = 
CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default_3, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_4, Ignored()) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_7_half_training = 
MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default_2 = CallFunction(aten.clone.default, div_Tensor_1) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, clone_default_2, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, 
Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_7_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b17d76e12794b8407063c84d5dbb55b3aac25c99 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py @@ -0,0 +1 @@ +from . 
import mm, mm_common, mm_plus_mm, unpack_mixed_mm diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c5b3ededacc7beb426c70d4a4f3c4e15eb7cfd9 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..209463713e49f2d11d9d5fc4c7705c0bb3353900 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..d9fcd063584640bd04d824231099b85db72f15e1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py @@ -0,0 +1,82 @@ +import logging +from typing import List + +from ..select_algorithm import autotune_select_algorithm, ChoiceCaller, TritonTemplate +from .mm_common import mm_args, mm_configs, mm_grid, mm_options + +log = logging.getLogger(__name__) + +uint4x2_mixed_mm_template = TritonTemplate( + name="uint4x2_mixed_mm", + grid=mm_grid, + source=r""" +{{def_kernel("A", "B")}} + M = {{size("A", 0)}} + N = {{size("B", 
1)}} + K = {{size("A", 1)}} + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None]//2 * stride_bk + rbn[None, :] * stride_bn) + b_shifts = 4*(rk%2) + b_subs = 8*(1-(rk%2)) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) 
+ b = ((b >> b_shifts[:, None]) & 0xF) - 8 + b = b.to(B_PROLOGUE_CAST_TYPE) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) + A += BLOCK_K * stride_ak + B += BLOCK_K//2 * stride_bk + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_m", "idx_n"), "acc", "mask")}} +""", +) + + +def tuned_uint4x2_mixed_mm(mat1, mat2, mat2_mm_shape, mat2_dtype): + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=None, use_4x2_dim=True) + choices: List[ChoiceCaller] = [] + b_prologue_cast_type = f"tl.{mat2_dtype}".replace("torch.", "") + for config in mm_configs(m, n, k): + uint4x2_mixed_mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout, b_prologue_cast_type), + ) + return autotune_select_algorithm("uint4x2_mixed_mm", choices, [mat1, mat2], layout) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b042126d99e217b789853d4a52b871918f9162f6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py @@ -0,0 +1,1412 @@ +r""" +This package adds support for CUDA tensor types. + +It implements the same function as CPU tensors, but they utilize +GPUs for computation. + +It is lazily initialized, so you can always import it, and use +:func:`is_available()` to determine if your system supports CUDA. + +:ref:`cuda-semantics` has more details about working with CUDA. 
+""" + + +import contextlib +import importlib +import os +import sys +import threading +import traceback +import warnings +from functools import lru_cache +from typing import Any, Callable, cast, List, Optional, Tuple, Union + +import torch +import torch._C +from torch.types import Device +from .. import device as _device +from .._utils import _dummy_type, _LazySeedTracker, classproperty +from ._utils import _get_device_index +from .graphs import ( + CUDAGraph, + graph, + graph_pool_handle, + is_current_stream_capturing, + make_graphed_callables, +) +from .streams import Event, ExternalStream, Stream + +try: + from torch._C import _cudart # type: ignore[attr-defined] +except ImportError: + _cudart = None + +_initialized = False +_tls = threading.local() +_initialization_lock = threading.Lock() +_queued_calls: List[ + Tuple[Callable[[], None], List[str]] +] = [] # don't invoke these until initialization occurs +_is_in_bad_fork = getattr(torch._C, "_cuda_isInBadFork", lambda: False) +_device_t = Union[_device, str, int, None] + +_HAS_PYNVML = False +_PYNVML_ERR = None +try: + import pynvml # type: ignore[import] + + _HAS_PYNVML = True +except ImportError as err: + _PYNVML_ERR = err # sometimes a lib is installed but the import fails for some other reason, so we log the error for later + +_lazy_seed_tracker = _LazySeedTracker() + +# Define dummy _CudaDeviceProperties type if PyTorch was compiled without CUDA +if hasattr(torch._C, "_CudaDeviceProperties"): + _CudaDeviceProperties = torch._C._CudaDeviceProperties +else: + _CudaDeviceProperties = _dummy_type("_CudaDeviceProperties") # type: ignore[assignment, misc] + +if hasattr(torch._C, "_cuda_exchangeDevice"): + _exchange_device = torch._C._cuda_exchangeDevice +else: + + def _exchange_device(device: int) -> int: + if device < 0: + return -1 + raise RuntimeError("PyTorch was compiled without CUDA support") + + +if hasattr(torch._C, "_cuda_maybeExchangeDevice"): + _maybe_exchange_device = 
torch._C._cuda_maybeExchangeDevice +else: + + def _maybe_exchange_device(device: int) -> int: + if device < 0: + return -1 + raise RuntimeError("PyTorch was compiled without CUDA support") + + +has_half: bool = True +has_magma: bool = torch._C._has_magma + +default_generators: Tuple[torch._C.Generator] = () # type: ignore[assignment] + + +def _is_compiled() -> bool: + r"""Return true if compile with CUDA support.""" + return hasattr(torch._C, "_cuda_getDeviceCount") + + +def _nvml_based_avail() -> bool: + return os.getenv("PYTORCH_NVML_BASED_CUDA_CHECK") == "1" + + +def is_available() -> bool: + r"""Return a bool indicating if CUDA is currently available.""" + if not _is_compiled(): + return False + if _nvml_based_avail(): + # The user has set an env variable to request this availability check that attempts to avoid fork poisoning by + # using NVML at the cost of a weaker CUDA availability assessment. Note that if NVML discovery/initialization + # fails, this assessment falls back to the default CUDA Runtime API assessment (`cudaGetDeviceCount`) + return device_count() > 0 + else: + # The default availability inspection never throws and returns 0 if the driver is missing or can't + # be initialized. This uses the CUDA Runtime API `cudaGetDeviceCount` which in turn initializes the CUDA Driver + # API via `cuInit` + return torch._C._cuda_getDeviceCount() > 0 + + +def is_bf16_supported(): + r"""Return a bool indicating if the current CUDA/ROCm device supports dtype bfloat16.""" + # Check for ROCm, if true return true, no ROCM_VERSION check required, + # since it is supported on AMD GPU archs. + if torch.version.hip: + return True + + device = torch.cuda.current_device() + + # Check for CUDA version and device compute capability. + # This is a fast way to check for it. 
+ cuda_version = torch.version.cuda + if ( + cuda_version is not None + and int(cuda_version.split(".")[0]) >= 11 + and torch.cuda.get_device_properties(device).major >= 8 + ): + return True + + # Finally try to create a bfloat16 device. + return _check_bf16_tensor_supported(device) + + +@lru_cache(maxsize=16) +def _check_bf16_tensor_supported(device: _device_t): + try: + torch.tensor([1.0], dtype=torch.bfloat16, device=device) + return True + except Exception: + return False + + +def _sleep(cycles): + torch._C._cuda_sleep(cycles) + + +def _check_capability(): + incorrect_binary_warn = """ + Found GPU%d %s which requires CUDA_VERSION >= %d to + work properly, but your PyTorch was compiled + with CUDA_VERSION %d. Please install the correct PyTorch binary + using instructions from https://pytorch.org + """ + + old_gpu_warn = """ + Found GPU%d %s which is of cuda capability %d.%d. + PyTorch no longer supports this GPU because it is too old. + The minimum cuda capability supported by this library is %d.%d. + """ + + if torch.version.cuda is not None: # on ROCm we don't want this check + CUDA_VERSION = torch._C._cuda_getCompiledVersion() + for d in range(device_count()): + capability = get_device_capability(d) + major = capability[0] + minor = capability[1] + name = get_device_name(d) + current_arch = major * 10 + minor + min_arch = min( + (int(arch.split("_")[1]) for arch in torch.cuda.get_arch_list()), + default=35, + ) + if current_arch < min_arch: + warnings.warn( + old_gpu_warn + % (d, name, major, minor, min_arch // 10, min_arch % 10) + ) + + +def _check_cubins(): + incompatible_device_warn = """ +{} with CUDA capability sm_{} is not compatible with the current PyTorch installation. +The current PyTorch install supports CUDA capabilities {}. 
+If you want to use the {} GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/ +""" + if torch.version.cuda is None: # on ROCm we don't want this check + return + arch_list = get_arch_list() + if len(arch_list) == 0: + return + supported_sm = [int(arch.split("_")[1]) for arch in arch_list if "sm_" in arch] + for idx in range(device_count()): + cap_major, cap_minor = get_device_capability(idx) + # NVIDIA GPU compute architectures are backward compatible within major version + supported = any(sm // 10 == cap_major for sm in supported_sm) + if not supported: + device_name = get_device_name(idx) + capability = cap_major * 10 + cap_minor + warnings.warn( + incompatible_device_warn.format( + device_name, capability, " ".join(arch_list), device_name + ) + ) + + +def is_initialized(): + r"""Return whether PyTorch's CUDA state has been initialized.""" + return _initialized and not _is_in_bad_fork() + + +def _lazy_call(callable, **kwargs): + if is_initialized(): + callable() + else: + # TODO(torch_deploy): this accesses linecache, which attempts to read the + # file system to get traceback info. Patch linecache or do something + # else here if this ends up being important. + global _lazy_seed_tracker + if kwargs.get("seed_all", False): + _lazy_seed_tracker.queue_seed_all(callable, traceback.format_stack()) + elif kwargs.get("seed", False): + _lazy_seed_tracker.queue_seed(callable, traceback.format_stack()) + else: + # Don't store the actual traceback to avoid memory cycle + _queued_calls.append((callable, traceback.format_stack())) + + +_lazy_call(_check_capability) +_lazy_call(_check_cubins) + + +class DeferredCudaCallError(Exception): + pass + + +OutOfMemoryError = torch._C._OutOfMemoryError + + +def init(): + r"""Initialize PyTorch's CUDA state. 
+ + You may need to call this explicitly if you are interacting with + PyTorch via its C API, as Python bindings for CUDA functionality + will not be available until this initialization takes place. + Ordinary users should not need this, as all of PyTorch's CUDA methods + automatically initialize CUDA state on-demand. + + Does nothing if the CUDA state is already initialized. + """ + _lazy_init() + + +def _lazy_init(): + global _initialized, _queued_calls + if is_initialized() or hasattr(_tls, "is_initializing"): + return + with _initialization_lock: + # We be double-checked locking, boys! This is OK because + # the above test was GIL protected anyway. The inner test + # is for when a thread blocked on some other thread which was + # doing the initialization; when they get the lock, they will + # find there is nothing left to do. + if is_initialized(): + return + # It is important to prevent other threads from entering _lazy_init + # immediately, while we are still guaranteed to have the GIL, because some + # of the C calls we make below will release the GIL + if _is_in_bad_fork(): + raise RuntimeError( + "Cannot re-initialize CUDA in forked subprocess. To use CUDA with " + "multiprocessing, you must use the 'spawn' start method" + ) + if not hasattr(torch._C, "_cuda_getDeviceCount"): + raise AssertionError("Torch not compiled with CUDA enabled") + if _cudart is None: + raise AssertionError( + "libcudart functions unavailable. It looks like you have a broken build?" + ) + # This function throws if there's a driver initialization error, no GPUs + # are found or any other error occurs + if "CUDA_MODULE_LOADING" not in os.environ: + os.environ["CUDA_MODULE_LOADING"] = "LAZY" + torch._C._cuda_init() + # Some of the queued calls may reentrantly call _lazy_init(); + # we need to just return without initializing in that case. + # However, we must not let any *other* threads in! 
+ _tls.is_initializing = True + + for calls in _lazy_seed_tracker.get_calls(): + if calls: + _queued_calls.append(calls) + + try: + for queued_call, orig_traceback in _queued_calls: + try: + queued_call() + except Exception as e: + msg = ( + f"CUDA call failed lazily at initialization with error: {str(e)}\n\n" + f"CUDA call was originally invoked at:\n\n{''.join(orig_traceback)}" + ) + raise DeferredCudaCallError(msg) from e + finally: + delattr(_tls, "is_initializing") + _initialized = True + + +def cudart(): + _lazy_init() + return _cudart + + +class cudaStatus: + SUCCESS: int = 0 + ERROR_NOT_READY: int = 34 + + +class CudaError(RuntimeError): + def __init__(self, code: int) -> None: + msg = _cudart.cudaGetErrorString(_cudart.cudaError(code)) + super().__init__(f"{msg} ({code})") + + +def check_error(res: int) -> None: + if res != _cudart.cudaError.success: + raise CudaError(res) + + +class _DeviceGuard: + def __init__(self, index: int): + self.idx = index + self.prev_idx = -1 + + def __enter__(self): + self.prev_idx = torch.cuda._exchange_device(self.idx) + + def __exit__(self, type: Any, value: Any, traceback: Any): + self.idx = torch.cuda._maybe_exchange_device(self.prev_idx) + return False + + +class device: + r"""Context-manager that changes the selected device. + + Args: + device (torch.device or int): device index to select. It's a no-op if + this argument is a negative integer or ``None``. + """ + + def __init__(self, device: Any): + self.idx = _get_device_index(device, optional=True) + self.prev_idx = -1 + + def __enter__(self): + self.prev_idx = torch.cuda._exchange_device(self.idx) + + def __exit__(self, type: Any, value: Any, traceback: Any): + self.idx = torch.cuda._maybe_exchange_device(self.prev_idx) + return False + + +class device_of(device): + r"""Context-manager that changes the current device to that of given object. + + You can use both tensors and storages as arguments. If a given object is + not allocated on a GPU, this is a no-op. 
+ + Args: + obj (Tensor or Storage): object allocated on the selected device. + """ + + def __init__(self, obj): + idx = obj.get_device() if obj.is_cuda else -1 + super().__init__(idx) + + +def set_device(device: _device_t) -> None: + r"""Set the current device. + + Usage of this function is discouraged in favor of :any:`device`. In most + cases it's better to use ``CUDA_VISIBLE_DEVICES`` environmental variable. + + Args: + device (torch.device or int): selected device. This function is a no-op + if this argument is negative. + """ + device = _get_device_index(device) + if device >= 0: + torch._C._cuda_setDevice(device) + + +def get_device_name(device: Optional[_device_t] = None) -> str: + r"""Get the name of a device. + + Args: + device (torch.device or int, optional): device for which to return the + name. This function is a no-op if this argument is a negative + integer. It uses the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + Returns: + str: the name of the device + """ + return get_device_properties(device).name + + +def get_device_capability(device: Optional[_device_t] = None) -> Tuple[int, int]: + r"""Get the cuda capability of a device. + + Args: + device (torch.device or int, optional): device for which to return the + device capability. This function is a no-op if this argument is + a negative integer. It uses the current device, given by + :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` + (default). + + Returns: + tuple(int, int): the major and minor cuda capability of the device + """ + prop = get_device_properties(device) + return prop.major, prop.minor + + +def get_device_properties(device: _device_t) -> _CudaDeviceProperties: + r"""Get the properties of a device. + + Args: + device (torch.device or int or str): device for which to return the + properties of the device. 
+ + Returns: + _CudaDeviceProperties: the properties of the device + """ + _lazy_init() # will define _get_device_properties + device = _get_device_index(device, optional=True) + if device < 0 or device >= device_count(): + raise AssertionError("Invalid device id") + return _get_device_properties(device) # type: ignore[name-defined] + + +def can_device_access_peer(device: _device_t, peer_device: _device_t) -> bool: + r"""Check if peer access between two devices is possible.""" + _lazy_init() + device = _get_device_index(device, optional=True) + peer_device = _get_device_index(peer_device) + if device < 0 or device >= device_count(): + raise AssertionError("Invalid device id") + if peer_device < 0 or peer_device >= device_count(): + raise AssertionError("Invalid peer device id") + return torch._C._cuda_canDeviceAccessPeer(device, peer_device) + + +class StreamContext: + r"""Context-manager that selects a given stream. + + All CUDA kernels queued within its context will be enqueued on a selected + stream. + + Args: + Stream (Stream): selected stream. This manager is a no-op if it's + ``None``. + .. note:: Streams are per-device. 
+ """ + cur_stream: Optional["torch.cuda.Stream"] + + def __init__(self, stream: Optional["torch.cuda.Stream"]): + self.stream = stream + self.idx = _get_device_index(None, True) + if not torch.jit.is_scripting(): + if self.idx is None: + self.idx = -1 + + self.src_prev_stream = ( + None if not torch.jit.is_scripting() else torch.cuda.default_stream(None) + ) + self.dst_prev_stream = ( + None if not torch.jit.is_scripting() else torch.cuda.default_stream(None) + ) + + def __enter__(self): + # Local cur_stream variable for type refinement + cur_stream = self.stream + # Return if stream is None or CUDA device not available + if cur_stream is None or self.idx == -1: + return + self.src_prev_stream = torch.cuda.current_stream(None) + + # If the stream is not on the current device, then + # set the current stream on the device + if self.src_prev_stream.device != cur_stream.device: + with device(cur_stream.device): + self.dst_prev_stream = torch.cuda.current_stream(cur_stream.device) + torch.cuda.set_stream(cur_stream) + + def __exit__(self, type: Any, value: Any, traceback: Any): + # Local cur_stream variable for type refinement + cur_stream = self.stream + # If stream is None or no CUDA device available, return + if cur_stream is None or self.idx == -1: + return + + # Reset the stream on the original device + # and destination device + if self.src_prev_stream.device != cur_stream.device: # type: ignore[union-attr] + torch.cuda.set_stream(self.dst_prev_stream) # type: ignore[arg-type] + torch.cuda.set_stream(self.src_prev_stream) # type: ignore[arg-type] + + +def stream(stream: Optional["torch.cuda.Stream"]) -> StreamContext: + r"""Wrap around the Context-manager StreamContext that selects a given stream. + + Arguments: + stream (Stream): selected stream. This manager is a no-op if it's + ``None``. + ..Note:: In eager mode stream is of type Stream class while in JIT it is + an object of the custom class ``torch.classes.cuda.Stream``. 
+ """ + return StreamContext(stream) + + +def _set_stream_by_id(stream_id, device_index, device_type): + r"""set stream specified by the stream id, device index and + device type + + Args: stream_id (int): stream id in stream pool + device_index (int): device index in topo + device_type (int): enum device type + """ + torch._C._cuda_setStream( + stream_id=stream_id, + device_index=device_index, + device_type=device_type, + ) + + +def set_stream(stream: Stream): + r"""Set the current stream.This is a wrapper API to set the stream. + Usage of this function is discouraged in favor of the ``stream`` + context manager. + + Args: + stream (Stream): selected stream. This function is a no-op + if this argument is ``None``. + """ + if stream is None: + return + _set_stream_by_id( + stream_id=stream.stream_id, + device_index=stream.device_index, + device_type=stream.device_type, + ) + + +def _parse_visible_devices() -> Union[List[int], List[str]]: + r"""Parse CUDA_VISIBLE_DEVICES environment variable.""" + var = os.getenv("CUDA_VISIBLE_DEVICES") + if var is None: + return list(range(64)) + + def _strtoul(s: str) -> int: + """Return -1 or positive integer sequence string starts with.""" + if not s: + return -1 + for idx, c in enumerate(s): + if not (c.isdigit() or (idx == 0 and c in "+-")): + break + if idx + 1 == len(s): + idx += 1 + return int(s[:idx]) if idx > 0 else -1 + + def parse_list_with_prefix(lst: str, prefix: str) -> List[str]: + rcs: List[str] = [] + for elem in lst.split(","): + # Repeated id results in empty set + if elem in rcs: + return cast(List[str], []) + # Anything other but prefix is ignored + if not elem.startswith(prefix): + break + rcs.append(elem) + return rcs + + if var.startswith("GPU-"): + return parse_list_with_prefix(var, "GPU-") + if var.startswith("MIG-"): + return parse_list_with_prefix(var, "MIG-") + # CUDA_VISIBLE_DEVICES uses something like strtoul + # which makes `1gpu2,2ampere` is equivalent to `1,2` + rc: List[int] = [] + for elem in 
var.split(","): + x = _strtoul(elem.strip()) + # Repeated ordinal results in empty set + if x in rc: + return cast(List[int], []) + # Negative value aborts the sequence + if x < 0: + break + rc.append(x) + return rc + + +def _raw_device_count_nvml() -> int: + r"""Return number of devices as reported by NVML or negative value if NVML discovery/initialization failed.""" + from ctypes import byref, c_int, CDLL + + nvml_h = CDLL("libnvidia-ml.so.1") + rc = nvml_h.nvmlInit() + if rc != 0: + warnings.warn("Can't initialize NVML") + return -1 + dev_count = c_int(-1) + rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) + if rc != 0: + warnings.warn("Can't get nvml device count") + return -1 + del nvml_h + return dev_count.value + + +def _raw_device_uuid_nvml() -> Optional[List[str]]: + r"""Return list of device UUID as reported by NVML or None if NVM discovery/initialization failed.""" + from ctypes import byref, c_int, c_void_p, CDLL, create_string_buffer + + nvml_h = CDLL("libnvidia-ml.so.1") + rc = nvml_h.nvmlInit() + if rc != 0: + warnings.warn("Can't initialize NVML") + return None + dev_count = c_int(-1) + rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) + if rc != 0: + warnings.warn("Can't get nvml device count") + return None + uuids: List[str] = [] + for idx in range(dev_count.value): + dev_id = c_void_p() + rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id)) + if rc != 0: + warnings.warn("Can't get device handle") + return None + buf_len = 96 + buf = create_string_buffer(buf_len) + rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len) + if rc != 0: + warnings.warn("Can't get device UUID") + return None + uuids.append(buf.raw.decode("ascii").strip("\0")) + del nvml_h + return uuids + + +def _transform_uuid_to_ordinals(candidates: List[str], uuids: List[str]) -> List[int]: + r"""Given the set of partial uuids and list of known uuids builds a set of ordinals excluding ambiguous partials IDs.""" + + def uuid_to_orinal(candidate: str, uuids: List[str]) -> 
int: + best_match = -1 + for idx, uuid in enumerate(uuids): + if not uuid.startswith(candidate): + continue + # Ambiguous candidate + if best_match != -1: + return -1 + best_match = idx + return best_match + + rc: List[int] = [] + for candidate in candidates: + idx = uuid_to_orinal(candidate, uuids) + # First invalid ordinal stops parsing + if idx < 0: + break + # Duplicates result in empty set + if idx in rc: + return cast(List[int], []) + rc.append(idx) + return rc + + +def _device_count_nvml() -> int: + r"""Return number of devices as reported by NVML taking CUDA_VISIBLE_DEVICES into account. + + Negative value is returned if NVML discovery or initialization has failed. + """ + visible_devices = _parse_visible_devices() + if not visible_devices: + return 0 + try: + if type(visible_devices[0]) is str: + # Skip MIG parsing + if visible_devices[0].startswith("MIG-"): + return -1 + uuids = _raw_device_uuid_nvml() + if uuids is None: + return -1 + visible_devices = _transform_uuid_to_ordinals( + cast(List[str], visible_devices), uuids + ) + else: + raw_cnt = _raw_device_count_nvml() + if raw_cnt <= 0: + return raw_cnt + # Trim the list up to a maximum available device + for idx, val in enumerate(visible_devices): + if cast(int, val) >= raw_cnt: + return idx + except OSError: + return -1 + except AttributeError: + return -1 + return len(visible_devices) + + +def _get_nvml_device_index(device: Optional[Union[int, Device]]) -> int: + r"""Return the NVML index of the device, taking CUDA_VISIBLE_DEVICES into account.""" + idx = _get_device_index(device, optional=True) + visible_devices = _parse_visible_devices() + if type(visible_devices[0]) is str: + uuids = _raw_device_uuid_nvml() + if uuids is None: + raise RuntimeError("Can't get device UUIDs") + visible_devices = _transform_uuid_to_ordinals( + cast(List[str], visible_devices), uuids + ) + visible_devices = cast(List[int], visible_devices) + if idx < 0 or idx >= len(visible_devices): + raise RuntimeError( + f"device 
{idx} is not visible (CUDA_VISIBLE_DEVICES={visible_devices})" + ) + return visible_devices[idx] + + +@lru_cache(maxsize=1) +def device_count() -> int: + r"""Return the number of GPUs available.""" + if not _is_compiled(): + return 0 + # bypass _device_count_nvml() if rocm (not supported) + nvml_count = -1 if torch.version.hip else _device_count_nvml() + return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count + + +def get_arch_list() -> List[str]: + r"""Return list CUDA architectures this library was compiled for.""" + if not is_available(): + return [] + arch_flags = torch._C._cuda_getArchFlags() + if arch_flags is None: + return [] + return arch_flags.split() + + +def get_gencode_flags() -> str: + r"""Return NVCC gencode flags this library was compiled with.""" + arch_list = get_arch_list() + if len(arch_list) == 0: + return "" + arch_list_ = [arch.split("_") for arch in arch_list] + return " ".join( + [ + f"-gencode compute=compute_{arch},code={kind}_{arch}" + for (kind, arch) in arch_list_ + ] + ) + + +def current_device() -> int: + r"""Return the index of a currently selected device.""" + _lazy_init() + return torch._C._cuda_getDevice() + + +def synchronize(device: _device_t = None) -> None: + r"""Wait for all kernels in all streams on a CUDA device to complete. + + Args: + device (torch.device or int, optional): device for which to synchronize. + It uses the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + _lazy_init() + with torch.cuda.device(device): + return torch._C._cuda_synchronize() + + +def ipc_collect(): + r"""Force collects GPU memory after it has been released by CUDA IPC. + + .. note:: + Checks if any sent CUDA tensors could be cleaned from the memory. Force + closes shared memory file used for reference counting if there is no + active counters. Useful when the producer process stopped actively sending + tensors and want to release unused memory. 
+ """ + _lazy_init() + return torch._C._cuda_ipc_collect() + + +def current_stream(device: Optional[_device_t] = None) -> Stream: + r"""Return the currently selected :class:`Stream` for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + the currently selected :class:`Stream` for the current device, given + by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` + (default). + """ + _lazy_init() + streamdata = torch._C._cuda_getCurrentStream( + _get_device_index(device, optional=True) + ) + return Stream( + stream_id=streamdata[0], device_index=streamdata[1], device_type=streamdata[2] + ) + + +def default_stream(device: Optional[_device_t] = None) -> Stream: + r"""Return the default :class:`Stream` for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + the default :class:`Stream` for the current device, given by + :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` + (default). + """ + _lazy_init() + streamdata = torch._C._cuda_getDefaultStream( + _get_device_index(device, optional=True) + ) + return Stream( + stream_id=streamdata[0], device_index=streamdata[1], device_type=streamdata[2] + ) + + +def current_blas_handle(): + r"""Return cublasHandle_t pointer to current cuBLAS handle""" + _lazy_init() + return torch._C._cuda_getCurrentBlasHandle() + + +def set_sync_debug_mode(debug_mode: Union[int, str]) -> None: + r"""Set the debug mode for cuda synchronizing operations. + + Args: + debug_mode(str or int): if "default" or 0, don't error or warn on synchronizing operations, + if "warn" or 1, warn on synchronizing operations, if "error" or 2, error out synchronizing operations. + + Warning: + This is an experimental feature, and not all synchronizing operations will trigger warning or error. In + particular, operations in torch.distributed and torch.sparse namespaces are not covered yet. 
+ """ + _lazy_init() + if isinstance(debug_mode, str): + if debug_mode == "default": + debug_mode = 0 + elif debug_mode == "warn": + debug_mode = 1 + elif debug_mode == "error": + debug_mode = 2 + else: + raise RuntimeError( + "invalid value of debug_mode, expected one of `default`, `warn`, `error`" + ) + + torch._C._cuda_set_sync_debug_mode(debug_mode) + + +def get_sync_debug_mode() -> int: + r"""Return current value of debug mode for cuda synchronizing operations.""" + _lazy_init() + return torch._C._cuda_get_sync_debug_mode() + + +def _get_pynvml_handler(device: Optional[Union[Device, int]] = None): + if not _HAS_PYNVML: + raise ModuleNotFoundError( + "pynvml does not seem to be installed or it can't be imported." + ) from _PYNVML_ERR + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded as e: + raise RuntimeError("cuda driver can't be loaded, is cuda enabled?") from e + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + return handle + + +def memory_usage(device: Optional[Union[Device, int]] = None) -> int: + r"""Return the percent of time over the past sample period during which global (device) + memory was being read or written as given by `nvidia-smi`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + Warning: Each sample period may be between 1 second and 1/6 second, + depending on the product being queried. 
+ """ + handle = _get_pynvml_handler() + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + return pynvml.nvmlDeviceGetUtilizationRates(handle).memory + + +def utilization(device: Optional[Union[Device, int]] = None) -> int: + r"""Return the percent of time over the past sample period during which one or + more kernels was executing on the GPU as given by `nvidia-smi`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + Warning: Each sample period may be between 1 second and 1/6 second, + depending on the product being queried. + """ + handle = _get_pynvml_handler(device) + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + return pynvml.nvmlDeviceGetUtilizationRates(handle).gpu + + +def temperature(device: Optional[Union[Device, int]] = None) -> int: + r"""Return the average temperature of the GPU sensor in Degrees C (Centigrades). + + The average temperature is computed based on past sample period as given by `nvidia-smi`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + Warning: Each sample period may be between 1 second and 1/6 second, + depending on the product being queried. + """ + handle = _get_pynvml_handler(device) + # 0 refers to the temperature sensor for the GPU die. + return pynvml.nvmlDeviceGetTemperature(handle, 0) + + +def power_draw(device: Optional[Union[Device, int]] = None) -> int: + r"""Return the average power draw of the GPU sensor in mW (MilliWatts) + over the past sample period as given by `nvidia-smi` for Fermi or newer fully supported devices. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + Warning: Each sample period may be between 1 second and 1/6 second, + depending on the product being queried. + """ + handle = _get_pynvml_handler(device) + return pynvml.nvmlDeviceGetPowerUsage(handle) + + +def clock_rate(device: Optional[Union[Device, int]] = None) -> int: + r"""Return the clock speed of the GPU SM in Hz Hertz over the past sample period as given by `nvidia-smi`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + Warning: Each sample period may be between 1 second and 1/6 second, + depending on the product being queried. + """ + handle = _get_pynvml_handler(device) + return pynvml.nvmlDeviceGetClockInfo(handle, 1) + + +def _get_device(device: Union[int, str, torch.device]) -> torch.device: + r"""Return the torch.device type object from the passed in device. + + Args: + device (torch.device or int): selected device. + """ + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device("cuda", device) + return device + + +def _get_generator(device: torch.device) -> torch._C.Generator: + r"""Return the CUDA Generator object for the given device. + + Args: + device (torch.device): selected device. + """ + idx = device.index + if idx is None: + idx = current_device() + return torch.cuda.default_generators[idx] + + +def _set_rng_state_offset( + offset: int, device: Union[int, str, torch.device] = "cuda" +) -> None: + r"""Set the random number generator state offset of the specified GPU. + + Args: + offset (int): The desired offset + device (torch.device or int, optional): The device to set the RNG state. + Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device). 
+ """ + final_device = _get_device(device) + + def cb(): + default_generator = _get_generator(final_device) + default_generator.set_offset(offset) + + _lazy_call(cb) + + +def _get_rng_state_offset(device: Union[int, str, torch.device] = "cuda") -> int: + r"""Return the random number generator state offset of the specified GPU. + + Args: + device (torch.device or int, optional): The device to return the RNG state offset of. + Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device). + + .. warning:: + This function eagerly initializes CUDA. + """ + _lazy_init() + final_device = _get_device(device) + default_generator = _get_generator(final_device) + return default_generator.get_offset() + + +from .memory import * # noqa: F403 + + +from .random import * # noqa: F403 + +################################################################################ +# Define Storage and Tensor classes +################################################################################ + + +@staticmethod # type: ignore[misc] +def _lazy_new(cls, *args, **kwargs): + _lazy_init() + # We may need to call lazy init again if we are a forked child + # del _CudaBase.__new__ + return super(_CudaBase, cls).__new__(cls, *args, **kwargs) + + +class _CudaBase: + is_cuda = True + is_sparse = False + + def type(self, *args, **kwargs): + # We could use a Protocol here to tell mypy that self has `get_device` method + # but it is only available in the typing module on Python >= 3.8 + # or on typing_extensions module on Python >= 3.6 + with device(self.get_device()): # type: ignore[attr-defined] + return super().type(*args, **kwargs) # type: ignore[misc] + + __new__ = _lazy_new + + +from torch.storage import _LegacyStorage, _warn_typed_storage_removal + + +class _CudaLegacyStorage(_LegacyStorage): + @classmethod + def from_buffer(cls, *args, **kwargs): + _warn_typed_storage_removal() + raise RuntimeError("from_buffer: Not available for CUDA storage") + + @classmethod + def 
_new_with_weak_ptr(cls, *args, **kwargs): + raise RuntimeError("_new_with_weak_ptr: Not available for CUDA storage") + + @classmethod + def _new_shared_filename(cls, manager, obj, size, *, device=None, dtype=None): + raise RuntimeError("_new_shared_filename: Not available for CUDA storage") + + +class ByteStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.uint8 + + +class DoubleStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.double + + +class FloatStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.float + + +class HalfStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.half + + +class LongStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.long + + +class IntStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.int + + +class ShortStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.short + + +class CharStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.int8 + + +class BoolStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def 
_dtype(self): + return torch.bool + + +class BFloat16Storage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.bfloat16 + + +class ComplexDoubleStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.cdouble + + +class ComplexFloatStorage(_CudaLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.cfloat + + +del _LegacyStorage +del _CudaLegacyStorage + +torch._storage_classes.add(DoubleStorage) +torch._storage_classes.add(FloatStorage) +torch._storage_classes.add(LongStorage) +torch._storage_classes.add(IntStorage) +torch._storage_classes.add(ShortStorage) +torch._storage_classes.add(CharStorage) +torch._storage_classes.add(ByteStorage) +torch._storage_classes.add(HalfStorage) +torch._storage_classes.add(BoolStorage) +torch._storage_classes.add(BFloat16Storage) +torch._storage_classes.add(ComplexDoubleStorage) +torch._storage_classes.add(ComplexFloatStorage) + + +class _WrappedTritonKernel: + """Just a simple wrapper to store some metadata for testing purposes.""" + + def __init__(self, kernel): + self.kernel = kernel + self.kernel_invoked = False + + def __call__(self, *args, **kwargs): + res = self.kernel(*args, **kwargs) + self.kernel_invoked = True + return res + + +def _register_triton_kernels(): + if torch._running_with_deploy(): + return + + @_WrappedTritonKernel + def kernel_impl(*args, **kwargs): + from torch.sparse._triton_ops import bsr_dense_mm + + return bsr_dense_mm(*args, skip_checks=True, **kwargs) + + @_WrappedTritonKernel + def addmm_kernel_impl(*args, **kwargs): + from torch.sparse._triton_ops import bsr_dense_addmm + + return bsr_dense_addmm(*args, skip_checks=True, **kwargs) + + has_triton = 
importlib.util.find_spec("triton") is not None + if has_triton: + torch._TritonLibrary.registerOp( + "_triton_bsr_dense_mm_out", + "_triton_bsr_dense_mm_out(Tensor bsr, Tensor dense, *, Tensor(a!) out) -> Tensor(a!)", + kernel_impl, + "SparseCsrCUDA", + ) + + torch._TritonLibrary.registerOp( + "_triton_bsr_dense_addmm_out", + ( + "_triton_bsr_dense_addmm_out(Tensor input, Tensor bsr, Tensor dense," + " *, Scalar beta, Scalar alpha, Tensor(a!) out) -> Tensor(a!)" + ), + addmm_kernel_impl, + "SparseCsrCUDA", + ) + + +_lazy_call(_register_triton_kernels) + + +from . import amp, jiterator, nvtx, profiler, sparse + +__all__ = [ + # Typed storage and tensors + "BFloat16Storage", + "BFloat16Tensor", + "BoolStorage", + "BoolTensor", + "ByteStorage", + "ByteTensor", + "CharStorage", + "CharTensor", + "ComplexDoubleStorage", + "ComplexFloatStorage", + "DoubleStorage", + "DoubleTensor", + "FloatStorage", + "FloatTensor", + "HalfStorage", + "HalfTensor", + "IntStorage", + "IntTensor", + "LongStorage", + "LongTensor", + "ShortStorage", + "ShortTensor", + "CUDAGraph", + "CudaError", + "DeferredCudaCallError", + "Event", + "ExternalStream", + "OutOfMemoryError", + "Stream", + "StreamContext", + "amp", + "caching_allocator_alloc", + "caching_allocator_delete", + "can_device_access_peer", + "check_error", + "cudaStatus", + "cudart", + "current_blas_handle", + "current_device", + "current_stream", + "default_generators", + "default_stream", + "device", + "device_count", + "device_of", + "empty_cache", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "get_arch_list", + "get_device_capability", + "get_device_name", + "get_device_properties", + "get_gencode_flags", + "get_rng_state", + "get_rng_state_all", + "get_sync_debug_mode", + "graph", + "graph_pool_handle", + "graphs", + "has_half", + "has_magma", + "init", + "initial_seed", + "ipc_collect", + "is_available", + "is_bf16_supported", + "is_current_stream_capturing", + "is_initialized", + 
"jiterator", + "list_gpu_processes", + "make_graphed_callables", + "manual_seed", + "manual_seed_all", + "max_memory_allocated", + "max_memory_cached", + "max_memory_reserved", + "mem_get_info", + "memory", + "memory_allocated", + "memory_cached", + "memory_reserved", + "memory_snapshot", + "memory_stats", + "memory_stats_as_nested_dict", + "memory_summary", + "memory_usage", + "temperature", + "power_draw", + "clock_rate", + "nccl", + "nvtx", + "profiler", + "random", + "reset_accumulated_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "reset_peak_memory_stats", + "seed", + "seed_all", + "set_device", + "set_per_process_memory_fraction", + "set_rng_state", + "set_rng_state_all", + "set_stream", + "set_sync_debug_mode", + "sparse", + "stream", + "streams", + "synchronize", + "utilization", +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/memory.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/memory.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce5c3b1f96681facd89669c1832accefa7f8884b Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/memory.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/_sanitizer.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/_sanitizer.py new file mode 100644 index 0000000000000000000000000000000000000000..c0b0297366db73b31bd15a5ba7b30d86164b3f31 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/_sanitizer.py @@ -0,0 +1,622 @@ +r""" +This module introduces CUDA Sanitizer, a tool for detecting synchronization errors between kernels ran on different streams. + +It stores information on accesses to tensors to determine if they are synchronized +or not. 
When enabled in a python program and a possible data race is detected, a +detailed warning will be printed and the program will exit. + +It can be enabled either by importing this module and calling +:func:`enable_cuda_sanitizer()` or by exporting the ``TORCH_CUDA_SANITIZER`` +environment variable. +""" + +import enum +import functools +import inspect +import io +import logging +import sys +import textwrap +import traceback +from dataclasses import dataclass, field +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, TypeVar + +import torch +import torch.utils._cuda_trace as cuda_trace +from torch.utils import _pytree as pytree +from torch.utils._python_dispatch import TorchDispatchMode + + +DEFAULT_STREAM_ID = 0 + +TK = TypeVar("TK") +TVa = TypeVar("TVa") +TVb = TypeVar("TVb") + +DataPtr = int +StreamId = int +EventId = int +SeqNum = int + +logger = logging.getLogger(__name__) + + +class AccessType(enum.Enum): + READ = enum.auto() + WRITE = enum.auto() + + def __str__(self): + return "reading from" if self is AccessType.READ else "writing to" + + +@dataclass +class Access: + r"""Stores information about a single access to a tensor by a kernel. + + Args: + type: either AccessType.READ or AccessType.Write. + seq_num: the sequential number of the kernel performing the access. + stream: the stream id of the stream executing the kernel. + operator: the schema of the launched kernel, which lists the + arguments and return type. + aliases: the arguments in the schema this access corresponds to. + is_output: Whether the tensor was an output of the kernel. + stack_trace: the stack summary object captured during access. 
+ """ + + type: AccessType + seq_num: SeqNum + stream: StreamId + operator: str + aliases: List[str] + is_output: bool + stack_trace: traceback.StackSummary + + +class SynchronizationError(Exception): + """Base class for errors detected by CUDA Sanitizer.""" + + pass + + +class UnsynchronizedAccessError(SynchronizationError): + """Stores information about two unsynchronized accesses to one data pointer.""" + + def __init__( + self, + data_ptr: DataPtr, + allocation_stack_trace: Optional[traceback.StackSummary], + current_access: Access, + previous_access: Access, + ): + self.data_ptr = data_ptr + self.allocation_stack_trace = allocation_stack_trace + self.current_access = current_access + self.previous_access = previous_access + + def __str__(self): + def format_access(access: Access): + message.write(f"{access.operator}\n{access.type}") + if access.aliases: + message.write(" argument(s) " + ", ".join(access.aliases)) + if access.is_output: + message.write(", and to") + if access.is_output: + message.write(" the output") + message.write( + f"\nWith stack trace:\n{''.join(access.stack_trace.format())}\n" + ) + + with io.StringIO() as message: + message.write( + textwrap.dedent( + f"""\ + ============================ + CSAN detected a possible data race on tensor with data pointer {self.data_ptr} + Access by stream {self.current_access.stream} during kernel: + """ + ) + ) + format_access(self.current_access) + + message.write( + f"Previous access by stream {self.previous_access.stream} during kernel:\n" + ) + format_access(self.previous_access) + + if self.allocation_stack_trace: + message.write( + "Tensor was allocated with stack trace:\n" + f"{''.join(self.allocation_stack_trace.format())}" + ) + else: + message.write("Trace for tensor allocation not found.") + return message.getvalue() + + +class CUDASanitizerErrors(Exception): + """Wrapper class for errors reported by CUDA Sanitizer.""" + + def __init__(self, errors: List[SynchronizationError]): + self.errors = 
errors + + def __str__(self): + return f"detected {len(self.errors)} errors" + + +@dataclass +class TensorInfo: + r"""Stores information about a single tensor and recent accesses to it. + + Args: + allocation_stack_trace: the stack summary object captured during tensor + allocation. Can be ``None`` if the allocation wasn't caught by CSAN. + reads: list of read accesses to the tensor that were performed since + the last write. + write: the last write access to the tensor. + """ + + allocation_stack_trace: Optional[traceback.StackSummary] + reads: List[Access] = field(default_factory=list) + write: Optional[Access] = None + + +class _TensorsAccessed: + def __init__(self): + self.accesses: Dict[DataPtr, TensorInfo] = {} + + def ensure_tensor_exists(self, data_ptr: DataPtr) -> None: + if data_ptr not in self.accesses: + logger.info( + "Found tensor with pointer: %s, but no matching tensor " + "allocation in the trace. Backfilling the trace now. " + "Perhaps the sanitizer was enabled after some torch operations?", + data_ptr, + ) + self.create_tensor(data_ptr, None) + + def ensure_tensor_does_not_exist(self, data_ptr: DataPtr) -> None: + if data_ptr in self.accesses: + logger.info( + "Found duplicate tensor allocation in the trace for tensor with " + "pointer: %s. Assuming the trace for tensor deallocation " + "wasn't caught and backfilling it now. 
" + "Perhaps the sanitizer was enabled after some torch operations?", + data_ptr, + ) + self.delete_tensor(data_ptr) + + def create_tensor( + self, data_ptr: DataPtr, stack_trace: Optional[traceback.StackSummary] + ) -> None: + self.accesses[data_ptr] = TensorInfo(stack_trace) + + def delete_tensor(self, data_ptr: DataPtr) -> None: + del self.accesses[data_ptr] + + def were_there_reads_since_last_write(self, data_ptr: DataPtr) -> bool: + return True if self.accesses[data_ptr].reads else False + + def get_allocation_stack_trace( + self, data_ptr: DataPtr + ) -> Optional[traceback.StackSummary]: + return self.accesses[data_ptr].allocation_stack_trace + + def get_write(self, data_ptr: DataPtr) -> Optional[Access]: + return self.accesses[data_ptr].write + + def get_reads(self, data_ptr: DataPtr) -> List[Access]: + return self.accesses[data_ptr].reads + + def add_read(self, data_ptr: DataPtr, access: Access) -> None: + self.accesses[data_ptr].reads.append(access) + + def set_write(self, data_ptr: DataPtr, access: Access) -> None: + self.accesses[data_ptr].write = access + self.accesses[data_ptr].reads = [] + + +class StreamSynchronizations: + def __init__(self): + self.current_sync_states: Dict[StreamId, Dict[StreamId, SeqNum]] = {} + self.recorded_sync_states: Dict[EventId, Dict[StreamId, SeqNum]] = {} + self.host_sync_state: Dict[StreamId, SeqNum] = {} + self.create_stream(DEFAULT_STREAM_ID) + + def _ensure_stream_exists(self, stream: StreamId) -> None: + if stream not in self.current_sync_states: + logger.info( + "Found Stream with id: %s, but no matching stream " + "creation in the trace. Backfilling the trace now. " + "Perhaps the sanitizer was enabled after some torch operations?", + stream, + ) + self.create_stream(stream) + + def _ensure_event_exists(self, event: EventId) -> None: + if event not in self.recorded_sync_states: + logger.info( + "Found Event with id: %s, but no matching event " + "creation in the trace. Backfilling the trace now. 
" + "Perhaps the sanitizer was enabled after some torch operations?", + event, + ) + self.create_event(event) + + def _ensure_event_does_not_exist(self, event: EventId) -> None: + if event in self.recorded_sync_states: + logger.info( + "Found duplicate event creation in the trace for event with " + "id: %s. Assuming the trace for event deletion wasn't caught " + "and backfilling it now. " + "Perhaps the sanitizer was enabled after some torch operations?", + event, + ) + self.delete_event(event) + + def create_stream(self, stream: StreamId) -> None: + if stream in self.current_sync_states: + logger.info( + "Found duplicate Stream creation in the trace for Stream with " + "id: %s. PyTorch Streams are only created once, so this " + "trace entry is ignored.", + stream, + ) + else: + self.host_sync_state[stream] = 0 + self.current_sync_states[stream] = self.host_sync_state.copy() + + def create_event(self, event: EventId) -> None: + self._ensure_event_does_not_exist(event) + self.recorded_sync_states[event] = {} + + def delete_event(self, event: EventId) -> None: + self._ensure_event_exists(event) + del self.recorded_sync_states[event] + + def update_seq_num(self, stream: StreamId, seq_num: SeqNum) -> None: + self._ensure_stream_exists(stream) + self.current_sync_states[stream][stream] = seq_num + + def record_state(self, event: EventId, stream: StreamId) -> None: + self._ensure_event_exists(event) + self._ensure_stream_exists(stream) + self.recorded_sync_states[event] = self.current_sync_states[stream].copy() + + def _state_wait_for_other( + self, state: Dict[StreamId, SeqNum], other: Dict[StreamId, SeqNum] + ) -> None: + for stream, seq_num in other.items(): + state[stream] = max(state.get(stream, -1), seq_num) + + def stream_wait_for_event(self, stream: StreamId, event: EventId) -> None: + self._ensure_stream_exists(stream) + self._ensure_event_exists(event) + self._state_wait_for_other( + self.current_sync_states[stream], self.recorded_sync_states[event] + ) + + def 
all_streams_wait_for_event(self, event: EventId) -> None: + self._ensure_event_exists(event) + for stream in self.current_sync_states.keys(): + self.stream_wait_for_event(stream, event) + + self._state_wait_for_other( + self.host_sync_state, self.recorded_sync_states[event] + ) + + def all_streams_wait_for_stream(self, stream: StreamId) -> None: + self._ensure_stream_exists(stream) + for state in self.current_sync_states.values(): + self._state_wait_for_other(state, self.current_sync_states[stream]) + + self._state_wait_for_other( + self.host_sync_state, self.current_sync_states[stream] + ) + + def sync_all_streams(self) -> None: + for stream, state in self.current_sync_states.items(): + self.host_sync_state[stream] = state[stream] + + for state in self.current_sync_states.values(): + self._state_wait_for_other(state, self.host_sync_state) + + def is_ordered_after( + self, current_stream: StreamId, seq_num: SeqNum, other_stream: StreamId + ) -> bool: + self._ensure_stream_exists(current_stream) + self._ensure_stream_exists(other_stream) + return seq_num <= self.current_sync_states[current_stream].get(other_stream, -1) + + +class EventHandler: + """Analyzes CSAN trace for synchronization errors. + + Stores information on each stream's synchronizations with other streams as well + as tensor accesses to determine whether a given kernel launch might cause a + data race. 
+ """ + + def __init__(self): + self.tensors_accessed = _TensorsAccessed() + self.syncs = StreamSynchronizations() + self.seq_num: SeqNum = 0 + + def _handle_kernel_launch( + self, + stream: StreamId, + read_only: Set[DataPtr], + read_write: Set[DataPtr], + outputs: Set[DataPtr], + operator: str, + tensor_aliases: Dict[int, List[str]], + ) -> List[SynchronizationError]: + def check_conflict( + data_ptr: DataPtr, current_access: Access, previous_access: Optional[Access] + ) -> None: + if previous_access is None: + return + if not self.syncs.is_ordered_after( + current_access.stream, previous_access.seq_num, previous_access.stream + ): + error_list.append( + UnsynchronizedAccessError( + data_ptr, + self.tensors_accessed.get_allocation_stack_trace(data_ptr), + current_access, + previous_access, + ) + ) + + error_list: List[SynchronizationError] = [] + self.seq_num += 1 + self.syncs.update_seq_num(stream, self.seq_num) + stack_trace = traceback.StackSummary.extract( + traceback.walk_stack(inspect.currentframe()), lookup_lines=False + ) + # The stack trace generated in this way is in the inverse order, so it must be + # reversed. 
+ stack_trace.reverse() + + for data_ptr in read_only: + self.tensors_accessed.ensure_tensor_exists(data_ptr) + current_access = Access( + AccessType.READ, + self.seq_num, + stream, + operator, + tensor_aliases[data_ptr], + data_ptr in outputs, + stack_trace, + ) + check_conflict( + data_ptr, current_access, self.tensors_accessed.get_write(data_ptr) + ) + self.tensors_accessed.add_read(data_ptr, current_access) + + for data_ptr in read_write: + self.tensors_accessed.ensure_tensor_exists(data_ptr) + current_access = Access( + AccessType.WRITE, + self.seq_num, + stream, + operator, + tensor_aliases[data_ptr], + data_ptr in outputs, + stack_trace, + ) + if self.tensors_accessed.were_there_reads_since_last_write(data_ptr): + for previous_access in self.tensors_accessed.get_reads(data_ptr): + check_conflict(data_ptr, current_access, previous_access) + else: + check_conflict( + data_ptr, current_access, self.tensors_accessed.get_write(data_ptr) + ) + self.tensors_accessed.set_write(data_ptr, current_access) + + return error_list + + def _handle_event_creation(self, event: EventId) -> None: + self.syncs.create_event(event) + + def _handle_event_deletion(self, event: EventId) -> None: + self.syncs.delete_event(event) + + def _handle_event_record(self, event: EventId, stream: StreamId) -> None: + self.syncs.record_state(event, stream) + + def _handle_event_wait(self, event: EventId, stream: StreamId) -> None: + self.syncs.stream_wait_for_event(stream, event) + + def _handle_memory_allocation(self, data_ptr: DataPtr) -> None: + self.tensors_accessed.ensure_tensor_does_not_exist(data_ptr) + stack_trace = traceback.StackSummary.extract( + traceback.walk_stack(inspect.currentframe()), lookup_lines=False + ) + # The stack trace generated in this way is in the inverse order, so it must be + # reversed. 
+ stack_trace.reverse() + self.tensors_accessed.create_tensor( + data_ptr, + stack_trace, + ) + + def _handle_memory_deallocation(self, data_ptr: DataPtr) -> None: + self.tensors_accessed.ensure_tensor_exists(data_ptr) + self.tensors_accessed.delete_tensor(data_ptr) + + def _handle_stream_creation(self, stream: StreamId) -> None: + self.syncs.create_stream(stream) + + def _handle_device_synchronization(self) -> None: + self.syncs.sync_all_streams() + + def _handle_stream_synchronization(self, stream: StreamId) -> None: + self.syncs.all_streams_wait_for_stream(stream) + + def _handle_event_synchronization(self, event: EventId) -> None: + self.syncs.all_streams_wait_for_event(event) + + +def zip_by_key(a: Dict[TK, TVa], b: Dict[TK, TVb]) -> Iterator[Tuple[TK, TVa, TVb]]: + for arg, value in a.items(): + if arg in b: + yield arg, value, b[arg] + + +def zip_arguments( + schema: torch.FunctionSchema, args: Tuple[Any, ...], kwargs: Dict[str, Any] +) -> Iterator[Tuple[torch.Argument, Any]]: + schema_args = schema.arguments[: len(args)] + schema_kwargs = {arg.name: arg for arg in schema.arguments[len(args) :]} + + yield from zip(schema_args, args) + + for _, argument, value in zip_by_key(schema_kwargs, kwargs): + yield (argument, value) + + +class ArgumentHandler: + def __init__(self): + self.dataptrs_read: Set[DataPtr] = set() + self.dataptrs_written: Set[DataPtr] = set() + self.tensor_aliases: Dict[DataPtr, List[str]] = dict() + self.outputs: Set[DataPtr] = set() + + def _handle_argument( + self, + value: Any, + is_write: bool, + name: Optional[str] = None, + is_output: bool = False, + ) -> None: + if isinstance(value, torch.Tensor) and value.is_cuda: + data_ptr = value.data_ptr() + if is_write: + self.dataptrs_written.add(data_ptr) + else: + self.dataptrs_read.add(data_ptr) + + self.tensor_aliases.setdefault(data_ptr, []) + if name is not None: + self.tensor_aliases[data_ptr].append(name) + if is_output: + self.outputs.add(data_ptr) + + def parse_inputs( + self, + 
schema: torch.FunctionSchema, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + ) -> None: + for argument, value in zip_arguments(schema, args, kwargs): + is_write = argument.alias_info is not None and argument.alias_info.is_write + pytree.tree_map_( + functools.partial( + self._handle_argument, is_write=is_write, name=argument.name + ), + value, + ) + + def parse_outputs(self, outputs: Any) -> None: + pytree.tree_map_( + functools.partial(self._handle_argument, is_write=True, is_output=True), + outputs, + ) + + +class CUDASanitizerDispatchMode(TorchDispatchMode): + def __init__(self): + self.event_handler = EventHandler() + torch._C._activate_cuda_trace() + cuda_trace.register_callback_for_cuda_event_creation( + self.event_handler._handle_event_creation + ) + cuda_trace.register_callback_for_cuda_event_deletion( + self.event_handler._handle_event_deletion + ) + cuda_trace.register_callback_for_cuda_event_record( + self.event_handler._handle_event_record + ) + cuda_trace.register_callback_for_cuda_event_wait( + self.event_handler._handle_event_wait + ) + cuda_trace.register_callback_for_cuda_memory_allocation( + self.event_handler._handle_memory_allocation + ) + cuda_trace.register_callback_for_cuda_memory_deallocation( + self.event_handler._handle_memory_deallocation + ) + cuda_trace.register_callback_for_cuda_stream_creation( + self.event_handler._handle_stream_creation + ) + cuda_trace.register_callback_for_cuda_device_synchronization( + self.event_handler._handle_device_synchronization + ) + cuda_trace.register_callback_for_cuda_stream_synchronization( + self.event_handler._handle_stream_synchronization + ) + cuda_trace.register_callback_for_cuda_event_synchronization( + self.event_handler._handle_event_synchronization + ) + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + + argument_handler = ArgumentHandler() + argument_handler.parse_inputs(func._schema, args, kwargs) + + outputs = func(*args, 
**kwargs) + + argument_handler.parse_outputs(outputs) + errors = self.event_handler._handle_kernel_launch( + torch.cuda.current_stream().cuda_stream, + argument_handler.dataptrs_read - argument_handler.dataptrs_written, + argument_handler.dataptrs_written, + argument_handler.outputs, + func._schema, + argument_handler.tensor_aliases, + ) + if errors: + for error in errors: + print(error, file=sys.stderr) + raise CUDASanitizerErrors(errors) + + return outputs + + +class CUDASanitizer: + """Manages the lifetime of a CUDASanitizer dispatch mode object. + + The CUDASanitizer class wraps the entering/exiting functions of the dispatch mode + context manager in the enable function/destructor, respectively. This is to + explicitly set the lifetime of the dispatch mode object to that of the application. + This approach was deemed more elegant than using the atexit module. + """ + + def __init__(self): + self.dispatch = CUDASanitizerDispatchMode() + self.enabled = False + + def enable(self): + self.dispatch.__enter__() + self.enabled = True + + def __del__(self): + if self.enabled: + self.dispatch.__exit__(None, None, None) + + +def enable_cuda_sanitizer(): + """Enable CUDA Sanitizer. + + The sanitizer will begin to analyze low-level CUDA calls invoked by torch functions + for synchronization errors. All data races found will be printed to the standard + error output along with stack traces of suspected causes. For best results, the + sanitizer should be enabled at the very beginning of the program. 
+ """ + cuda_sanitizer.enable() + + +cuda_sanitizer = CUDASanitizer() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/amp/__pycache__/grad_scaler.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/amp/__pycache__/grad_scaler.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb9ee0705b4a8b3296e4717f78396cf8ae2b5ece Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/amp/__pycache__/grad_scaler.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/comm.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/comm.py new file mode 100644 index 0000000000000000000000000000000000000000..2ea23c2072d86a61db643fcfbfb799e97267e5e9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/comm.py @@ -0,0 +1,18 @@ +# The functions here have been moved to torch.nn.parallel.comm +from torch.nn.parallel.comm import ( + broadcast, + broadcast_coalesced, + gather, + reduce_add, + reduce_add_coalesced, + scatter, +) + +__all__ = [ + "broadcast", + "broadcast_coalesced", + "reduce_add", + "reduce_add_coalesced", + "scatter", + "gather", +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/memory.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/memory.py new file mode 100644 index 0000000000000000000000000000000000000000..60440c58dc1d057b744fc91a6254757b74839225 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/memory.py @@ -0,0 +1,914 @@ +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature + +from typing import Any, Dict, Optional, Tuple, 
Union + +import torch +from torch import _C + +from torch.types import Device +from .._utils import _dummy_type +from . import _get_device_index, _get_nvml_device_index, _lazy_init, is_initialized + +from ._memory_viz import memory as _memory, segments as _segments + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. 
+ """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. 
+ - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> Dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + warnings.warn( + "torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved", + FutureWarning, + ) + return memory_reserved(device=device) + + +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + warnings.warn( + "torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved", + FutureWarning, + ) + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." 
+ + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + " {:<21} | {} | {} | {} | {} ".format( + submetric_name, + formatter(current, current_prefval), + formatter(peak, peak_prefval), + formatter(allocated, allocated_prefval), + formatter(freed, freed_prefval), + ), + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + " {:<21} | {} | {} | {} | {} ".format( + metric_name, + formatter(current, current), + formatter(peak, peak), + formatter(allocated, allocated), + formatter(freed, freed), + ), + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ """ + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + mem = p.usedGpuMemory / (1024 * 1024) + lines.append(f"process {p.pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> Tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + ) + + +def _record_memory_history(enabled="all", *args, **kwargs): + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. 
+ + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. 
+ """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. 
+ size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. 
+ + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. 
The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. 
note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_compatibility.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_compatibility.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cceaa42fd40faee5b2edc08bd0971a003eef434e Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_compatibility.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_symbolic_trace.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_symbolic_trace.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a610ac0386e856f0cb8d45596c9216381063379a Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_symbolic_trace.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/annotate.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/annotate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80f345d9f55c6cf2b28038dc3a9f537758c190b6 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/annotate.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/config.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48a9d1734665287fde13236e8937c67a054264b4 
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/config.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/node.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/node.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0ea80f6ddf1f353c9d65b9611a34eb96cd915e1 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/node.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/traceback.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/traceback.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa49fe2a3fbf0d33768b475eaf6d70ced19e43b6 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/traceback.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/fake_tensor_prop.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/fake_tensor_prop.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ef41bb4a298c12acfd5f112bdec5ff17391c22e Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/fake_tensor_prop.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/net_min_base.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/net_min_base.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d9f6b5b9c34c92ecca468f34d393c74b7022a4c4 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/net_min_base.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/param_fetch.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/param_fetch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07b94a049216103d8fd9b66c57b802701db3afa2 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/param_fetch.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/split_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/split_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e99a6c7683bee805285ad20b5e027209c9aea513 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/split_utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/splitter_base.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/splitter_base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e05f43ec71663ccc219e1cff260122a59f04d6e7 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/splitter_base.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/tools_common.cpython-311.pyc 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/tools_common.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e199b0e262bd724f4786adacd3a79b7e0b2b385 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/__pycache__/tools_common.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/backends/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/backends/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6595359d20574b5ef4d99297044415e60396ba5b Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/backends/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/backends/cudagraphs.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/backends/cudagraphs.py new file mode 100644 index 0000000000000000000000000000000000000000..d423de930dc7e8480a4f629ebc542782b0576b21 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/backends/cudagraphs.py @@ -0,0 +1,56 @@ +import torch +from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner +from torch.fx.passes.operator_support import OperatorSupport +from torch.fx.passes.tools_common import CALLABLE_NODE_OPS +from torch.fx.passes.fake_tensor_prop import FakeTensorProp +from torch.utils import _pytree as pytree + +import operator + +class CudaGraphsSupport(OperatorSupport): + # TODO: why is submodules passed here + def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: + if node.op not in CALLABLE_NODE_OPS: + return False + + if node.target in 
[torch.ops.aten.embedding_dense_backward.default]: + return False + + if node.target in [operator.getitem]: + return True + + found_not_cuda = False + + def meta_fk(meta): + return meta["val"] if "val" in meta else meta["fake_result"] + + def find_not_cuda(t): + nonlocal found_not_cuda + if isinstance(t, torch.Tensor) and t.device.type != 'cuda': + found_not_cuda = True + + for n in node.all_input_nodes: + pytree.tree_map_(find_not_cuda, meta_fk(n.meta)) + + pytree.tree_map_(find_not_cuda, meta_fk(node.meta)) + + # NB: factory function is accounted for because the result would be + # cpu or cuda + + return not found_not_cuda + +def partition_cudagraphs(gm, inputs): + """ + Partition an FX graph into sub-GraphModules that can be validly run under + CUDA graphs. For a subgraph to be runnable under CUDA, all of the operations + must involve CUDA tensors only/ + """ + + FakeTensorProp(gm).propagate(*inputs) + supported_ops = CudaGraphsSupport() + # TODO: single node partition may be wrong due to the pessimization + # from copying in and out the data. 
Check in benchmarks, perhaps + partitioner = CapabilityBasedPartitioner(gm, supported_ops, allows_single_node_partition=True) + partitions = partitioner.propose_partitions() + fused_graph = partitioner.fuse_partitions(partitions) + return fused_graph diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/infra/__pycache__/pass_base.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/infra/__pycache__/pass_base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..472c5b4f2ac04ad832913d974333d86fd827a272 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/infra/__pycache__/pass_base.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06ae7e3164f22c056bf9b810086305bdb97dddfb Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/__pycache__/test_pass_manager.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/__pycache__/test_pass_manager.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d0ed6c0738e1c90d5360ca5593a5bbdd268afea Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/__pycache__/test_pass_manager.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/test_pass_manager.py 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/test_pass_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..60ed6671179b2c20fa0be176631d1415009ee87a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/tests/test_pass_manager.py @@ -0,0 +1,58 @@ +import unittest + +from ..pass_manager import ( + inplace_wrapper, + PassManager, + these_before_those_pass_constraint, + this_before_that_pass_constraint, +) + + +class TestPassManager(unittest.TestCase): + def test_pass_manager_builder(self) -> None: + passes = [lambda x: 2 * x for _ in range(10)] + pm = PassManager(passes) + pm.validate() + + def test_this_before_that_pass_constraint(self) -> None: + passes = [lambda x: 2 * x for _ in range(10)] + pm = PassManager(passes) + + # add unfulfillable constraint + pm.add_constraint(this_before_that_pass_constraint(passes[-1], passes[0])) + + self.assertRaises(RuntimeError, pm.validate) + + def test_these_before_those_pass_constraint(self) -> None: + passes = [lambda x: 2 * x for _ in range(10)] + constraint = these_before_those_pass_constraint(passes[-1], passes[0]) + pm = PassManager( + [inplace_wrapper(p) for p in passes] + ) + + # add unfulfillable constraint + pm.add_constraint(constraint) + + self.assertRaises(RuntimeError, pm.validate) + + def test_two_pass_managers(self) -> None: + """Make sure we can construct the PassManager twice and not share any + state between them""" + + passes = [lambda x: 2 * x for _ in range(3)] + constraint = these_before_those_pass_constraint(passes[0], passes[1]) + pm1 = PassManager() + for p in passes: + pm1.add_pass(p) + pm1.add_constraint(constraint) + output1 = pm1(1) + self.assertEqual(output1, 2 ** 3) + + passes = [lambda x: 3 * x for _ in range(3)] + constraint = these_before_those_pass_constraint(passes[0], passes[1]) + pm2 = PassManager() + for p in passes: + pm2.add_pass(p) + pm2.add_constraint(constraint) + output2 
= pm2(1) + self.assertEqual(output2, 3 ** 3) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2a7970ba4c283e851430ed0025e1ed5c772eb7b1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__init__.py @@ -0,0 +1 @@ +from .common import lift_subgraph_as_module, HolderModule, compare_graphs diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/common.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/common.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d9a85eb023713d04f82f8b86758454941608283 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/common.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/fuser_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/fuser_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..642c0de09f7ecee8923217939ee566ddb14ffc0d Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/fuser_utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/matcher_with_name_node_map_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/matcher_with_name_node_map_utils.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..6ffb9f6f04dc0a813730bd986818e2446f5b40a0 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/__pycache__/matcher_with_name_node_map_utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/fuser_utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/fuser_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8fb328e8596d50bd3d996ff2b2948b1e83c7f05f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/fuser_utils.py @@ -0,0 +1,233 @@ +import copy +from queue import SimpleQueue +from typing import List, Dict, Tuple + +import torch.fx +from torch.fx.graph_module import GraphModule +from torch.fx.graph import Graph +from torch.fx.node import Node +from torch.fx.passes.tools_common import NodeList, NodeSet, legalize_graph +from torch.fx.passes.utils import lift_subgraph_as_module +from torch.fx._compatibility import compatibility + +@compatibility(is_backward_compatible=False) +def topo_sort(nodes: NodeList) -> NodeList: + # sort nodes according to the topological order + indegree_map = dict.fromkeys(nodes, 0) + candidates: SimpleQueue = SimpleQueue() + + for node in nodes: + for n in node.all_input_nodes: + if n in indegree_map: + indegree_map[node] += 1 + if indegree_map[node] == 0: + candidates.put(node) + + sorted_nodes: NodeList = list() + while not candidates.empty(): + node = candidates.get() + sorted_nodes.append(node) + + for n in node.users: + if n in indegree_map: + indegree_map[n] -= 1 + if indegree_map[n] == 0: + candidates.put(n) + + assert len(nodes) == len(sorted_nodes), "topological sorted nodes doesn't have same length as input nodes" + + return sorted_nodes + + +@compatibility(is_backward_compatible=False) +def validate_partition(partition: NodeList) -> bool: + # 
verify the partition does't form a dependency cycle in the original graph + # returns True for valid partition, False for invalid + + partition_set = set(partition) + + outputs: NodeList = list() + for node in partition_set: + for user_node in node.users: + if user_node not in partition_set: + # external user node, need to expose as an output + outputs.append(user_node) + + # Perform BFS on the partition outputs. + # If it reaches a node within the partition, then it found a cycle. + # This function takes the ownership of `root_nodes` and may modify it. + def bfs_find_cycle(root_nodes: NodeList) -> bool: + # Set used to exclude nodes that have already been visited. + # If a node has been visited, that node and all its children have + # been checked for cycles. + visited: NodeSet = set() + + # Start with `root_nodes` and traverse through (toward child nodes) + # their connected sub-graph. Nodes in `visited` won't be added + # to `queue` again. + queue: NodeList = root_nodes + while queue: + current = queue.pop() + visited.add(current) + if current in partition_set: + # Started from partition's `output` nodes, and reached + # another node in partition. Cycle! + return True + for user_node in current.users: + if user_node in visited: + continue + queue.append(user_node) + # `root_nodes` don't cause cycle. + return False + + # Use all output nodes as roots to traverse + # the graph to check cycles. + if bfs_find_cycle(outputs): + return False + + return True + + +@compatibility(is_backward_compatible=False) +def fuse_as_graphmodule(gm: GraphModule, + nodes: NodeList, + module_name: str) -> Tuple[GraphModule, Tuple[Node, ...], Tuple[Node, ...]]: + + """ + Fuse nodes in graph_module into a GraphModule. 
+ + Args: + gm (GraphModule): target graph_module + + nodes (List[Node]): list of nodes in `gm` to fuse, where the node must be topologically sorted + + module_name: class name for the fused GraphModule + + Returns: + fused_gm (GraphModule): fused graph module, where its node is a copy of `nodes` in `gm` + + original_inputs (Tuple[Node, ...]): input nodes to `nodes` in original `gm` + + original_outputs (Tuple[Node, ...]): consumer nodes of `nodes` in original `gm` + + """ + + # assumption: nodes are already sorted in topo order + + for node in nodes: + assert node.graph.owning_module is gm, f"{node} doesn't belong to passed in graph module {gm._get_name()}" + assert not node._erased, f"{node} has been removed from owning graph" + assert node in gm.graph.nodes, f"{node} is not found in graph module {gm._get_name()}" + + # validates partition doesn't introduce dependency circles in the graph + assert validate_partition(nodes), "Invalid partition, found dependency cycles" + + subgraph = Graph() + + node_to_placeholder: Dict[Node, Node] = {} # mapping of nodes from old graph to placeholder in new graph + node_map: Dict[Node, Node] = {} # mapping of nodes from old graph to new graph + + # handles inputs through graph.node_copy's arg_transform functions + def remap_inputs(x): + if x.op == "get_attr": + # TODO: do we really need copy the get_attr node into the graph? 
+ # do something here + pass + + if x in nodes: + # x is inside subgraph, return the copied node + # the node should have been copied aleady, as we are copying graph in the topological order + return node_map[x] + + if x not in node_to_placeholder: + # x is not in subgraph, create a new placeholder for subgraph + placeholder_node = subgraph.placeholder(x.name, type_expr=x.type) + # copy all meta fields, even if some fields might be irrelvant for the placeholder node + placeholder_node.meta = copy.copy(x.meta) + node_to_placeholder[x] = placeholder_node + + return node_to_placeholder[x] + + # copy nodes in topological order + for node in nodes: + new_node = subgraph.node_copy(node, remap_inputs) + node_map[node] = new_node + + # handles outputs + output_mapping: Dict[Node, Node] = {} # mapping from old output to new outputs + + for node in nodes: + for user_node in node.users: + if user_node not in nodes: + # external user node, need to expose as an output + output_mapping[node] = node_map[node] + + # outs contain nodes in the new subgraph + outs = tuple(output_mapping.values()) + + # Take care of the args of FX output node. If there's a single + # output then the output node args is like (output_single), else + # if there're multiple outputs then the output node args is like + # ((output_0, output_1, ...)). + subgraph.output(outs[0] if len(outs) == 1 else outs) + + # lint to ensure correctness + subgraph.lint() + fused_gm: GraphModule + fused_gm, _ = lift_subgraph_as_module(gm, subgraph, comp_name="", class_name=module_name) + + # sub_gm's input nodes in the original module + original_inputs: Tuple[Node, ...] = tuple(node_to_placeholder.keys()) + + # sub_gm's outputs node in the original module + original_outputs: Tuple[Node, ...] 
= tuple(output_mapping.keys()) + + return fused_gm, original_inputs, original_outputs + + +@compatibility(is_backward_compatible=False) +def insert_subgm(gm: GraphModule, sub_gm: GraphModule, orig_inputs: Tuple[Node, ...], orig_outputs: Tuple[Node, ...]): + # add sub_gm into gm + submodule_name = sub_gm.__class__.__name__ + gm.add_submodule(submodule_name, sub_gm) + + # Create a call_module node in main graph. + module_node = gm.graph.call_module( + submodule_name, + args=orig_inputs, + kwargs=None) + + if len(orig_outputs) == 1: + # main_remapping[comp.orig_outputs[0]] = module_node + orig_outputs[0].replace_all_uses_with(module_node, propagate_meta=True) + else: + for i, orig_output in enumerate(orig_outputs): + # Use Proxy to record getitem access. + proxy_out = torch.fx.Proxy(module_node)[i].node # type: ignore[index] + orig_output.replace_all_uses_with(proxy_out, propagate_meta=True) + return gm + +@compatibility(is_backward_compatible=False) +def erase_nodes(gm: GraphModule, nodes: NodeList): + + # erase original nodes in inversed topological order + for node in reversed(nodes): + gm.graph.erase_node(node) + + +@compatibility(is_backward_compatible=False) +def fuse_by_partitions(gm: GraphModule, partitions: List[NodeList]) -> GraphModule: + for partition_id, nodes in enumerate(partitions): + sorted_nodes = topo_sort(nodes) + + submodule_name = "fused_" + str(partition_id) + sub_gm, orig_inputs, orig_outputs = fuse_as_graphmodule(gm, sorted_nodes, submodule_name) + + insert_subgm(gm, sub_gm, orig_inputs, orig_outputs) + + erase_nodes(gm, sorted_nodes) + + # topological sort original gm with newly created sub_gm + legalize_graph(gm) + + return gm diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/matcher_with_name_node_map_utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/matcher_with_name_node_map_utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..c27fe5599639e9f5235b57dcdc94c71d93019e87 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/passes/utils/matcher_with_name_node_map_utils.py @@ -0,0 +1,113 @@ +from typing import Dict, List, Tuple + +from torch.fx import Graph, GraphModule, Node + +from torch.fx._compatibility import compatibility +from .matcher_utils import InternalMatch, SubgraphMatcher + +__all__ = ["SubgraphMatcherWithNameNodeMap"] + + +def _split_to_graph_and_name_node_map( + gm: GraphModule, +) -> Tuple[GraphModule, Dict[str, Node]]: + from torch.fx.graph import _PyTreeInfo + from torch.utils._pytree import tree_flatten, tree_unflatten + + name_node_map = {} + for n in gm.graph.nodes: + if n.op == "output": + assert gm._out_spec is not None + output = tree_unflatten(n.args[0], gm._out_spec) + assert isinstance( + output, tuple + ), "Expecting the pattern graph to return a tuple" + assert ( + len(output) >= 2 + ), "Expecting the pattern graph to have at least two outputs" + *out, name_node_map = output + flattened, out_spec = tree_flatten(out) + assert isinstance( + name_node_map, Dict + ), "Expecting the input graph to have a dict output as the last element" + n.args = (flattened,) + orig_pytree_info = gm._graph._codegen.pytree_info + gm._graph._codegen.pytree_info = _PyTreeInfo( + orig_pytree_info.orig_args, orig_pytree_info.in_spec, out_spec + ) + gm.recompile() + return gm, name_node_map + + +@compatibility(is_backward_compatible=False) +class SubgraphMatcherWithNameNodeMap(SubgraphMatcher): + """Extends SubgraphMatcher to support querying the matched subgraph nodes through node name, + this requires pattern to have specific format (returning and additional dictionary at the output, + that has node name as key, and the node in the pattern graph as value, see Example for more details) + + Difference with SubgraphMatcher is that it takes a `pattern_gm` GraphModule as input during + initialization since we need 
to modify the graph (which requires `recompile` the GraphModule) + + Example:: + def pattern(x, weight): + conv = F.conv2d(x, weight) + relu = F.relu(conv) + return relu, {"conv": conv, "relu": relu} + + def target_graph(x, weight): + conv = F.conv2d(x, weight) + relu = F.relu(conv) + relu *= 2 + return relu + + pattern_gm = capture_pre_autograd_graph(pattern, example_inputs) + target_gm = capture_pre_autograd_graph(target_graph, example_inputs) + matcher = SubgraphMatcherWithNameNodeMap(pattern_gm) + matches = matcher.match(target_gm) + for match in matches: + match.name_node_map["conv"].meta["annotation"] = ... + + """ + + def __init__( + self, + pattern_gm: GraphModule, + match_output: bool = False, + match_placeholder: bool = False, + remove_overlapping_matches: bool = True, + ignore_literals: bool = False, + ) -> None: + pattern_gm, name_node_map = _split_to_graph_and_name_node_map(pattern_gm) + self.name_node_map = name_node_map + super().__init__( + pattern_gm.graph, + match_output, + match_placeholder, + remove_overlapping_matches, + ignore_literals, + ) + + def match(self, graph: Graph) -> List[InternalMatch]: + """The returned InternalMatch will have name_node_map populated with a map + from node name (str) to the target node, e.g. + {"conv": target_conv_ndoe, "relu": target_relu_node} + + this requires the pattern graph returns an additional + output of node name to node, e.g. instead of: + ``` + def pattern(...): + ... + return relu + ``` + we should do: + ``` + def pattern(...): + ... 
+ return relu, {"conv": conv, "relu": relu} + ``` instead + """ + internal_matches = super().match(graph) + for internal_match in internal_matches: + for k, n in self.name_node_map.items(): + internal_match.name_node_map[k] = internal_match.nodes_map[n] + return internal_matches diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUApplyUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUApplyUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..5c524ef97c475a0529b7b18c430be0d39c350aa4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUApplyUtils.h @@ -0,0 +1,343 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace at { + +/* + * The basic strategy for apply is as follows: + * + * 1. Starting with the outermost index, loop until we reach a dimension where + * the data is no longer contiguous, i.e. the stride at that dimension is not + * equal to the size of the tensor defined by the outer dimensions. Let's call + * this outer (contiguous) tensor A. Note that if the Tensor is contiguous, then + * A is equal to the entire Tensor. Let's call the inner tensor B. + * + * 2. We loop through the indices in B, starting at its outermost dimension. For + * example, if B is a 2x2 matrix, then we do: + * + * B[0][0] + * B[0][1] + * B[1][0] + * B[1][1] + * + * We set the offset into the underlying storage as (storageOffset + stride_B * + * index_B), i.e. basically we compute the offset into the storage as we would + * normally for a Tensor. But because we are guaranteed the subsequent data is + * contiguous in memory, we can simply loop for sizeof(A) iterations and perform + * the operation, without having to follow the order described by the strides of + * A. + * + * 3. As an optimization, we merge dimensions of A that are contiguous in + * memory. 
For example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor, + * then the first two dimensions can be merged for the purposes of APPLY, + * reducing the number of nested loops. + */ + +inline Tensor sort_strides(Tensor& tensor_) { + IntArrayRef strides = tensor_.strides(); + std::vector indices; + indices.reserve(tensor_.ndimension()); + for (const auto i : c10::irange(tensor_.ndimension())) { + indices.push_back(i); + } + std::sort(indices.begin(), indices.end(), [&strides](int64_t i1, int64_t i2) { + return strides[i1] > strides[i2]; + }); + Tensor tensor = tensor_.permute(indices); + return tensor; +} + +template +struct strided_tensor_iter_fixed { + public: + T* data_ = NULL; + int64_t dim_ = 0; + + int64_t counter_[N] = {0}; + int64_t sizes_[N] = {0}; + int64_t strides_[N] = {0}; + + strided_tensor_iter_fixed(strided_tensor_iter_fixed const&) = delete; + void operator=(strided_tensor_iter_fixed const& x) = delete; + strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) = default; + strided_tensor_iter_fixed( + Tensor& tensor, + C10_UNUSED bool sort_strides = false) + : data_(tensor.data_ptr()) { + std::memset(counter_, 0, sizeof(int64_t) * N); + if (tensor.dim() > 0) { + std::memcpy( + sizes_, tensor.sizes().data(), tensor.dim() * sizeof(int64_t)); + std::memcpy( + strides_, tensor.strides().data(), tensor.dim() * sizeof(int64_t)); + } + dim_ = std::get<1>(collapse_dims(sizes_, strides_, tensor.ndimension())); + } +}; + +template +struct strided_tensor_iter { + private: + public: + T* data_ = NULL; + int64_t dim_; + + std::vector counter_; + std::vector sizes_; + std::vector strides_; + + strided_tensor_iter(strided_tensor_iter const&) = delete; + void operator=(strided_tensor_iter const& x) = delete; + strided_tensor_iter(strided_tensor_iter&&) = default; + strided_tensor_iter(Tensor& tensor) + : data_(tensor.data_ptr()), + dim_(tensor.ndimension()), + counter_(dim_, 0), + sizes_(tensor.sizes().vec()), + strides_(tensor.strides().vec()) { + dim_ = 
std::get<1>(collapse_dims(sizes_.data(), strides_.data(), dim_)); + } +}; + +inline bool _all_equal_numel(at::ArrayRef tensors) { + if (tensors.empty()) + return true; + int64_t all_numel = tensors[0].numel(); + for (const auto i : c10::irange(1, tensors.size())) { + if (tensors[i].numel() != all_numel) + return false; + } + return true; +} + +inline std::string _all_equal_numel_error(at::ArrayRef tensors) { + std::ostringstream oss; + oss << "inconsistent tensor size, expected "; + for (size_t i = 0; i < tensors.size() - 1; i++) { + oss << tensors[i].sizes() << ", "; + } + oss << "and " << tensors[tensors.size() - 1].sizes() + << " to have the same number of elements, but got "; + for (size_t i = 0; i < tensors.size() - 1; i++) { + oss << tensors[i].numel() << ", "; + } + oss << "and " << tensors[tensors.size() - 1].numel() + << " elements respectively"; + return oss.str(); +} + +inline bool _apply_preamble(ArrayRef tensors) { + checkDeviceType("CPU_tensor_apply", tensors, kCPU); + checkLayout("CPU_tensor_apply", tensors, kStrided); + if (!_all_equal_numel(tensors)) + AT_ERROR(_all_equal_numel_error(tensors)); + // An empty tensor has no elements + for (auto& t : tensors) + if (t.numel() == 0) + return false; + return true; +} + +inline int64_t _max_dim_tensors(ArrayRef tensors) { + int64_t dim = 0; + for (auto& t : tensors) + dim = std::max(dim, t.ndimension()); + return dim; +} + +inline void iterate(int64_t /*size*/){}; + +template +inline void iterate(int64_t size, Arg& iter, Args&... iter_tail) { + iter.counter_[iter.dim_ - 1] += size; + iter.data_ = iter.data_ + size * iter.strides_[iter.dim_ - 1]; + iterate(size, iter_tail...); +} + +inline bool iterate_continue() { + return true; +}; + +template +inline bool iterate_continue(Arg& iter, Args&... 
iter_tail) { + return iter.counter_[iter.dim_ - 1] < iter.sizes_[iter.dim_ - 1] && + iterate_continue(iter_tail...); +} + +inline int64_t max_iterate_size() { + return std::numeric_limits::max(); +}; + +template +inline int64_t max_iterate_size(Arg& iter, Args&... iter_tail) { + return std::min( + (iter.sizes_[iter.dim_ - 1] - iter.counter_[iter.dim_ - 1]), + max_iterate_size(iter_tail...)); +} + +inline void iterate_overflow(){}; + +template +inline void iterate_overflow(Arg& iter, Args&... iter_tail) { + if (iter.counter_[iter.dim_ - 1] == iter.sizes_[iter.dim_ - 1]) { + for (int64_t i = iter.dim_ - 1; i > 0; i--) { + if (iter.counter_[i] == iter.sizes_[i]) { + iter.counter_[i] = 0; + iter.counter_[i - 1]++; + iter.data_ = iter.data_ - (iter.sizes_[i] * iter.strides_[i]) + + iter.strides_[i - 1]; + } + } + } + iterate_overflow(iter_tail...); +} + +inline void forward(int64_t /*offset*/){}; + +template +inline void forward(int64_t offset, Arg& iter, Args&... iter_tail) { + int64_t multi = offset; + for (int64_t i = iter.dim_ - 1; i >= 0; i--) { + int64_t inc = multi % iter.sizes_[i]; + multi = multi / iter.sizes_[i]; + iter.data_ = iter.data_ + inc * iter.strides_[i]; + iter.counter_[i] += inc; + } + forward(offset, iter_tail...); +} + +inline int64_t max_dim() { + return 0; +} + +template +inline int64_t max_dim(Arg& iter, Args&... iter_tail) { + return std::max(iter.dim_, max_dim(iter_tail...)); +} + +inline void apply_op(){}; + +template +inline void apply_op( + int64_t numel, + int64_t offset, + const Op& op, + Args... iters) { + // For 0-dim tensors + if (numel == 1 && max_dim(iters...) == 0) { + op(*iters.data_...); + return; + } + if (offset > 0) + forward(offset, iters...); + // Splitting this into chunks helps the compiler create faster assembly + for (int64_t i = 0; i < numel;) { + for (; iterate_continue(iters...) 
&& i < numel;) { + op(*iters.data_...); + iterate(1, iters...); + i++; + } + iterate_overflow(iters...); + } +} + +/* + Apply a pointwise operator to sequence of tensors + + The calling convention for op is a function/functor that takes the same + number of pointers of type scalar as the number of given tensors. For example, + to compute a = b * c, op would be of the form: + [](scalar* a_val, const scalar* b_val, const scalar* c_val) { a_val[0] = + b_val[0] * c_val[0]; }; +*/ + +template +inline void CPU_tensor_apply2(Tensor tensor1, Tensor tensor2, const Op op) { + if (!_apply_preamble({tensor1, tensor2})) + return; + if (_max_dim_tensors({tensor1, tensor2}) <= 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2)); + } else { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2)); + } +} + +template +inline void CPU_tensor_apply3( + Tensor tensor1, + Tensor tensor2, + Tensor tensor3, + const Op op) { + if (!_apply_preamble({tensor1, tensor2, tensor3})) + return; + if (_max_dim_tensors({tensor1, tensor2, tensor3}) <= 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2), + strided_tensor_iter_fixed(tensor3)); + } else { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2), + strided_tensor_iter(tensor3)); + } +} + +template < + typename scalar1, + typename scalar2, + typename scalar3, + typename scalar4, + typename Op> +inline void CPU_tensor_apply4( + Tensor tensor1, + Tensor tensor2, + Tensor tensor3, + Tensor tensor4, + const Op op) { + if (!_apply_preamble({tensor1, tensor2, tensor3, tensor4})) + return; + if (_max_dim_tensors({tensor1, tensor2, tensor3, tensor4}) <= 8) { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter_fixed(tensor1), + strided_tensor_iter_fixed(tensor2), + strided_tensor_iter_fixed(tensor3), 
+ strided_tensor_iter_fixed(tensor4)); + } else { + apply_op( + tensor1.numel(), + 0, + op, + strided_tensor_iter(tensor1), + strided_tensor_iter(tensor2), + strided_tensor_iter(tensor3), + strided_tensor_iter(tensor4)); + } +} + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUFunctions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..17c4ddd92f1d469abb771ed0392eed0df0508b1a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUFunctions.h @@ -0,0 +1,29 @@ +#include + +// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch] +// Code introduced to avoid cyclic dependency in static dispatch is no longer +// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place, +// to Operators.cpp for supporting multiple backends with multiple kernels. +// +// Note [Avoiding Include Cycles In Static Dispatch] +// In order to avoid #include cycles in the static dispatch build, we've carefully split out +// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h. +// +// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h. +// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods +// all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all +// directly inlined into TensorBody.h. +// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API, +// which include functions that have defaultable optional arguments. +// That requires knowing the full Tensor class definition. 
+// +// We break the cycle by doing the following: +// - Split out CPUFunction.h into two files: CPUFunctions.h and CPUFunctions_inl.h +// - CPUFunction.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl., +// - CPUFunctions_inl.h includes everything else +// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class, +// and then it includes CPUFunctions_inl.h. +// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly. +// - This also means that static dispatch build, CPUFunctions.h only needs to +// #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h. +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeExplicitAutogradFunctions_inl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeExplicitAutogradFunctions_inl.h new file mode 100644 index 0000000000000000000000000000000000000000..e7d79fc715d6209920c6f3d4a2d02c7d8077b6bd --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeExplicitAutogradFunctions_inl.h @@ -0,0 +1,542 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS) +#error This change adds a dependency on all pytorch operators, meaning the \ + file will need to be re-compiled every time an operator is changed or added. \ + Consider including a specific operator from \ + . \ + See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS]. 
+#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Context.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Context.h new file mode 100644 index 0000000000000000000000000000000000000000..931cd86e77d984cc3b69aca0516b7c3489320825 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Context.h @@ -0,0 +1,560 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { + +class Tensor; + +enum class TORCH_API 
Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM }; + +class TORCH_API Context { + public: + Context(); + + const Generator& defaultGenerator(Device device) { + c10::DeviceType device_type = device.type(); + initCUDAIfNeeded(device_type); + initHIPIfNeeded(device_type); + if (device_type == at::kCPU) { + return at::detail::getDefaultCPUGenerator(); + } else if (device_type == at::kCUDA) { + return at::detail::getCUDAHooks().getDefaultCUDAGenerator(device.index()); + } else if (device_type == at::kMPS) { + return at::detail::getMPSHooks().getDefaultMPSGenerator(); + } else if (device_type == at::kXPU) { + return at::detail::getXPUHooks().getDefaultXPUGenerator(device.index()); + } else if (device_type == at::kIPU) { + return at::detail::getIPUHooks().getDefaultIPUGenerator(device.index()); + } else if (device_type == at::kPrivateUse1) { + return at::GetPrivateUse1HooksInterface()->getDefaultGenerator( + device.index()); + } else { + AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled."); + } + } + const AcceleratorHooksInterface& getAcceleratorHooksInterface( + c10::optional opt_device_type = c10::nullopt) { + c10::DeviceType device_type = opt_device_type.has_value() + ? 
opt_device_type.value() + : at::getAccelerator(true).value(); + if (device_type == at::kCUDA) { + return at::detail::getCUDAHooks(); + } else if (device_type == at::kMPS) { + return at::detail::getMPSHooks(); + } else if (device_type == at::kPrivateUse1) { + return at::detail::getPrivateUse1Hooks(); + } else { + AT_ERROR( + c10::DeviceTypeName(device_type), " device type not an accelerator."); + } + } + Device getDeviceFromPtr(void* data, c10::DeviceType device_type) { + initCUDAIfNeeded(device_type); + initHIPIfNeeded(device_type); + initXPUIfNeeded(device_type); + if (device_type == at::kCPU) { + return c10::DeviceType::CPU; + } else if (device_type == at::kCUDA) { + return at::detail::getCUDAHooks().getDeviceFromPtr(data); + } else if (device_type == at::kXPU) { + return at::detail::getXPUHooks().getDeviceFromPtr(data); + } else if (device_type == at::kPrivateUse1) { + return at::GetPrivateUse1HooksInterface()->getDeviceFromPtr(data); + } else { + AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled."); + } + } + static bool isPinnedPtr(const void* data) { + return detail::getCUDAHooks().isPinnedPtr(data); + } + static bool hasOpenMP(); + static bool hasMKL(); + static bool hasLAPACK(); + static bool hasMKLDNN(); + static bool hasMAGMA() { + return detail::getCUDAHooks().hasMAGMA(); + } + static bool hasCUDA() { + return detail::getCUDAHooks().hasCUDA(); + } + static bool hasMTIA() { + return detail::getMTIAHooks().hasMTIA(); + } + static bool hasCUDART() { + return detail::getCUDAHooks().hasCUDART(); + } + static long versionCUDART() { + return detail::getCUDAHooks().versionCUDART(); + } + static bool hasCuDNN() { + return detail::getCUDAHooks().hasCuDNN(); + } + static long versionCuDNN() { + return detail::getCUDAHooks().versionCuDNN(); + } + static bool hasCuSOLVER() { + return detail::getCUDAHooks().hasCuSOLVER(); + } + static bool hasHIP() { + return detail::getHIPHooks().hasHIP(); + } + static bool hasMPS() { + return 
detail::getMPSHooks().hasMPS(); + } + static bool hasIPU() { + return c10::impl::hasDeviceGuardImpl(c10::DeviceType::IPU); + } + static bool hasXLA() { + return c10::impl::hasDeviceGuardImpl(c10::DeviceType::XLA); + } + static bool hasXPU() { + return detail::getXPUHooks().hasXPU(); + } + static bool hasLazy() { + return c10::impl::hasDeviceGuardImpl(c10::DeviceType::Lazy); + } + static bool hasORT() { + return c10::impl::hasDeviceGuardImpl(c10::DeviceType::ORT); + } + // defined in header so that getNonVariableType has ability to inline + // call_once check. getNonVariableType is called fairly frequently + void lazyInitCUDA() { + c10::call_once(thc_init, [&] { detail::getCUDAHooks().initCUDA(); }); + } + void lazyInitHIP() { + c10::call_once(thh_init, [&] { detail::getHIPHooks().initHIP(); }); + } + void lazyInitXPU() { + c10::call_once(thx_init, [&] { detail::getXPUHooks().initXPU(); }); + } + void lazyInitPrivateUse1() { + c10::call_once(thp_init, [&] { + if (isPrivateUse1HooksRegistered()) { + at::GetPrivateUse1HooksInterface()->initPrivateUse1(); + } + }); + } + static const at::cuda::NVRTC& getNVRTC() { + return detail::getCUDAHooks().nvrtc(); + } + + static bool setFlushDenormal(bool on); + + // NB: This method is *purely* whether or not a user requested + // that CuDNN was enabled, it doesn't actually say anything about + // whether or not CuDNN is actually usable. 
Use cudnn_is_acceptable + // to test this instead + bool userEnabledCuDNN() const; + void setUserEnabledCuDNN(bool e); + bool userEnabledMkldnn() const; + void setUserEnabledMkldnn(bool e); + bool benchmarkCuDNN() const; + void setBenchmarkCuDNN(bool); + int benchmarkLimitCuDNN() const; + void setBenchmarkLimitCuDNN(int); + bool deterministicCuDNN() const; + void setDeterministicCuDNN(bool); + bool userEnabledNNPACK() const; + void setUserEnabledNNPACK(bool e); + + // Note [Disabling Fused SDP Kernels] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Flash and Memory Efficient SDP kernels are enabled by default. + // However, they can be disabled by setting + // at::globalContext().setUserEnabledFlashSDP(false) flag. + // This is useful for debugging purposes. For example, if you want to + // compare the performance of the flash SDP kernels with the unfused + // kernel, you can disable the flash SDP kernels. By disabling + // the math SDP kernel, you can force your code to use flash kernels. + // The math SDP kernel can be disabled by setting + // at::globalContext().setUserEnabledMathSDP(false) flag. 
+ void setSDPUseFlash(bool); + bool userEnabledFlashSDP() const; + + void setSDPUseMemEfficient(bool); + bool userEnabledMemEfficientSDP() const; + + void setSDPUseMath(bool); + bool userEnabledMathSDP() const; + + void setSDPUseCuDNN(bool); + bool userEnabledCuDNNSDP() const; + + at::LinalgBackend linalgPreferredBackend() const; + void setLinalgPreferredBackend(at::LinalgBackend); + + // Note [Enabling Deterministic Operations] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Operations in PyTorch that normally act nondeterministically, but have an + // alternate deterministic implementation, should satisfy the following + // requirements: + // + // * Include this comment: "See Note [Enabling Deterministic Operations]" + // + // * Check the value of `at::globalContext().deterministicAlgorithms()` to + // toggle + // between nondeterministic and deterministic implementations. + // + // * Have an entry in the list of PyTorch operations that toggle between + // nondeterministic + // and deterministic implementations, in the docstring of + // `use_deterministic_algorithms()` in torch/__init__.py + // + // `example_func()` below shows an example of toggling between + // nondeterministic and deterministic implementations: + // + // void example_func() { + // // See Note [Enabling Deterministic Operations] + // if (at::globalContext().deterministicAlgorithms()) { + // example_func_deterministic(); + // } else { + // example_func_nondeterministic(); + // } + // } + + bool deterministicAlgorithms() const; + bool deterministicAlgorithmsWarnOnly() const; + void setDeterministicAlgorithms(bool, bool); + bool deterministicFillUninitializedMemory() const; + void setDeterministicFillUninitializedMemory(bool); + + // Note [Writing Nondeterministic Operations] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Operations in PyTorch that act nondeterministically and do not have an + // alternate deterministic implementation should satisfy the following + // requirements: + // + 
// * Include this comment: "See Note [Writing Nondeterministic Operations]" + // + // * Include a comment explaining why the operation is nondeterministic. + // + // * Throw an error when `Context::deterministicAlgorithms()` is true. Most + // of the time, this should be accomplished by calling + // `at::globalContext().alertNotDeterminstic()`. However, if the + // nondeterministic behavior is caused by the CuBLAS workspace + // configuration in CUDA >= 10.2, + // `at::globalContext().alertCuBLASConfigNotDeterministic()` should be + // called instead (in this case, a comment explaining why the operation is + // nondeterministic is not necessary). See below for details on these + // methods. + // + // * Have an entry in the list of nondeterministic PyTorch operations in the + // docstring of `use_deterministic_algorithms()` in torch/__init__.py + // + // * Have a test function in `test/test_torch.py` whose name begins with + // `test_nondeterministic_alert_`. Alternatively, if CuBLAS workspace + // configuration is the reason for nondeterminism, the operation should be + // included in the `test_cublas_config_nondeterministic_alert` test. Any new + // tests should ideally follow a pattern similar to the existing ones. + // + // `example_func()` below shows an example of the comments and error-throwing + // code for a nondeterministic operation: + // + // void example_func() { + // // See Note [Writing Nondeterministic Operations] + // // Nondeterministic because + // at::globalContext().alertNondeterministic("example_func"); + // ... + // } + + // Throws an error if `Context::deterministicAlgorithms()` is true + static void alertNotDeterministic(c10::string_view const& caller); + + // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA + // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or + // ":4096:8". 
For more details: + // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility + void alertCuBLASConfigNotDeterministic() const; + + void setFloat32MatmulPrecision(const std::string& s); + bool allowTF32CuDNN() const; + void setAllowTF32CuDNN(bool); + bool allowTF32CuBLAS() const; + void setAllowTF32CuBLAS(bool); + Float32MatmulPrecision float32MatmulPrecision() const; + void setFloat32MatmulPrecision(Float32MatmulPrecision p); + bool allowFP16ReductionCuBLAS() const; + void setAllowFP16ReductionCuBLAS(bool); + bool allowBF16ReductionCuBLAS() const; + void setAllowBF16ReductionCuBLAS(bool); + at::QEngine qEngine() const; + void setQEngine(at::QEngine e); + static const std::vector& supportedQEngines(); + static bool isXNNPACKAvailable(); + void setCheckSparseTensorInvariants(bool e); + bool checkSparseTensorInvariants() const; + // This method is used to release the original weight after pre-packing. + // It should be called once before loading/running the model. + // NB: By default it is set to true for mobile builds. 
+ void setReleaseWeightsWhenPrepacking(bool e); + bool releaseWeightsWhenPrepacking() const; + + void setDisplayVmapFallbackWarnings(bool enabled); + bool areVmapFallbackWarningsEnabled() const; + + void setDefaultMobileCPUAllocator(); + void unsetDefaultMobileCPUAllocator(); + bool allowFP16ReductionCPU() const; + void setAllowFP16ReductionCPU(bool); + + private: + void initCUDAIfNeeded(c10::DeviceType p) { + if (p == c10::DeviceType::CUDA) { + lazyInitCUDA(); + } + } + void initHIPIfNeeded(c10::DeviceType p) { + if (p == c10::DeviceType::HIP) { + lazyInitHIP(); + } + } + void initXPUIfNeeded(c10::DeviceType p) { + if (p == c10::DeviceType::XPU) { + lazyInitXPU(); + } + } + static bool checkCuBLASConfigDeterministic(); + c10::once_flag thc_init; + c10::once_flag thh_init; + c10::once_flag thx_init; + c10::once_flag thp_init; + bool enabled_cudnn = true; + bool deterministic_cudnn = false; + bool _deterministic_algorithms = false; + bool _deterministic_algorithms_warn_only = false; + bool _deterministic_fill_uninitialized_memory = true; + bool enabled_flashSDP = true; + bool enabled_mem_efficientSDP = true; + bool enabled_mathSDP = true; + bool enabled_cudnnSDP = false; +#ifdef USE_ROCM + bool benchmark_cudnn = true; +#else + bool benchmark_cudnn = false; +#endif + Float32MatmulPrecision float32_matmul_precision = + c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true + ? at::Float32MatmulPrecision::HIGH + : at::Float32MatmulPrecision::HIGHEST; + int benchmark_limit_cudnn = 10; + bool allow_tf32_cudnn = true; + bool allow_fp16_reduction_cublas = true; + bool allow_bf16_reduction_cublas = true; + bool enabled_mkldnn = true; + bool enabled_nnpack = true; + at::LinalgBackend linalg_preferred_backend = + c10::utils::check_env("TORCH_LINALG_PREFER_CUSOLVER") == true + ? 
at::LinalgBackend::Cusolver + : at::LinalgBackend::Default; +#ifdef C10_MOBILE + bool release_original_weights = true; +#else + bool release_original_weights = false; +#endif + bool display_vmap_fallback_warnings_ = false; + c10::optional quantized_engine = c10::nullopt; + bool enable_sparse_tensor_invariant_checks = false; + bool allow_fp16_reduction_cpu = false; + + Allocator* prev_allocator_ptr_{nullptr}; +}; + +TORCH_API Context& globalContext(); + +static inline void init() { + globalContext(); +} + +TORCH_API Allocator* getCPUAllocator(); + +static inline DeprecatedTypeProperties& getDeprecatedTypeProperties( + Backend p, + ScalarType s) { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + p, s); +} + +static inline DeprecatedTypeProperties& CPU(ScalarType s) { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + Backend::CPU, s); +} + +static inline DeprecatedTypeProperties& CUDA(ScalarType s) { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + Backend::CUDA, s); +} + +static inline DeprecatedTypeProperties& HIP(ScalarType s) { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + Backend::HIP, s); +} + +static inline DeprecatedTypeProperties& MPS(ScalarType s) { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + Backend::MPS, s); +} + +static inline bool hasCUDA() { + return globalContext().hasCUDA(); +} + +static inline bool hasMTIA() { + return globalContext().hasMTIA(); +} + +static inline bool hasHIP() { + return globalContext().hasHIP(); +} + +static inline bool hasIPU() { + return globalContext().hasIPU(); +} + +static inline bool hasXLA() { + return globalContext().hasXLA(); +} + +static inline bool hasMPS() { + return globalContext().hasMPS(); +} + +static inline bool hasORT() { + return globalContext().hasORT(); +} + +static inline bool hasXPU() { + return globalContext().hasXPU(); +} + +// Despite its 
name, this function returns the number of *CUDA* GPUs. +static inline size_t getNumGPUs() { + // WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS + // FUNCTION. If you are interested in interrogating the number of + // devices for a specific device type, add that function to the + // relevant library (e.g., similar to at::cuda::device_count()) + if (hasCUDA() && hasHIP()) { + throw std::runtime_error( + "Enabling both CUDA and HIP in ATen is not supported, as HIP masquerades " + "to be CUDA (e.g., when you say CUDA, on a HIP build of ATen, this actually " + "means HIP. Rebuild PyTorch with one or the other disabled."); + } else if (hasCUDA()) { + return detail::getCUDAHooks().getNumGPUs(); + } else if (hasHIP()) { + return detail::getHIPHooks().getNumGPUs(); + } else { + return 0; + } +} + +static inline bool hasOpenMP() { + return globalContext().hasOpenMP(); +} + +static inline bool hasMKL() { + return globalContext().hasMKL(); +} + +static inline bool hasLAPACK() { + return globalContext().hasLAPACK(); +} + +static inline bool hasMAGMA() { + return globalContext().hasMAGMA(); +} + +static inline bool hasMKLDNN() { + return globalContext().hasMKLDNN(); +} + +static inline void manual_seed(uint64_t seed) { + auto gen = globalContext().defaultGenerator(c10::DeviceType::CPU); + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen.mutex()); + gen.set_current_seed(seed); + } + // NB: Sometimes we build with CUDA, but we don't have any GPUs + // available. In that case, we must not seed CUDA; it will fail! 
+ const auto cuda_num_gpus = detail::getCUDAHooks().getNumGPUs(); + if (hasCUDA() && cuda_num_gpus > 0) { + for (const auto i : c10::irange(cuda_num_gpus)) { + auto cuda_gen = globalContext().defaultGenerator( + Device(at::kCUDA, static_cast(i))); + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(cuda_gen.mutex()); + cuda_gen.set_current_seed(seed); + } + } + } + + const auto xpu_num_gpus = detail::getXPUHooks().getNumGPUs(); + if (hasXPU() && xpu_num_gpus) { + for (const auto i : c10::irange(xpu_num_gpus)) { + auto xpu_gen = globalContext().defaultGenerator( + Device(at::kXPU, static_cast(i))); + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(xpu_gen.mutex()); + xpu_gen.set_current_seed(seed); + } + } + } + + if (hasMPS()) { + auto mps_gen = globalContext().defaultGenerator(c10::DeviceType::MPS); + // See Note [Acquire lock when using random generators] + std::lock_guard lock(mps_gen.mutex()); + mps_gen.set_current_seed(seed); + } +} + +// When the global flag `allow_tf32` is set to true, cuBLAS handles are +// automatically configured to use math mode CUBLAS_TF32_TENSOR_OP_MATH. +// For some operators, such as addmv, TF32 offers no performance improvement +// but causes precision loss. To help this case, this class implements +// a RAII guard that can be used to quickly disable TF32 within its scope. 
+// +// Usage: +// NoTF32Guard disable_tf32; +struct TORCH_API NoTF32Guard { + NoTF32Guard(); + ~NoTF32Guard(); + static bool should_disable_tf32(); + + private: + bool changed = false; +}; + +struct TORCH_API ROCmBackwardPassGuard { + ROCmBackwardPassGuard(); + ~ROCmBackwardPassGuard(); + static bool is_backward_pass(); +}; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/DeviceAccelerator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/DeviceAccelerator.h new file mode 100644 index 0000000000000000000000000000000000000000..c3e800c7e07c65c4289baa46ba29d9b61cc5dd20 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/DeviceAccelerator.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +#include +#include + +// This file defines the top level Accelerator concept for PyTorch. +// A device is an accelerator per the definition here if: +// - It is mutually exclusive with all other accelerators +// - It performs asynchronous compute via a Stream/Event system +// - It provides a set of common APIs as defined by AcceleratorHooksInterface +// +// As of today, accelerator devices are (in no particular order): +// CUDA, MTIA, PrivateUse1 +// We want to add once all the proper APIs are supported and tested: +// HIP, MPS, XPU + +namespace at { + +// Ensures that only one accelerator is available (at +// compile time if possible) and return it. +// When checked is true, the returned optional always has a value. 
+TORCH_API std::optional getAccelerator(bool checked = false); + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/InferSize.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/InferSize.h new file mode 100644 index 0000000000000000000000000000000000000000..111c7eb8f5fc7cd20a3eb812450324788608c011 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/InferSize.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace at { + +// Infers the size of a dim with size -1, if it exists. Also checks that new +// shape is compatible with the number of elements. +// +// templated to handle std::vector and DimVector use cases, see +// below +// +template +inline void infer_size_impl( + InputArrayRef shape, + NumelType numel, + ResultVec& res) { + NumelType newsize = 1; + // N.B. this is an index, not a sym dim! + auto infer_dim = c10::optional(); + for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { + if (shape[dim] == -1) { + if (infer_dim) { + throw std::runtime_error("only one dimension can be inferred"); + } + infer_dim = dim; + } else if (shape[dim] >= 0) { + newsize *= shape[dim]; + } else { + AT_ERROR("invalid shape dimension ", shape[dim]); + } + } + + if (numel == newsize || (infer_dim && newsize > 0 && numel % newsize == 0)) { + if (infer_dim) { + // We have a degree of freedom here to select the dimension size; follow + // NumPy semantics and just bail. However, a nice error message is needed + // because users often use `view` as a way to flatten & unflatten + // dimensions and will otherwise be confused why + // empty_tensor.view( 0, 0) + // works yet + // empty_tensor.view(-1, 0) + // doesn't. 
+ TORCH_CHECK( + newsize != 0, + "cannot reshape tensor of 0 elements into shape ", + shape, + " because the unspecified dimension size -1 can be any " + "value and is ambiguous"); + res[*infer_dim] = numel / newsize; + } + return; + } + + std::ostringstream ss; + ss << "shape '" << shape << "' is invalid for input of size " << numel; + throw std::runtime_error(ss.str()); +} + +inline std::vector infer_size(IntArrayRef shape, int64_t numel) { + auto res = shape.vec(); + infer_size_impl(shape, numel, res); + return res; +} + +inline at::DimVector infer_size_dv(IntArrayRef shape, int64_t numel) { + auto res = at::DimVector(shape); + infer_size_impl(shape, numel, res); + return res; +} + +inline at::SymDimVector infer_size_dv( + c10::SymIntArrayRef shape, + c10::SymInt numel) { + auto res = at::SymDimVector(shape); + infer_size_impl( + shape, std::move(numel), res); + return res; +} + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelFuture.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelFuture.h new file mode 100644 index 0000000000000000000000000000000000000000..042cd92da19345d7523671ca75da7279d13062a9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelFuture.h @@ -0,0 +1,13 @@ +#pragma once + +#include +#include +#include + +namespace at { + +// Launches intra-op parallel task, returns a future +TORCH_API c10::intrusive_ptr intraop_launch_future( + std::function func); + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Scalar.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Scalar.h new file mode 100644 index 0000000000000000000000000000000000000000..e12557428f15674e4382983c07de64c3e43e8af0 --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Scalar.h @@ -0,0 +1,3 @@ +#pragma once + +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ScalarType.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ScalarType.h new file mode 100644 index 0000000000000000000000000000000000000000..2181250740e23808f06e63660f50ca887169bcb1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ScalarType.h @@ -0,0 +1,4 @@ +#pragma once +#include // for BC reasons +#include +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SparseCsrTensorImpl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SparseCsrTensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..515ddc6e7e18d9e11b391ec10ddcb47f6c9838d8 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SparseCsrTensorImpl.h @@ -0,0 +1,186 @@ +#pragma once + +#include +#include +#include +namespace at { + +// Struct implementing a sparse CSR tensor. It uses three 1-D tensors for +// denoting the data: `crow_indices_`, `col_indices_` and `values_`. +// The `crow_indices_` tensor is a integer tensor of shape `(size(0) + 1)` +// that represents the compressed row indices of the CSR tensor. The +// `col_indices_` tensor is an integer tensor of shape `(nnz())` +// that explicitly stores the column indices of each value of the sparse +// tensor. The `values_` tensor can be of any pytorch-supported data type +// and has shape `(nnz())`. +// +// Since the main advantage of the CSR format over the COO format is speed of +// computation, care must be taken to facilitate smooth interfacing of +// these data structures with optimized libraries such as MKL and MAGMA. 
+// Since the MKL interface for pytorch currently uses indexing with int32 +// type, it is important to make sure that the `crow_indices` and `col_indices` +// are of type int32 when calling MKL routines such as SPMM or SPMV. +// +// If not calling MKL, it should be alright to use 64 bit integer tensors +// for indexing. +struct TORCH_API SparseCsrTensorImpl : public TensorImpl { + Tensor crow_indices_; + Tensor col_indices_; + Tensor values_; + Layout layout_; + + public: + explicit SparseCsrTensorImpl( + at::DispatchKeySet, + at::Device device, + Layout layout, + const caffe2::TypeMeta); + + void resize_(int64_t nnz, IntArrayRef size); + void resize_and_clear_( + int64_t sparse_dim, + int64_t dense_dim, + IntArrayRef size); + void resize_as_sparse_compressed_tensor_(const Tensor& src); + void set_member_tensors( + const Tensor& crow_indices, + const Tensor& col_indices, + const Tensor& values, + c10::SymIntArrayRef size); + void set_member_tensors( + const Tensor& crow_indices, + const Tensor& col_indices, + const Tensor& values, + IntArrayRef size); + const Tensor& compressed_indices() const { + return crow_indices_; + } + const Tensor& plain_indices() const { + return col_indices_; + } + const Tensor& values() const { + return values_; + } + int64_t nnz() { + return col_indices_.size(-1); + } + + inline int64_t batch_dim() const noexcept { + return crow_indices_.dim() - 1; + } + + inline int64_t sparse_dim() const noexcept { + return 2; + } + + inline int64_t dense_dim() const noexcept { + return values_.dim() - batch_dim() - block_dim() - 1; + } + + private: + inline int64_t block_dim() const noexcept { + return (layout_ == kSparseBsr || layout_ == kSparseBsc ? 
2 : 0); + } + + protected: + IntArrayRef strides_custom() const override; + SymIntArrayRef sym_strides_custom() const override; + bool is_contiguous_custom(MemoryFormat) const override; + + public: + void set_size(int64_t dim, int64_t new_size) override; + void set_stride(int64_t dim, int64_t new_stride) override; + void set_storage_offset(int64_t storage_offset) override; + Layout layout_impl() const override { + return layout_; + } + void set_layout(Layout layout) { + switch (layout) { + case kSparseCsr: + case kSparseCsc: + case kSparseBsr: + case kSparseBsc: + layout_ = layout; + break; + default: + TORCH_CHECK(false, "unsupported layout ", layout); + } + } + + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive( + key_set(), device(), layout_impl(), dtype()); + copy_tensor_metadata( + /*src_sparse_impl=*/this, + /*dest_sparse_impl=*/impl.get(), + /*version_counter=*/version_counter, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. 
+ */ + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive( + key_set(), device(), layout_impl(), dtype()); + copy_tensor_metadata( + /*src_sparse_impl=*/this, + /*dest_sparse_impl=*/impl.get(), + /*version_counter=*/std::move(version_counter), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + + private: + explicit SparseCsrTensorImpl( + at::DispatchKeySet key_set, + const caffe2::TypeMeta data_type, + at::Tensor crow_indices, + at::Tensor col_indices, + at::Tensor values, + at::Layout layout); + + const char* tensorimpl_type_name() const override; + + /** + * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / + * storage_offset) from one TensorImpl to another TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE + * [ TensorImpl Shallow-Copying ]. 
+ */ + static void copy_tensor_metadata( + const SparseCsrTensorImpl* src_sparse_impl, + SparseCsrTensorImpl* dest_sparse_impl, + c10::VariableVersion version_counter, + bool allow_tensor_metadata_change) { + TensorImpl::copy_tensor_metadata( + src_sparse_impl, + dest_sparse_impl, + std::move(version_counter), + allow_tensor_metadata_change); + + // Sparse-specific fields + dest_sparse_impl->crow_indices_ = src_sparse_impl->compressed_indices(); + dest_sparse_impl->col_indices_ = src_sparse_impl->plain_indices(); + dest_sparse_impl->values_ = src_sparse_impl->values(); + dest_sparse_impl->layout_ = src_sparse_impl->layout_impl(); + } +}; +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Tensor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..0b3719cca3bf1ff7154625c510c8292dd47444a7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Tensor.h @@ -0,0 +1,3 @@ +#pragma once + +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIterator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIterator.h new file mode 100644 index 0000000000000000000000000000000000000000..4a1a2bce60808a67629bcd53492413017cd1d429 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIterator.h @@ -0,0 +1,1002 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { +class Tensor; +class OptionalTensorRef; +using NameVector = SmallVector; +} // namespace at + +// TensorIterator is a helper class for element-wise operations, such as +// arithmetic, comparisons, and trigonometric functions. 
It handles +// broadcasting and type conversions of operands. +// +// This is inspired by NumPy's Array Iterator API (NpyIter). +// +// The files Loops.h and Loops.cuh provide functions to build kernels that +// use TensorIterator. +// +// Example: +// +// auto iter = TensorIteratorConfig() +// .add_output(output) +// .add_input(input) +// .build() +// +// [MyKernel.cpp / MyKernel.cu] +// cpu_kernel(iter, [](float a, float b) { +// return a + b; +// }); +// +// gpu_kernel(iter, []GPU_LAMBDA(float a, float b) -> float { +// return a + b; +// }); +// +// Note [Order of Construction] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// When setting up the tensor iterator configuration, the output Tensors +// have to be added first via +// TensorIteratorConfig::add_owned_output(at::Tensor). After adding all outputs, +// the inputs can be added via +// TensorIteratorConfig::add_owned_input(at::Tensor). +// Adding another output after inputs have been added will rise an exception. +// +// Note [Common Dtype Computation] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Some operations have a natural notion of a "common dtype" or +// "computation dtype" where all inputs are cast to one dtype, the +// operation is performed, and then the results are cast to all outputs. +// +// TensorIterator infers a common dtype if all inputs have the same dtype, +// and it computes one using type promotion rules on its inputs if +// promote_inputs_to_common_dtype_ is true. Attempting to query +// a common dtype otherwise will throw an exception. +// +// Note that the outputs are not considered when computing a common dtype. + +namespace at { + +namespace internal { +// This parameter is heuristically chosen to determine the minimum number of +// work that warrants parallelism. For example, when summing an array, it is +// deemed inefficient to parallelise over arrays shorter than 32768. Further, +// no parallel algorithm (such as parallel_reduce) should split work into +// smaller than GRAIN_SIZE chunks. 
+constexpr int64_t GRAIN_SIZE = 32768; + +// Storage for a non-owning Tensor, without needing to include Tensor.h +class TORCH_API OpaqueOptionalTensorRef { + alignas(alignof(TensorBase)) std::array data_{}; + + public: + OpaqueOptionalTensorRef(); + OpaqueOptionalTensorRef(const OpaqueOptionalTensorRef&) = default; + OpaqueOptionalTensorRef& operator=(const OpaqueOptionalTensorRef&) = default; + OpaqueOptionalTensorRef(OpaqueOptionalTensorRef&&) noexcept = default; + OpaqueOptionalTensorRef& operator=(OpaqueOptionalTensorRef&&) noexcept = + default; + ~OpaqueOptionalTensorRef(); + + OptionalTensorRef* get() { + return reinterpret_cast(data_.data()); + } + const OptionalTensorRef* get() const { + return reinterpret_cast(data_.data()); + } + + OptionalTensorRef& operator*() { + return *get(); + } + const OptionalTensorRef& operator*() const { + return *get(); + } + OptionalTensorRef* operator->() { + return get(); + } + const OptionalTensorRef* operator->() const { + return get(); + } + + const Tensor& getTensor() const; +}; +} // namespace internal + +struct TORCH_API OperandInfo { + using StrideVector = SmallVector; + OperandInfo() = default; + C10_ALWAYS_INLINE explicit OperandInfo(c10::MaybeOwned&& t) { + if (t->defined()) { + device = t->device(); + target_dtype = t->scalar_type(); + current_dtype = target_dtype; + } + tensor(std::move(t)); + validate(); + } + + C10_ALWAYS_INLINE OperandInfo(const OperandInfo&) = default; + C10_ALWAYS_INLINE OperandInfo& operator=(const OperandInfo&) = default; + C10_ALWAYS_INLINE OperandInfo(OperandInfo&&) noexcept = default; + C10_ALWAYS_INLINE OperandInfo& operator=(OperandInfo&&) noexcept = default; + C10_ALWAYS_INLINE ~OperandInfo() = default; + + /// The data pointer. This may be different from tensor->data_ptr() if the + /// iterator is split. + void* data = nullptr; + + /// Stride after broadcasting. The stride is in bytes, not number of elements. 
+ StrideVector stride_bytes; + + /// The desired device and type for the operand. For inputs, this specifies + /// that the input should be converted to this type if necessary. For outputs, + /// this specifies which type to allocate. target_dtype and device are + /// initialized with the dtype and device of the tensor but during type + /// promotion target_dtype value can become different from tensor's dtype + /// also, during type promotion target_dtype and device can be set for an + /// undefined tensor so that tensor can be properly constructed later. + c10::optional device = c10::nullopt; + ScalarType target_dtype = ScalarType::Undefined; + // Caches dtype of the tensor, because scalar_type is an expensive operation + // If dtype of the tensor is changed (e.g. as a result of type promotion or in + // allocate_outputs), this + // value should be changed too. + ScalarType current_dtype = ScalarType::Undefined; + + bool is_device_defined() const { + return device.has_value(); + } + bool is_type_defined() const { + return target_dtype != ScalarType::Undefined; + } + TensorOptions options() const { + return TensorOptions(target_dtype).device(device); + } + + bool is_output = false; + + bool will_resize = false; + + bool is_read_write = false; + + bool is_const = false; + + void validate() { + TORCH_CHECK( + !tensor_base_->defined() || tensor_base_->layout() == kStrided, + "unsupported tensor layout: ", + tensor_base_->layout()); + } + + /// The tensor operand. Note that the strides, data pointer, and + /// other attributes may differ due to dimension reordering and + /// coalescing. + const Tensor& tensor() const { + return tensor_storage_.getTensor(); + } + const TensorBase& tensor_base() const { + return *tensor_base_; + } + void tensor(c10::MaybeOwned&& tensor); + + // Save the original tensor operand in cases when an output is modified + // (e.g. 
if dtype is changed) + const Tensor& original_tensor() const { + return original_tensor_storage_.getTensor(); + } + const TensorBase& original_tensor_base() const { + return *original_tensor_base_; + } + + // Set tensor to a new value, and store the old tensor value in + // original_tensor Should only ever be called once for the lifetime of an + // operand + void exchange_tensor(c10::MaybeOwned&& new_tensor); + + // Move original_tensor back into tensor, exchange_tensor must have been + // called before + void restore_original_tensor(); + + private: + c10::MaybeOwned tensor_base_; + c10::MaybeOwned original_tensor_base_ = + c10::MaybeOwned::owned(std::in_place); + + // We store TensorBase visibly in the header to allow inline access. + // However, we sometimes need a genuine `const Tensor &` for the + // TensorIterator API. So, we also store a non-owning `Tensor` + // object in these `_storage_` variables. + internal::OpaqueOptionalTensorRef tensor_storage_; + internal::OpaqueOptionalTensorRef original_tensor_storage_; +}; + +struct SplitUntil32Bit; + +enum class FastSetupType : uint8_t { + NONE, + CONTIGUOUS, + CHANNELS_LAST, + NON_OVERLAPPING_DENSE +}; + +class TensorIteratorConfig; +struct TensorIterator; + +struct TORCH_API TensorIteratorBase : public impl::MetaBase { + using DimMask = std::bitset<64>; + using PtrVector = SmallVector; + using StrideVector = SmallVector; + + TensorIteratorBase(); + void build(TensorIteratorConfig&); + + // The inner-loop function operates on the fastest moving dimension. It + // implements element-wise operations in terms of 1-d strided tensors. + // + // Arguments: + // data: data pointers for each operand (length `ntensors`) + // strides: stride for each operand (length `ntensors`) + // size: size of inner loop + // + // The `size` often matches shape[0], but may be smaller due to + // parallelization of the inner loop. 
+ using loop2d_t = c10::function_ref< + void(char** data, const int64_t* strides, int64_t size0, int64_t size1)>; + + using loop_subiter_t = c10::function_ref; + + void foreach_reduced_elt(loop_subiter_t loop, bool parallelize = true); + + int ndim() const { + return static_cast(shape_.size()); + } + IntArrayRef shape() const { + return shape_; + } + int64_t numel() const; + int ntensors() const { + return static_cast(operands_.size()); + } + int noutputs() const { + return num_outputs_; + } + int ninputs() const { + return ntensors() - noutputs(); + } + IntArrayRef view_offsets() const { + return view_offsets_; + } + + /// number of elements in the output operand. this is the same as numel() for + /// operations that are not reductions. + int64_t num_output_elements() const; + + /// number of reduced dimensions in a reduction operation + int num_reduce_dims() const; + + /// 1-dimensional iteration and no buffering or type conversion + bool is_trivial_1d() const; + /// Reducible to 1-dimensional and all operands are contiguous + bool is_contiguous() const; + bool is_dim_reduced(int dim) const; + + /// Accessors for each operand + IntArrayRef strides(int64_t arg) const { + return operands_[arg].stride_bytes; + } + void* data_ptr(int64_t arg) const; + ScalarType dtype(int64_t arg = 0) const { + return operands_[arg].current_dtype; + } + ScalarType common_dtype() const { + TORCH_INTERNAL_ASSERT( + common_dtype_ != ScalarType::Undefined, + "Queried for invalid common dtype!"); + return common_dtype_; + } + ScalarType input_dtype(int64_t arg = 0) const { + return operands_[num_outputs_ + arg].current_dtype; + } + Device device(int64_t arg = 0) const { + return operands_[arg].device.value(); + } + c10::DeviceType device_type(int64_t arg = 0) const { + return device(arg).type(); + } + int64_t element_size(int64_t arg) const { + return static_cast(elementSize(dtype(arg))); + } + bool is_scalar(int64_t arg) const; + bool is_cpu_scalar(int64_t arg) const; + + const 
TensorBase& tensor_base(int64_t arg) const { + return operands_[arg].tensor_base(); + } + const Tensor& tensor(int64_t arg) const { + return operands_[arg].tensor(); + } + + const TensorBase& output_base(int64_t arg = 0) const { + AT_ASSERT(arg < num_outputs_); + return tensor_base(arg); + } + + const Tensor& output(int64_t arg = 0) const { + AT_ASSERT(arg < num_outputs_); + return tensor(arg); + } + + const TensorBase& input_base(int64_t arg = 0) const { + AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); + return tensor_base(num_outputs_ + arg); + } + const Tensor& input(int64_t arg = 0) const { + AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); + return tensor(num_outputs_ + arg); + } + + // Copies from temporary outputs back to the original outputs + // NOTE: only used on CPU + void cast_outputs(); + + /// Removes an operand from this iterator + void remove_operand(int64_t arg); + /// Shrinks an iterated dimension + void narrow(int dim, int64_t start, int64_t size); + /// Narrows every dim after and including `start_dim` to size one. + void select_all_keeping_dim(int start_dim, IntArrayRef starts); + /// Replaces the data pointer for the operand at index `arg`. + /// The new pointer should have the same sizes, strides and dtype as the + /// original + void unsafe_replace_operand(int64_t arg, void* data); + + /// Splits this TensorIterator into two iterators. Together they iterate over + /// the entire operation. Used by `with_32bit_indexing()`. + std::unique_ptr split(int dim); + + /// Returns the dimension with the largest extent: (size[dim]-1) * stride[dim] + int get_dim_to_split() const; + + template + T scalar_value(int64_t arg) { + auto& op = operands_[arg]; + return c10::fetch_and_cast(op.tensor_base().scalar_type(), op.data); + } + + /// Return scalar value from original_tensor_base if it is defined. When + /// common_dtype is Half, casting scalar input to common_dtype might overflow. 
+ /// If the scalar is aleady given in the type of Half, then return scalar + /// value from tensor_base. + template + T original_scalar_value(int64_t arg) { + auto& original_tensor_base = operands_[arg].original_tensor_base(); + if (original_tensor_base.defined()) { + TORCH_INTERNAL_ASSERT( + original_tensor_base.scalar_type() != common_dtype()); + return c10::fetch_and_cast( + original_tensor_base.scalar_type(), + original_tensor_base.const_data_ptr()); + } else { + return scalar_value(arg); + } + } + + private: + template + auto loop_2d_from_1d(const loop1d_t& loop) { + return + [loop, ntensor = ntensors()]( + char** base, const int64_t* strides, int64_t size0, int64_t size1) { + PtrVector data(base, base + ntensor); + const int64_t* outer_strides = &strides[ntensor]; + for (const auto i : c10::irange(size1)) { + if (i > 0) { + for (const auto arg : c10::irange(ntensor)) { + data[arg] += outer_strides[arg]; + } + } + loop(data.data(), strides, size0); + } + }; + } + + public: + template < + typename loop1d_t, + std::enable_if_t< + std::is_convertible_v< + loop1d_t, + c10::function_ref< + void(char**, const int64_t* strides, int64_t size)>>, + int> = 0> + void for_each(loop1d_t loop, int64_t grain_size = at::internal::GRAIN_SIZE) { + for_each(loop_2d_from_1d(loop), grain_size); + } + + void for_each(loop2d_t loop, int64_t grain_size = at::internal::GRAIN_SIZE); + + void parallel_reduce(loop2d_t loop); + + template < + typename loop1d_t, + std::enable_if_t< + std::is_convertible_v< + loop1d_t, + c10::function_ref< + void(char**, const int64_t* strides, int64_t size)>>, + int> = 0> + void serial_for_each(loop1d_t loop, Range range) { + serial_for_each(loop_2d_from_1d(loop), range); + } + + void serial_for_each(loop2d_t loop, Range range) const; + + /// Create a strides array for a Tensor with shape of this iterator. The + /// parameter `element_size` specifies the size of Tensor's data type in + /// bytes (e.g. 
`4` for `float`) + StrideVector compatible_stride(int64_t element_size) const; + + /// Inverts the re-ordering done by reorder_dimensions. This can only be + /// called *before* coalesce_dimensions() is called. + DimVector invert_perm(IntArrayRef input) const; + + /// Reapply same re-ordering as it is done by reorder_dimensions. This can + /// only be called *before* coalesce_dimensions() is called. + DimVector apply_perm_and_mul(IntArrayRef input, int mul) const; + + /// Helper functions for CPU iteration + StrideVector get_dim_strides(int dim) const; + StrideVector get_strides() const; + StrideVector get_inner_strides() const { + return get_dim_strides(0); + } + PtrVector get_base_ptrs() const; + + // Helper functions for advanced stride manipulations (e.g. torch.flip) + void _unsafe_set_arg_strides(const int64_t arg, IntArrayRef strides) { + operands_[arg].stride_bytes = strides; + } + void _unsafe_set_arg_data(const int64_t arg, void* data) { + operands_[arg].data = data; + } + + /// true if the stride computation can use 32-bit arithmetic. Used by GPU + /// kernels + bool can_use_32bit_indexing() const; + + /// An "iteratable" object that recursively splits this iterator into + /// sub-iterators that can use 32-bit indexing. + SplitUntil32Bit with_32bit_indexing() const; + + /// If the kernel should accumulate into the output. Only relevant for CUDA + /// reductions. + bool should_accumulate() const { + return accumulate_; + } + + /// Whether this iterator produces the actual output, + /// as opposed to something that will be accumulated further. Only relevant + /// for CUDA reductions. 
+ bool is_final_output() const { + return final_output_; + } + + bool has_contiguous_first_dim() const { + if (ndim() == 0) { + return true; + } + + int num_tensors = ntensors(); + for (const auto i : c10::irange(num_tensors)) { + if (strides(i)[0] != element_size(i)) { + return false; + } + } + return true; + } + + void set_output_raw_strided( + int64_t output_idx, + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options, + DimnameList names) override; + +#define TORCH_DISALLOW_TEMPORARIES_IMPL(methodname, maybestatic) \ + maybestatic void methodname( \ + TensorBase&& out, const TensorBase& a, const TensorBase& b) = delete; \ + maybestatic void methodname( \ + const TensorBase& out, TensorBase&& a, const TensorBase& b) = delete; \ + maybestatic void methodname( \ + const TensorBase& out, const TensorBase& a, TensorBase&& b) = delete; \ + maybestatic void methodname( \ + TensorBase&& out, TensorBase&& a, const TensorBase& b) = delete; \ + maybestatic void methodname( \ + TensorBase&& out, const TensorBase& a, TensorBase&& b) = delete; \ + maybestatic void methodname( \ + const TensorBase& out, TensorBase&& a, TensorBase&& b) = delete; \ + maybestatic void methodname( \ + TensorBase&& out, TensorBase&& a, TensorBase&& b) = delete; + +#define TORCH_DISALLOW_TEMPORARIES(methodname) \ + TORCH_DISALLOW_TEMPORARIES_IMPL(methodname, ) + + void build_binary_float_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + void build_borrowing_binary_float_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + TORCH_DISALLOW_TEMPORARIES(build_borrowing_binary_float_op) + void build_binary_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + void build_borrowing_binary_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + TORCH_DISALLOW_TEMPORARIES(build_borrowing_binary_op) + void build_unary_float_op(const TensorBase& out, const TensorBase& a); + void 
build_borrowing_unary_float_op( + const TensorBase& out, + const TensorBase& a); + TORCH_DISALLOW_TEMPORARIES(build_borrowing_unary_float_op) + void build_unary_op(const TensorBase& out, const TensorBase& a); + // Odd special case needed for pow. Has to borrow the output because + // it's a structured kernel, but the argument is potentially a copy. + void build_output_borrowing_argument_owning_unary_op( + const TensorBase& out, + const TensorBase& a); + void build_borrowing_unary_op(const TensorBase& out, const TensorBase& a); + TORCH_DISALLOW_TEMPORARIES(build_borrowing_unary_op) + void build_borrowing_unary_force_boolean_op( + const TensorBase& out, + const TensorBase& a); + TORCH_DISALLOW_TEMPORARIES(build_borrowing_unary_force_boolean_op) + void build_comparison_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + void build_borrowing_comparison_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + TORCH_DISALLOW_TEMPORARIES(build_borrowing_comparison_op) + // Another special case: we need to own the second argument for comparison + // ops. 
+ void build_borrowing_except_last_argument_comparison_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + void build_ternary_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b, + const TensorBase& c); + +#undef TORCH_DISALLOW_TEMPORARIES + protected: + // Mutable reference as it moves tensors out of TensorIteratorConfig + void populate_operands(TensorIteratorConfig&); + void mark_outputs(); + void mark_resize_outputs(const TensorIteratorConfig&); + void compute_mem_overlaps(const TensorIteratorConfig&); + void compute_shape(const TensorIteratorConfig&); + void compute_strides(const TensorIteratorConfig&); + void reorder_dimensions(); + void permute_dimensions(IntArrayRef perm); + void compute_types(const TensorIteratorConfig&); + ScalarType compute_common_dtype(); + void allocate_or_resize_outputs(); + bool fast_set_up(const TensorIteratorConfig&); + FastSetupType compute_fast_setup_type(const TensorIteratorConfig&); + void compute_names(const TensorIteratorConfig&); + void propagate_names_to_outputs(); + void coalesce_dimensions(); + + protected: + /// Records the "computation" shape of the output tensor. The computation + /// shape is different from the regular shape in a few ways: + /// + /// - The shape may be permuted (via permute_dimensions) so that we + /// process the dimensions in the most computationally efficient order + /// (rather than the logical order given to us by the users.) + /// - The shape may have adjacent dimensions collapsed (via + /// coalesce_dimensions) so that we minimize the number of + /// dimensions we have to explicitly iterate over. For example, + /// a pointwise operation on a contiguous tensor "computationally" + /// consists of only a single dimension. + /// + /// In other words, the computation shape is the output shape as it + /// actually matters for implementing the kernel, but not necessarily the + /// output shape that the user will see in the end. 
+ /// + /// The lifecycle of mutations to shape_ in TensorIterator: + /// - declare_static_shape() sets an initial shape explicitly + /// provided by user, otherwise + /// - compute_shape() computes the true (non-computational) shape + /// specified by the user. + /// - reorder_dimensions() reorders dimensions to improve coalescing. + /// - coalesce_dimensions() then coalesces adjacent dimensions when + /// possible. + /// + /// The shape may also be further modified if we create sub-TensorIterators, + /// e.g., via narrow or select_all_keeping_dim. + DimVector shape_; + + /// Temporarily records the permutation computed by reorder_dimensions. + /// This permutation maps the computation output dimension (dim) to + /// the original true output dimension (perm_[dim]). It is used by + /// invert_perm to undo the permutation. After coalesce_dimensions is + /// called, the permutation is no longer valid (as, in general, there + /// is no permutation that will make computation dimensions to + /// output dimensions); methods that manipulate perm_ are obligated + /// to test that !has_coalesced_dimensions + DimVector perm_; + + /// Has coalesce_dimensions() (or any moral equivalent, e.g., fast_build()) + /// been called? This is SOLELY used to check validity of perm_. + bool has_coalesced_dimensions_ = false; + + /// Whether iteration must be fixed. This disables dimension permuting and + /// also changes how for_each divides work among threads. + bool enforce_linear_iteration_ = false; + + /// The index offsets into the original tensors for each dimension. + /// This is only non-zero when you narrow() a TensorIterator (e.g., + /// when you make sub-TensorIterators). + DimVector view_offsets_; + + /// The computed names of the output tensor. Computed by compute_names() + NameVector names_; + + /// The operands of the TensorIterator: both the inputs and outputs. The + /// outputs MUST come first in the operands_ list. 
There is always an + /// operand for each output of the TensorIterator, even if TensorIterator + /// will ultimately be responsible for allocating the output; in those + /// cases, tensor is simply undefined (and will be populated later + /// during build()). + /// + /// This list is initially populated prior to build(), but build() mutates + /// OperandInfo to populate more information. + SmallVector operands_; + + /// Number of outputs in operands_ (the length of the outputs prefix + /// in operands_). + int num_outputs_ = 0; + + /// Whether or not all operands have the same shape and are 1d+. Having all + /// the same shape affects whether or not the iterator is eligible for fast + /// setup. + bool all_ops_same_shape_ = false; + /// Whether or not all operands are 0d, this affects type promotion + bool all_ops_are_scalars_ = false; + + /// The "computation" dtype of TensorIterator, specifying what the dtype + /// we will do the internal computation in TensorIterator. Typically, + /// this matches the dtype of the output tensors, but not always! + ScalarType common_dtype_ = ScalarType::Undefined; + + /// This is currently defined as kCPU, or the device of the first non-CPU + /// tensor argument. See TensorIteratorBase::compute_types for details. 
+ Device common_device_ = kCPU; + + /// Set by split(), see should_accumulate() and is_final_output() + bool accumulate_ = false; + bool final_output_ = true; + + // From TensorIteratorConfig + bool is_reduction_ = false; + + /// Set by populate_operands(), says if we're handling meta tensors + bool is_meta_ = false; +}; + +struct TORCH_API TensorIterator final : public TensorIteratorBase { + TensorIterator() : TensorIteratorBase() {} + // Slicing is OK, TensorIterator guaranteed NOT to have any fields + TensorIterator(const TensorIteratorBase& iter) : TensorIteratorBase(iter) {} + +#define TORCH_DISALLOW_TEMPORARIES(methodname) \ + TORCH_DISALLOW_TEMPORARIES_IMPL(methodname, static) + + static TensorIterator binary_float_op( + TensorBase& out, + const TensorBase& a, + const TensorBase& b); + static TensorIterator binary_op( + TensorBase& out, + const TensorBase& a, + const TensorBase& b); + static TensorIterator borrowing_binary_op( + const TensorBase& out, + const TensorBase& a, + const TensorBase& b); + TORCH_DISALLOW_TEMPORARIES(borrowing_binary_op) + static TensorIterator comparison_op( + TensorBase& out, + const TensorBase& a, + const TensorBase& b); + static TensorIterator unary_op(TensorBase& out, const TensorBase& a); + static TensorIterator unary_float_op(TensorBase& out, const TensorBase& a); + static TensorIterator nullary_op(TensorBase& out); + static TensorIterator borrowing_nullary_op(const TensorBase& out); + static TensorIterator borrowing_nullary_op(TensorBase&& out) = delete; + static TensorIterator reduce_op(TensorBase& out, const TensorBase& a); + static TensorIterator reduce_op( + TensorBase& out1, + TensorBase& out2, + const TensorBase& a); +#undef TORCH_DISALLOW_TEMPORARIES +#undef TORCH_DISALLOW_TEMPORARIES_IMPL + + const Tensor& maybe_get_output(int64_t output_idx) override; + void set_output_raw_strided( + int64_t output_idx, + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options, + DimnameList names) override; +}; + +class 
TORCH_API TensorIteratorConfig final { + public: + friend struct TensorIteratorBase; + friend struct TensorIterator; + + TensorIteratorConfig() = default; + + C10_DISABLE_COPY_AND_ASSIGN(TensorIteratorConfig); + + /// Construction + // Stores input/output Tensors without incrementing the reference count. + // Important: the outputs have to be added before the inputs. + TensorIteratorConfig& add_output(const TensorBase& output) { + return add_borrowed_output(output); + } + TensorIteratorConfig& add_input(const TensorBase& input) { + return add_borrowed_input(input); + } + TensorIteratorConfig& add_const_input(const TensorBase& input) { + return add_borrowed_const_input(input); + } + + // Borrowing from temporaries is unlikely to go well. + TensorIteratorConfig& add_output(TensorBase&& output) = delete; + TensorIteratorConfig& add_input(TensorBase&& input) = delete; + TensorIteratorConfig& add_const_input(TensorBase&& input) = delete; + + // Stores input/output Tensors while incrementing the reference count. + // Note that add_{in,out}put are nearly always what you + // want, and the exception (adding an unnamed temporary) won't + // compile. + TensorIteratorConfig& add_owned_output(const TensorBase& output); + TensorIteratorConfig& add_owned_input(const TensorBase& input); + TensorIteratorConfig& add_owned_const_input(const TensorBase& input); + + // Advanced API: stores input/output Tensors without incrementing + // the reference count. The caller must ensure that these Tensors + // live at least as long as this TensorIteratorConfig and any + // TensorIteratorBase built from this TensorIteratorConfig. + // Important: the outputs have to be added before the inputs. + TensorIteratorConfig& add_borrowed_output(const TensorBase& output); + TensorIteratorConfig& add_borrowed_input(const TensorBase& input); + TensorIteratorConfig& add_borrowed_const_input(const TensorBase& input); + + // Borrowing from temporaries is unlikely to go well. 
+ TensorIteratorConfig& add_borrowed_output(TensorBase&& output) = delete; + TensorIteratorConfig& add_borrowed_input(TensorBase&& input) = delete; + TensorIteratorConfig& add_borrowed_const_input(TensorBase&& input) = delete; + + // Sets the check_mem_overlap_ flag, which is true by default. + // If true, inputs are checked for partial overlap with the outputs and + // outputs are checked for internal overlap (e.g. broadcasted views). An error + // is raised if unacceptable overlap is detected. + // If you're migrating an existing operator to using TensorIterator, please + // consider if the previous implementation checked memory overlap. If it did + // not, and if the operator is idempotent (for example, Tensor.fill_(0)), then + // checking memory overlap is BC-breaking. Please don't check memory overlap + // in that case. + TensorIteratorConfig& set_check_mem_overlap(bool check_mem_overlap) { + check_mem_overlap_ = check_mem_overlap; + return *this; + } + + // Sets the check_all_same_dtype_ flag, which is true by default + // If true, checks that all inputs and defined outputs have the same dtype + // Setting either of promote_inputs_to_common_dtype_ + // or cast_common_dtype_to_outputs_ to true will set + // check_all_same_dtype_ to false. + TensorIteratorConfig& check_all_same_dtype(const bool _check_all_same_dtype) { + check_all_same_dtype_ = _check_all_same_dtype; + return *this; + } + + // Sets the check_all_same_device_ flag, which is true by default + // If true, all operands must be on the same device, with the possible + // exception of CPU scalars, which can be passed to some CUDA kernels + // as kernel arguments. 
+ TensorIteratorConfig& check_all_same_device( + const bool _check_all_same_device) { + check_all_same_device_ = _check_all_same_device; + return *this; + } + + // Sets the enforce_safe_casting_to_output_ flag, which is false by default + // If true, the iterator's "common dtype" must be computable + // (see the [Common Dtype Computation] note) and + // canCast(common dtype, output dtype) must be true for all outputs. + TensorIteratorConfig& enforce_safe_casting_to_output( + const bool _enforce_safe_casting_to_output) { + enforce_safe_casting_to_output_ = _enforce_safe_casting_to_output; + return *this; + } + + // Sets the enforce_linear_iteration_ flag, which is false by default. + // If true, iteration goes in the same order as a C-contiguous tensor + // is layed out in memory. i.e. last dimension iterates fastest. + // + // This iteration order can be less efficient and may even prevent + // vectorization. So only use if the correctness of your kernel depends on it. + TensorIteratorConfig& enforce_linear_iteration( + const bool _enforce_linear_iteration = true) { + enforce_linear_iteration_ = _enforce_linear_iteration; + return *this; + } + + // Sets the promote_inputs_to_common_dtype_ flag, which is false by default + // If true, the iterator's "common dtype" is always computed (see the + // [Common Dtype Computation] note) and, on the CPU, temporary copies of + // the inputs in the common dtype are passed as the actual inputs to + // the operation. + // Setting this flag to true sets check_all_same_dtype_ to false. + TensorIteratorConfig& promote_inputs_to_common_dtype( + const bool _promote_inputs_to_common_dtype) { + promote_inputs_to_common_dtype_ = _promote_inputs_to_common_dtype; + if (_promote_inputs_to_common_dtype) { + check_all_same_dtype_ = false; + } + return *this; + } + + // Sets the promote_integer_inputs_to_float_ flag, which is false by default + // NOTE: If set to true, the promote_inputs_to_common_dtype_ must also be + // true. 
If true, if the iterator's "common dtype" is an integral type + // (including bool) + // then it is changed to the default float scalar type. + TensorIteratorConfig& promote_integer_inputs_to_float( + const bool _promote_integer_inputs_to_float) { + promote_integer_inputs_to_float_ = _promote_integer_inputs_to_float; + TORCH_INTERNAL_ASSERT( + !promote_integer_inputs_to_float_ || promote_inputs_to_common_dtype_); + return *this; + } + + TensorIteratorConfig& is_reduction(const bool _is_reduction) { + is_reduction_ = _is_reduction; + return *this; + } + + TensorIteratorConfig& allow_cpu_scalars(const bool _allow_cpu_scalars) { + allow_cpu_scalars_ = _allow_cpu_scalars; + return *this; + } + + // Sets the cast_common_dtype_to_outputs_ flag, which is false by default + // If true, the iterator's "common dtype" must be computatable + // (see the [Common Dtype Computation] note) and, on the CPU, temporary + // copies of the outputs are passed as the actual output to the operation. + // These temporaries are then copied to the original outputs after + // the operation is performed (see cast_outputs()). + // Setting this flag to true sets check_all_same_dtype_ to false. + TensorIteratorConfig& cast_common_dtype_to_outputs( + const bool _cast_common_dtype_to_outputs) { + cast_common_dtype_to_outputs_ = _cast_common_dtype_to_outputs; + if (_cast_common_dtype_to_outputs) { + check_all_same_dtype_ = false; + } + return *this; + } + + TensorIteratorConfig& resize_outputs(bool resize_outputs) { + resize_outputs_ = resize_outputs; + return *this; + } + + // Bypass output dtype/device computation and fix the dtype/device as + // specified here. 
+ TensorIteratorConfig& declare_static_dtype_and_device( + ScalarType dtype, + Device device); + TensorIteratorConfig& declare_static_dtype(ScalarType dtype); + TensorIteratorConfig& declare_static_device(Device device); + TensorIteratorConfig& declare_static_shape(IntArrayRef shape); + TensorIteratorConfig& declare_static_shape( + IntArrayRef shape, + IntArrayRef squash_dims); + + // It would be better if this was && qualified, but this would be at the cost + // of a lot of boilerplate above + TensorIterator build() { + TensorIterator iter; + iter.build(*this); + return iter; + } + + private: + bool is_tensor_const(size_t idx); + + SmallVector, 4> tensors_; + int num_outputs_ = 0; + int num_inputs_ = 0; + + c10::optional static_shape_ = c10::nullopt; + c10::optional static_dtype_ = c10::nullopt; + c10::optional static_device_ = c10::nullopt; + bool check_mem_overlap_ = true; + bool allow_cpu_scalars_ = false; + bool is_reduction_ = false; + bool resize_outputs_ = true; + bool check_all_same_dtype_ = true; + bool check_all_same_device_ = true; + bool enforce_safe_casting_to_output_ = false; + bool enforce_linear_iteration_ = false; + bool promote_inputs_to_common_dtype_ = false; + bool promote_integer_inputs_to_float_ = false; + bool cast_common_dtype_to_outputs_ = false; + + SmallVector const_tensor_indices_; +}; + +/// A container-like struct that acts as if it contains splits of a +/// TensorIterator that can use 32-bit indexing. Taken together the splits cover +/// the original TensorIterator. +struct TORCH_API SplitUntil32Bit { + struct TORCH_API iterator { + iterator() = default; + iterator(const TensorIteratorBase& iter); + iterator(iterator&&) = default; + + // Guaranteed to be a TensorIterator proper! 
+ TensorIterator& operator*() const; + iterator& operator++(); + bool operator==(const iterator& other) const { + // two iterators are equal if they are the same object or they're both + // empty + return this == &other || (vec.empty() && other.vec.empty()); + } + // needed for C++11 range-based for loop + bool operator!=(const iterator& other) const { + return !(*this == other); + } + + /// stack of TensorIterators to be split + std::vector> vec; + }; + + SplitUntil32Bit(const TensorIteratorBase& iter) : iter(iter) {} + + iterator begin() const; + iterator end() const; + + private: + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) + const TensorIteratorBase& iter; +}; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/div_rtn.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/div_rtn.h new file mode 100644 index 0000000000000000000000000000000000000000..4935f49ae2726389441e4012cc15bcf3981f2e84 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/div_rtn.h @@ -0,0 +1,11 @@ +#pragma once + +// Integer division rounding to -Infinity +template +static inline T div_rtn(T x, T y) { + int q = x / y; + int r = x % y; + if ((r != 0) && ((r < 0) != (y < 0))) + --q; + return q; +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/jit_macros.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/jit_macros.h new file mode 100644 index 0000000000000000000000000000000000000000..9af826549021a0853beb83c74b6ac695728ab054 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/jit_macros.h @@ -0,0 +1,7 @@ +#pragma once +#include +#include + +// AT_USE_JITERATOR(), controls whether we jit some elementwise kernels +#define AT_USE_JITERATOR() true +#define jiterator_stringify(...) 
std::string(#__VA_ARGS__); diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/jiterator_macros.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/jiterator_macros.h new file mode 100644 index 0000000000000000000000000000000000000000..3aa4c7ebb0af07fd65012d9d531aaa140dd6c212 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/jiterator_macros.h @@ -0,0 +1,38 @@ +#pragma once +#include +#include + +#define JITERATOR_HOST_DEVICE C10_HOST_DEVICE +#if defined(_MSC_VER) && defined(__CUDACC__) +// NVRTC on Windows errors if __host__ __device__ attribute is +// present on kernel. +// error: attribute "__host__" does not apply here +// error: attribute "__device__" does not apply here +#define JITERATOR_HOST_DEVICE +#endif + +// jiterator_also_stringify_as macro is used to define code (for CPU/ROCm) +// and generate code string for `jiterator` (only when compiling for CUDA). +// Usage : +// jiterator_also_stringify_as( +// jiterator_code(template T identity(T x) { return x; }), +// identity_string); +// This will define the template `identity` as present in code and +// also define `std::string identity_string` with the code as the string +// if this is being compiled for CUDA. + +// `jiterator_code` macro is to deal with `,` in the kernel code. +// These `,`s confuse the preprocessor into thinking we are passing +// multiple arguments to the macro. +#define jiterator_code(...) __VA_ARGS__ +#if defined(__CUDACC__) || defined(__HIPCC__) +// CPU and CUDA and ROCm case +#define stringify_code(...) 
#__VA_ARGS__ +#define jiterator_also_stringify_as(code, str_name) \ + code /* define the function */ \ + const std::string str_name = std::string(stringify_code(code)); +#else +// CPU only or CPU and ROCm case +// Only needs the function +#define jiterator_also_stringify_as(code, str_name) code +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/caffe2/serialize/crc_alt.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/caffe2/serialize/crc_alt.h new file mode 100644 index 0000000000000000000000000000000000000000..9d1c4f1dc7ddc8997f7cc1297ef20d74de67afe0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/caffe2/serialize/crc_alt.h @@ -0,0 +1,1343 @@ +#pragma once + +// ////////////////////////////////////////////////////////// +// Crc32.h +// Copyright (c) 2011-2019 Stephan Brumme. All rights reserved. +// Slicing-by-16 contributed by Bulat Ziganshin +// Tableless bytewise CRC contributed by Hagai Gold +// see http://create.stephan-brumme.com/disclaimer.html +// + +// if running on an embedded system, you might consider shrinking the +// big Crc32Lookup table by undefining these lines: +#define CRC32_USE_LOOKUP_TABLE_BYTE +#define CRC32_USE_LOOKUP_TABLE_SLICING_BY_4 +#define CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 +#define CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +// - crc32_bitwise doesn't need it at all +// - crc32_halfbyte has its own small lookup table +// - crc32_1byte_tableless and crc32_1byte_tableless2 don't need it at all +// - crc32_1byte needs only Crc32Lookup[0] +// - crc32_4bytes needs only Crc32Lookup[0..3] +// - crc32_8bytes needs only Crc32Lookup[0..7] +// - crc32_4x8bytes needs only Crc32Lookup[0..7] +// - crc32_16bytes needs all of Crc32Lookup +// using the aforementioned #defines the table is automatically fitted to your needs + +// uint8_t, uint32_t, int32_t +#include +// size_t +#include + +// crc32_fast selects the fastest 
algorithm depending on flags (CRC32_USE_LOOKUP_...) +/// compute CRC32 using the fastest algorithm for large datasets on modern CPUs +uint32_t crc32_fast (const void* data, size_t length, uint32_t previousCrc32 = 0); + +/// merge two CRC32 such that result = crc32(dataB, lengthB, crc32(dataA, lengthA)) +uint32_t crc32_combine (uint32_t crcA, uint32_t crcB, size_t lengthB); + +/// compute CRC32 (bitwise algorithm) +uint32_t crc32_bitwise (const void* data, size_t length, uint32_t previousCrc32 = 0); +/// compute CRC32 (half-byte algoritm) +uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32 = 0); + +#ifdef CRC32_USE_LOOKUP_TABLE_BYTE +/// compute CRC32 (standard algorithm) +uint32_t crc32_1byte (const void* data, size_t length, uint32_t previousCrc32 = 0); +#endif + +/// compute CRC32 (byte algorithm) without lookup tables +uint32_t crc32_1byte_tableless (const void* data, size_t length, uint32_t previousCrc32 = 0); +/// compute CRC32 (byte algorithm) without lookup tables +uint32_t crc32_1byte_tableless2(const void* data, size_t length, uint32_t previousCrc32 = 0); + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_4 +/// compute CRC32 (Slicing-by-4 algorithm) +uint32_t crc32_4bytes (const void* data, size_t length, uint32_t previousCrc32 = 0); +#endif + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 +/// compute CRC32 (Slicing-by-8 algorithm) +uint32_t crc32_8bytes (const void* data, size_t length, uint32_t previousCrc32 = 0); +/// compute CRC32 (Slicing-by-8 algorithm), unroll inner loop 4 times +uint32_t crc32_4x8bytes(const void* data, size_t length, uint32_t previousCrc32 = 0); +#endif + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +/// compute CRC32 (Slicing-by-16 algorithm) +uint32_t crc32_16bytes (const void* data, size_t length, uint32_t previousCrc32 = 0); +/// compute CRC32 (Slicing-by-16 algorithm, prefetch upcoming data blocks) +uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previousCrc32 = 0, size_t 
prefetchAhead = 256); +#endif + +// ////////////////////////////////////////////////////////// +// Crc32.cpp +// Copyright (c) 2011-2019 Stephan Brumme. All rights reserved. +// Slicing-by-16 contributed by Bulat Ziganshin +// Tableless bytewise CRC contributed by Hagai Gold +// see http://create.stephan-brumme.com/disclaimer.html +// + +// if running on an embedded system, you might consider shrinking the +// big Crc32Lookup table: +// - crc32_bitwise doesn't need it at all +// - crc32_halfbyte has its own small lookup table +// - crc32_1byte needs only Crc32Lookup[0] +// - crc32_4bytes needs only Crc32Lookup[0..3] +// - crc32_8bytes needs only Crc32Lookup[0..7] +// - crc32_4x8bytes needs only Crc32Lookup[0..7] +// - crc32_16bytes needs all of Crc32Lookup + + +#ifndef __LITTLE_ENDIAN + #define __LITTLE_ENDIAN 1234 +#endif +#ifndef __BIG_ENDIAN + #define __BIG_ENDIAN 4321 +#endif + +// define endianess and some integer data types +#if defined(_MSC_VER) || defined(__MINGW32__) + // Windows always little endian + #define __BYTE_ORDER __LITTLE_ENDIAN + + // intrinsics / prefetching + #if defined(_M_ARM64) + #include + #else + #include + #endif + + #ifdef __MINGW32__ + #define PREFETCH(location) __builtin_prefetch(location) + #else + #if defined(_M_ARM64) + #define PREFETCH(location) __prefetch(location) + #else + #define PREFETCH(location) _mm_prefetch(location, _MM_HINT_T0) + #endif + #endif +#elif defined(__APPLE__) + #include + #if TARGET_IPHONE_SIMULATOR + #define __BYTE_ORDER __LITTLE_ENDIAN + #elif TARGET_OS_IPHONE + #define __BYTE_ORDER __LITTLE_ENDIAN + #elif TARGET_OS_MAC + #include + #if defined(__BIG_ENDIAN__) + #define __BYTE_ORDER __BIG_ENDIAN + #endif + #if defined(__LITTLE_ENDIAN__) + #define __BYTE_ORDER __LITTLE_ENDIAN + #endif + #else + # error "Unknown Apple platform" + #endif +#elif defined(__ARMEB__) + #define __BYTE_ORDER __BIG_ENDIAN +#elif (defined(__BYTE_ORDER__) and !defined(__BYTE_ORDER)) + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + 
#define __BYTE_ORDER __BIG_ENDIAN + #else + #define __BYTE_ORDER __LITTLE_ENDIAN + #endif +#else + // defines __BYTE_ORDER as __LITTLE_ENDIAN or __BIG_ENDIAN + #include +#endif + +// intrinsics / prefetching +#ifdef __GNUC__ + #define PREFETCH(location) __builtin_prefetch(location) +#else +#ifndef PREFETCH + // no prefetching + #define PREFETCH(location) ; +#endif +#endif + +// abort if byte order is undefined +#ifndef __BYTE_ORDER +#error undefined byte order, compile with -D__BYTE_ORDER=1234 (if little endian) or -D__BYTE_ORDER=4321 (big endian) +#endif + + +namespace +{ + /// zlib's CRC32 polynomial + const uint32_t Polynomial = 0xEDB88320; + + /// swap endianess + static inline uint32_t swap(uint32_t x) + { + #if defined(__GNUC__) || defined(__clang__) + return __builtin_bswap32(x); + #else + return (x >> 24) | + ((x >> 8) & 0x0000FF00) | + ((x << 8) & 0x00FF0000) | + (x << 24); + #endif + } + + /// Slicing-By-16 + #ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 + const size_t MaxSlice = 16; + #elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) + const size_t MaxSlice = 8; + #elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) + const size_t MaxSlice = 4; + #elif defined(CRC32_USE_LOOKUP_TABLE_BYTE) + const size_t MaxSlice = 1; + #else + #define NO_LUT // don't need Crc32Lookup at all + #endif + +} // anonymous namespace + +#ifndef NO_LUT +/// forward declaration, table is at the end of this file +extern const uint32_t Crc32Lookup[MaxSlice][256]; // extern is needed to keep compiler happy +#endif + + +/// compute CRC32 (bitwise algorithm) +uint32_t crc32_bitwise(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint8_t* current = (const uint8_t*) data; + + while (length-- != 0) + { + crc ^= *current++; + + for (int j = 0; j < 8; j++) + { + // branch-free + crc = (crc >> 1) ^ (-int32_t(crc & 1) & Polynomial); + + // branching, much slower: + //if (crc & 1) + // crc = (crc >> 1) ^ 
Polynomial; + //else + // crc = crc >> 1; + } + } + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +/// compute CRC32 (half-byte algoritm) +uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint8_t* current = (const uint8_t*) data; + + /// look-up table for half-byte, same as crc32Lookup[0][16*i] + static const uint32_t Crc32Lookup16[16] = + { + 0x00000000,0x1DB71064,0x3B6E20C8,0x26D930AC,0x76DC4190,0x6B6B51F4,0x4DB26158,0x5005713C, + 0xEDB88320,0xF00F9344,0xD6D6A3E8,0xCB61B38C,0x9B64C2B0,0x86D3D2D4,0xA00AE278,0xBDBDF21C + }; + + while (length-- != 0) + { + crc = Crc32Lookup16[(crc ^ *current ) & 0x0F] ^ (crc >> 4); + crc = Crc32Lookup16[(crc ^ (*current >> 4)) & 0x0F] ^ (crc >> 4); + current++; + } + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +#ifdef CRC32_USE_LOOKUP_TABLE_BYTE +/// compute CRC32 (standard algorithm) +uint32_t crc32_1byte(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint8_t* current = (const uint8_t*) data; + + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *current++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} +#endif + + +/// compute CRC32 (byte algorithm) without lookup tables +uint32_t crc32_1byte_tableless(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint8_t* current = (const uint8_t*) data; + + while (length-- != 0) + { + uint8_t s = uint8_t(crc) ^ *current++; + + // Hagai Gold made me aware of this table-less algorithm and send me code + + // polynomial 0xEDB88320 can be written in binary as 11101101101110001000001100100000b + // reverse the bits (or just assume bit 0 is the first one) + // and we have bits set at position 0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 16, 22, 23, 26 + // => those are the shift 
offsets: + //crc = (crc >> 8) ^ + // t ^ + // (t >> 1) ^ (t >> 2) ^ (t >> 4) ^ (t >> 5) ^ // == y + // (t >> 7) ^ (t >> 8) ^ (t >> 10) ^ (t >> 11) ^ // == y >> 6 + // (t >> 12) ^ (t >> 16) ^ // == z + // (t >> 22) ^ (t >> 26) ^ // == z >> 10 + // (t >> 23); + + // the fastest I can come up with: + uint32_t low = (s ^ (s << 6)) & 0xFF; + uint32_t a = (low * ((1 << 23) + (1 << 14) + (1 << 2))); + crc = (crc >> 8) ^ + (low * ((1 << 24) + (1 << 16) + (1 << 8))) ^ + a ^ + (a >> 1) ^ + (low * ((1 << 20) + (1 << 12) )) ^ + (low << 19) ^ + (low << 17) ^ + (low >> 2); + + // Hagai's code: + /*uint32_t t = (s ^ (s << 6)) << 24; + // some temporaries to optimize XOR + uint32_t x = (t >> 1) ^ (t >> 2); + uint32_t y = x ^ (x >> 3); + uint32_t z = (t >> 12) ^ (t >> 16); + crc = (crc >> 8) ^ + t ^ (t >> 23) ^ + y ^ (y >> 6) ^ + z ^ (z >> 10);*/ + } + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +/// compute CRC32 (byte algorithm) without lookup tables +uint32_t crc32_1byte_tableless2(const void* data, size_t length, uint32_t previousCrc32) +{ + int32_t crc = ~previousCrc32; // note: signed integer, right shift distributes sign bit into lower bits + const uint8_t* current = (const uint8_t*) data; + + while (length-- != 0) + { + crc = crc ^ *current++; + + uint32_t c = (((crc << 31) >> 31) & ((Polynomial >> 7) ^ (Polynomial >> 1))) ^ + (((crc << 30) >> 31) & ((Polynomial >> 6) ^ Polynomial)) ^ + (((crc << 29) >> 31) & (Polynomial >> 5)) ^ + (((crc << 28) >> 31) & (Polynomial >> 4)) ^ + (((crc << 27) >> 31) & (Polynomial >> 3)) ^ + (((crc << 26) >> 31) & (Polynomial >> 2)) ^ + (((crc << 25) >> 31) & (Polynomial >> 1)) ^ + (((crc << 24) >> 31) & Polynomial); + + crc = ((uint32_t)crc >> 8) ^ c; // convert to unsigned integer before right shift + } + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_4 +/// compute CRC32 (Slicing-by-4 algorithm) +uint32_t crc32_4bytes(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t 
crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // process four bytes at once (Slicing-by-4) + while (length >= 4) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + crc = Crc32Lookup[0][ one & 0xFF] ^ + Crc32Lookup[1][(one>> 8) & 0xFF] ^ + Crc32Lookup[2][(one>>16) & 0xFF] ^ + Crc32Lookup[3][(one>>24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + crc = Crc32Lookup[0][(one>>24) & 0xFF] ^ + Crc32Lookup[1][(one>>16) & 0xFF] ^ + Crc32Lookup[2][(one>> 8) & 0xFF] ^ + Crc32Lookup[3][ one & 0xFF]; +#endif + + length -= 4; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 3 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} +#endif + + +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 +/// compute CRC32 (Slicing-by-8 algorithm) +uint32_t crc32_8bytes(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // process eight bytes at once (Slicing-by-8) + while (length >= 8) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + uint32_t two = *current++; + crc = Crc32Lookup[0][ two & 0xFF] ^ + Crc32Lookup[1][(two>> 8) & 0xFF] ^ + Crc32Lookup[2][(two>>16) & 0xFF] ^ + Crc32Lookup[3][(two>>24) & 0xFF] ^ + Crc32Lookup[4][ one & 0xFF] ^ + Crc32Lookup[5][(one>> 8) & 0xFF] ^ + Crc32Lookup[6][(one>>16) & 0xFF] ^ + Crc32Lookup[7][(one>>24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + uint32_t two = *current++; + crc = Crc32Lookup[0][(two>>24) & 0xFF] ^ + Crc32Lookup[1][(two>>16) & 0xFF] ^ + Crc32Lookup[2][(two>> 8) & 0xFF] ^ + Crc32Lookup[3][ two & 0xFF] ^ + Crc32Lookup[4][(one>>24) & 0xFF] ^ + Crc32Lookup[5][(one>>16) & 0xFF] ^ + Crc32Lookup[6][(one>> 8) & 0xFF] ^ + 
Crc32Lookup[7][ one & 0xFF]; +#endif + + length -= 8; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 7 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +/// compute CRC32 (Slicing-by-8 algorithm), unroll inner loop 4 times +uint32_t crc32_4x8bytes(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // enabling optimization (at least -O2) automatically unrolls the inner for-loop + const size_t Unroll = 4; + const size_t BytesAtOnce = 8 * Unroll; + + // process 4x eight bytes at once (Slicing-by-8) + while (length >= BytesAtOnce) + { + for (size_t unrolling = 0; unrolling < Unroll; unrolling++) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + uint32_t two = *current++; + crc = Crc32Lookup[0][ two & 0xFF] ^ + Crc32Lookup[1][(two>> 8) & 0xFF] ^ + Crc32Lookup[2][(two>>16) & 0xFF] ^ + Crc32Lookup[3][(two>>24) & 0xFF] ^ + Crc32Lookup[4][ one & 0xFF] ^ + Crc32Lookup[5][(one>> 8) & 0xFF] ^ + Crc32Lookup[6][(one>>16) & 0xFF] ^ + Crc32Lookup[7][(one>>24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + uint32_t two = *current++; + crc = Crc32Lookup[0][(two>>24) & 0xFF] ^ + Crc32Lookup[1][(two>>16) & 0xFF] ^ + Crc32Lookup[2][(two>> 8) & 0xFF] ^ + Crc32Lookup[3][ two & 0xFF] ^ + Crc32Lookup[4][(one>>24) & 0xFF] ^ + Crc32Lookup[5][(one>>16) & 0xFF] ^ + Crc32Lookup[6][(one>> 8) & 0xFF] ^ + Crc32Lookup[7][ one & 0xFF]; +#endif + + } + + length -= BytesAtOnce; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 31 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} +#endif // CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 + + 
+#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +/// compute CRC32 (Slicing-by-16 algorithm) +uint32_t crc32_16bytes(const void* data, size_t length, uint32_t previousCrc32) +{ + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // enabling optimization (at least -O2) automatically unrolls the inner for-loop + const size_t Unroll = 4; + const size_t BytesAtOnce = 16 * Unroll; + + while (length >= BytesAtOnce) + { + for (size_t unrolling = 0; unrolling < Unroll; unrolling++) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + uint32_t two = *current++; + uint32_t three = *current++; + uint32_t four = *current++; + crc = Crc32Lookup[ 0][ four & 0xFF] ^ + Crc32Lookup[ 1][(four >> 8) & 0xFF] ^ + Crc32Lookup[ 2][(four >> 16) & 0xFF] ^ + Crc32Lookup[ 3][(four >> 24) & 0xFF] ^ + Crc32Lookup[ 4][ three & 0xFF] ^ + Crc32Lookup[ 5][(three >> 8) & 0xFF] ^ + Crc32Lookup[ 6][(three >> 16) & 0xFF] ^ + Crc32Lookup[ 7][(three >> 24) & 0xFF] ^ + Crc32Lookup[ 8][ two & 0xFF] ^ + Crc32Lookup[ 9][(two >> 8) & 0xFF] ^ + Crc32Lookup[10][(two >> 16) & 0xFF] ^ + Crc32Lookup[11][(two >> 24) & 0xFF] ^ + Crc32Lookup[12][ one & 0xFF] ^ + Crc32Lookup[13][(one >> 8) & 0xFF] ^ + Crc32Lookup[14][(one >> 16) & 0xFF] ^ + Crc32Lookup[15][(one >> 24) & 0xFF]; +#else + uint32_t one = *current++ ^ crc; + uint32_t two = *current++; + uint32_t three = *current++; + uint32_t four = *current++; + crc = Crc32Lookup[ 0][(four >> 24) & 0xFF] ^ + Crc32Lookup[ 1][(four >> 16) & 0xFF] ^ + Crc32Lookup[ 2][(four >> 8) & 0xFF] ^ + Crc32Lookup[ 3][ four & 0xFF] ^ + Crc32Lookup[ 4][(three >> 24) & 0xFF] ^ + Crc32Lookup[ 5][(three >> 16) & 0xFF] ^ + Crc32Lookup[ 6][(three >> 8) & 0xFF] ^ + Crc32Lookup[ 7][ three & 0xFF] ^ + Crc32Lookup[ 8][(two >> 24) & 0xFF] ^ + Crc32Lookup[ 9][(two >> 16) & 0xFF] ^ + Crc32Lookup[10][(two >> 8) & 0xFF] ^ + Crc32Lookup[11][ two & 0xFF] ^ + Crc32Lookup[12][(one >> 24) & 0xFF] ^ + 
Crc32Lookup[13][(one >> 16) & 0xFF] ^ + Crc32Lookup[14][(one >> 8) & 0xFF] ^ + Crc32Lookup[15][ one & 0xFF]; +#endif + } + + length -= BytesAtOnce; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 63 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} + + +/// compute CRC32 (Slicing-by-16 algorithm, prefetch upcoming data blocks) +uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previousCrc32, size_t prefetchAhead) +{ + // CRC code is identical to crc32_16bytes (including unrolling), only added prefetching + // 256 bytes look-ahead seems to be the sweet spot on Core i7 CPUs + + uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF + const uint32_t* current = (const uint32_t*) data; + + // enabling optimization (at least -O2) automatically unrolls the for-loop + const size_t Unroll = 4; + const size_t BytesAtOnce = 16 * Unroll; + + while (length >= BytesAtOnce + prefetchAhead) + { + PREFETCH(((const char*) current) + prefetchAhead); + + for (size_t unrolling = 0; unrolling < Unroll; unrolling++) + { +#if __BYTE_ORDER == __BIG_ENDIAN + uint32_t one = *current++ ^ swap(crc); + uint32_t two = *current++; + uint32_t three = *current++; + uint32_t four = *current++; + crc = Crc32Lookup[ 0][ four & 0xFF] ^ + Crc32Lookup[ 1][(four >> 8) & 0xFF] ^ + Crc32Lookup[ 2][(four >> 16) & 0xFF] ^ + Crc32Lookup[ 3][(four >> 24) & 0xFF] ^ + Crc32Lookup[ 4][ three & 0xFF] ^ + Crc32Lookup[ 5][(three >> 8) & 0xFF] ^ + Crc32Lookup[ 6][(three >> 16) & 0xFF] ^ + Crc32Lookup[ 7][(three >> 24) & 0xFF] ^ + Crc32Lookup[ 8][ two & 0xFF] ^ + Crc32Lookup[ 9][(two >> 8) & 0xFF] ^ + Crc32Lookup[10][(two >> 16) & 0xFF] ^ + Crc32Lookup[11][(two >> 24) & 0xFF] ^ + Crc32Lookup[12][ one & 0xFF] ^ + Crc32Lookup[13][(one >> 8) & 0xFF] ^ + Crc32Lookup[14][(one >> 16) & 0xFF] ^ + Crc32Lookup[15][(one >> 24) & 0xFF]; +#else 
+ uint32_t one = *current++ ^ crc; + uint32_t two = *current++; + uint32_t three = *current++; + uint32_t four = *current++; + crc = Crc32Lookup[ 0][(four >> 24) & 0xFF] ^ + Crc32Lookup[ 1][(four >> 16) & 0xFF] ^ + Crc32Lookup[ 2][(four >> 8) & 0xFF] ^ + Crc32Lookup[ 3][ four & 0xFF] ^ + Crc32Lookup[ 4][(three >> 24) & 0xFF] ^ + Crc32Lookup[ 5][(three >> 16) & 0xFF] ^ + Crc32Lookup[ 6][(three >> 8) & 0xFF] ^ + Crc32Lookup[ 7][ three & 0xFF] ^ + Crc32Lookup[ 8][(two >> 24) & 0xFF] ^ + Crc32Lookup[ 9][(two >> 16) & 0xFF] ^ + Crc32Lookup[10][(two >> 8) & 0xFF] ^ + Crc32Lookup[11][ two & 0xFF] ^ + Crc32Lookup[12][(one >> 24) & 0xFF] ^ + Crc32Lookup[13][(one >> 16) & 0xFF] ^ + Crc32Lookup[14][(one >> 8) & 0xFF] ^ + Crc32Lookup[15][ one & 0xFF]; +#endif + } + + length -= BytesAtOnce; + } + + const uint8_t* currentChar = (const uint8_t*) current; + // remaining 1 to 63 bytes (standard algorithm) + while (length-- != 0) + crc = (crc >> 8) ^ Crc32Lookup[0][(crc & 0xFF) ^ *currentChar++]; + + return ~crc; // same as crc ^ 0xFFFFFFFF +} +#endif + + +/// compute CRC32 using the fastest algorithm for large datasets on modern CPUs +uint32_t crc32_fast(const void* data, size_t length, uint32_t previousCrc32) +{ +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 + return crc32_16bytes (data, length, previousCrc32); +#elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) + return crc32_8bytes (data, length, previousCrc32); +#elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) + return crc32_4bytes (data, length, previousCrc32); +#elif defined(CRC32_USE_LOOKUP_TABLE_BYTE) + return crc32_1byte (data, length, previousCrc32); +#else + return crc32_halfbyte(data, length, previousCrc32); +#endif +} + + +/// merge two CRC32 such that result = crc32(dataB, lengthB, crc32(dataA, lengthA)) +uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) +{ + // based on Mark Adler's crc_combine from + // https://github.com/madler/pigz/blob/master/pigz.c + + // main idea: + // - if you have two 
equally-sized blocks A and B, + // then you can create a block C = A ^ B + // which has the property crc(C) = crc(A) ^ crc(B) + // - if you append length(B) zeros to A and call it A' (think of it as AAAA000) + // and prepend length(A) zeros to B and call it B' (think of it as 0000BBB) + // then exists a C' = A' ^ B' + // - remember: if you XOR someting with zero, it remains unchanged: X ^ 0 = X + // - that means C' = A concat B so that crc(A concat B) = crc(C') = crc(A') ^ crc(B') + // - the trick is to compute crc(A') based on crc(A) + // and crc(B') based on crc(B) + // - since B' starts with many zeros, the crc of those initial zeros is still zero + // - that means crc(B') = crc(B) + // - unfortunately the trailing zeros of A' change the crc, so usually crc(A') != crc(A) + // - the following code is a fast algorithm to compute crc(A') + // - starting with crc(A) and appending length(B) zeros, needing just log2(length(B)) iterations + // - the details are explained by the original author at + // https://stackoverflow.com/questions/23122312/crc-calculation-of-a-mostly-static-data-stream/23126768 + // + // notes: + // - I squeezed everything into one function to keep global namespace clean (original code two helper functions) + // - most original comments are still in place, I added comments where these helper functions where made inline code + // - performance-wise there isn't any differenze to the original zlib/pigz code + + // degenerated case + if (lengthB == 0) + return crcA; + + /// CRC32 => 32 bits + const uint32_t CrcBits = 32; + + uint32_t odd [CrcBits]; // odd-power-of-two zeros operator + uint32_t even[CrcBits]; // even-power-of-two zeros operator + + // put operator for one zero bit in odd + odd[0] = Polynomial; // CRC-32 polynomial + for (uint32_t i = 1; i < CrcBits; i++) + odd[i] = 1 << (i - 1); + + // put operator for two zero bits in even + // same as gf2_matrix_square(even, odd); + for (uint32_t i = 0; i < CrcBits; i++) + { + uint32_t vec = odd[i]; 
+ even[i] = 0; + for (int j = 0; vec != 0; j++, vec >>= 1) + if (vec & 1) + even[i] ^= odd[j]; + } + // put operator for four zero bits in odd + // same as gf2_matrix_square(odd, even); + for (uint32_t i = 0; i < CrcBits; i++) + { + uint32_t vec = even[i]; + odd[i] = 0; + for (int j = 0; vec != 0; j++, vec >>= 1) + if (vec & 1) + odd[i] ^= even[j]; + } + + // the following loop becomes much shorter if I keep swapping even and odd + uint32_t* a = even; + uint32_t* b = odd; + // apply secondLength zeros to firstCrc32 + for (; lengthB > 0; lengthB >>= 1) + { + // same as gf2_matrix_square(a, b); + for (uint32_t i = 0; i < CrcBits; i++) + { + uint32_t vec = b[i]; + a[i] = 0; + for (int j = 0; vec != 0; j++, vec >>= 1) + if (vec & 1) + a[i] ^= b[j]; + } + + // apply zeros operator for this bit + if (lengthB & 1) + { + // same as firstCrc32 = gf2_matrix_times(a, firstCrc32); + uint32_t sum = 0; + for (int i = 0; crcA != 0; i++, crcA >>= 1) + if (crcA & 1) + sum ^= a[i]; + crcA = sum; + } + + // switch even and odd + uint32_t* t = a; a = b; b = t; + } + + // return combined crc + return crcA ^ crcB; +} + + +// ////////////////////////////////////////////////////////// +// constants + + +#ifndef NO_LUT +/// look-up table, already declared above +const uint32_t Crc32Lookup[MaxSlice][256] = +{ + //// same algorithm as crc32_bitwise + //for (int i = 0; i <= 0xFF; i++) + //{ + // uint32_t crc = i; + // for (int j = 0; j < 8; j++) + // crc = (crc >> 1) ^ ((crc & 1) * Polynomial); + // Crc32Lookup[0][i] = crc; + //} + //// ... and the following slicing-by-8 algorithm (from Intel): + //// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf + //// http://sourceforge.net/projects/slicing-by-8/ + //for (int slice = 1; slice < MaxSlice; slice++) + // Crc32Lookup[slice][i] = (Crc32Lookup[slice - 1][i] >> 8) ^ Crc32Lookup[0][Crc32Lookup[slice - 1][i] & 0xFF]; + { + // note: the first number of every second row corresponds to the half-byte look-up table ! 
+ 0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,0x9E6495A3, + 0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,0xE7B82D07,0x90BF1D91, + 0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,0x6DDDE4EB,0xF4D4B551,0x83D385C7, + 0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5, + 0x3B6E20C8,0x4C69105E,0xD56041E4,0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B, + 0x35B5A8FA,0x42B2986C,0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59, + 0x26D930AC,0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F, + 0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,0xB6662D3D, + 0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,0x9FBFE4A5,0xE8B8D433, + 0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,0x086D3D2D,0x91646C97,0xE6635C01, + 0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457, + 0x65B0D9C6,0x12B7E950,0x8BBEB8EA,0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65, + 0x4DB26158,0x3AB551CE,0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB, + 0x4369E96A,0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9, + 0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,0xCE61E49F, + 0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,0xB7BD5C3B,0xC0BA6CAD, + 0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,0x9DD277AF,0x04DB2615,0x73DC1683, + 0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1, + 0xF00F9344,0x8708A3D2,0x1E01F268,0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7, + 0xFED41B76,0x89D32BE0,0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5, + 0xD6D6A3E8,0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B, + 
0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,0x4669BE79, + 0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,0x220216B9,0x5505262F, + 0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D, + 0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,0x9C0906A9,0xEB0E363F,0x72076785,0x05005713, + 0x95BF4A82,0xE2B87A14,0x7BB12BAE,0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21, + 0x86D3D2D4,0xF1D4E242,0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777, + 0x88085AE6,0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45, + 0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,0x3E6E77DB, + 0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,0x47B2CF7F,0x30B5FFE9, + 0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,0xCDD70693,0x54DE5729,0x23D967BF, + 0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D, + } + +#if defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16) + // beyond this point only relevant for Slicing-by-4, Slicing-by-8 and Slicing-by-16 + ,{ + 0x00000000,0x191B3141,0x32366282,0x2B2D53C3,0x646CC504,0x7D77F445,0x565AA786,0x4F4196C7, + 0xC8D98A08,0xD1C2BB49,0xFAEFE88A,0xE3F4D9CB,0xACB54F0C,0xB5AE7E4D,0x9E832D8E,0x87981CCF, + 0x4AC21251,0x53D92310,0x78F470D3,0x61EF4192,0x2EAED755,0x37B5E614,0x1C98B5D7,0x05838496, + 0x821B9859,0x9B00A918,0xB02DFADB,0xA936CB9A,0xE6775D5D,0xFF6C6C1C,0xD4413FDF,0xCD5A0E9E, + 0x958424A2,0x8C9F15E3,0xA7B24620,0xBEA97761,0xF1E8E1A6,0xE8F3D0E7,0xC3DE8324,0xDAC5B265, + 0x5D5DAEAA,0x44469FEB,0x6F6BCC28,0x7670FD69,0x39316BAE,0x202A5AEF,0x0B07092C,0x121C386D, + 0xDF4636F3,0xC65D07B2,0xED705471,0xF46B6530,0xBB2AF3F7,0xA231C2B6,0x891C9175,0x9007A034, + 0x179FBCFB,0x0E848DBA,0x25A9DE79,0x3CB2EF38,0x73F379FF,0x6AE848BE,0x41C51B7D,0x58DE2A3C, + 
0xF0794F05,0xE9627E44,0xC24F2D87,0xDB541CC6,0x94158A01,0x8D0EBB40,0xA623E883,0xBF38D9C2, + 0x38A0C50D,0x21BBF44C,0x0A96A78F,0x138D96CE,0x5CCC0009,0x45D73148,0x6EFA628B,0x77E153CA, + 0xBABB5D54,0xA3A06C15,0x888D3FD6,0x91960E97,0xDED79850,0xC7CCA911,0xECE1FAD2,0xF5FACB93, + 0x7262D75C,0x6B79E61D,0x4054B5DE,0x594F849F,0x160E1258,0x0F152319,0x243870DA,0x3D23419B, + 0x65FD6BA7,0x7CE65AE6,0x57CB0925,0x4ED03864,0x0191AEA3,0x188A9FE2,0x33A7CC21,0x2ABCFD60, + 0xAD24E1AF,0xB43FD0EE,0x9F12832D,0x8609B26C,0xC94824AB,0xD05315EA,0xFB7E4629,0xE2657768, + 0x2F3F79F6,0x362448B7,0x1D091B74,0x04122A35,0x4B53BCF2,0x52488DB3,0x7965DE70,0x607EEF31, + 0xE7E6F3FE,0xFEFDC2BF,0xD5D0917C,0xCCCBA03D,0x838A36FA,0x9A9107BB,0xB1BC5478,0xA8A76539, + 0x3B83984B,0x2298A90A,0x09B5FAC9,0x10AECB88,0x5FEF5D4F,0x46F46C0E,0x6DD93FCD,0x74C20E8C, + 0xF35A1243,0xEA412302,0xC16C70C1,0xD8774180,0x9736D747,0x8E2DE606,0xA500B5C5,0xBC1B8484, + 0x71418A1A,0x685ABB5B,0x4377E898,0x5A6CD9D9,0x152D4F1E,0x0C367E5F,0x271B2D9C,0x3E001CDD, + 0xB9980012,0xA0833153,0x8BAE6290,0x92B553D1,0xDDF4C516,0xC4EFF457,0xEFC2A794,0xF6D996D5, + 0xAE07BCE9,0xB71C8DA8,0x9C31DE6B,0x852AEF2A,0xCA6B79ED,0xD37048AC,0xF85D1B6F,0xE1462A2E, + 0x66DE36E1,0x7FC507A0,0x54E85463,0x4DF36522,0x02B2F3E5,0x1BA9C2A4,0x30849167,0x299FA026, + 0xE4C5AEB8,0xFDDE9FF9,0xD6F3CC3A,0xCFE8FD7B,0x80A96BBC,0x99B25AFD,0xB29F093E,0xAB84387F, + 0x2C1C24B0,0x350715F1,0x1E2A4632,0x07317773,0x4870E1B4,0x516BD0F5,0x7A468336,0x635DB277, + 0xCBFAD74E,0xD2E1E60F,0xF9CCB5CC,0xE0D7848D,0xAF96124A,0xB68D230B,0x9DA070C8,0x84BB4189, + 0x03235D46,0x1A386C07,0x31153FC4,0x280E0E85,0x674F9842,0x7E54A903,0x5579FAC0,0x4C62CB81, + 0x8138C51F,0x9823F45E,0xB30EA79D,0xAA1596DC,0xE554001B,0xFC4F315A,0xD7626299,0xCE7953D8, + 0x49E14F17,0x50FA7E56,0x7BD72D95,0x62CC1CD4,0x2D8D8A13,0x3496BB52,0x1FBBE891,0x06A0D9D0, + 0x5E7EF3EC,0x4765C2AD,0x6C48916E,0x7553A02F,0x3A1236E8,0x230907A9,0x0824546A,0x113F652B, + 0x96A779E4,0x8FBC48A5,0xA4911B66,0xBD8A2A27,0xF2CBBCE0,0xEBD08DA1,0xC0FDDE62,0xD9E6EF23, 
+ 0x14BCE1BD,0x0DA7D0FC,0x268A833F,0x3F91B27E,0x70D024B9,0x69CB15F8,0x42E6463B,0x5BFD777A, + 0xDC656BB5,0xC57E5AF4,0xEE530937,0xF7483876,0xB809AEB1,0xA1129FF0,0x8A3FCC33,0x9324FD72, + }, + + { + 0x00000000,0x01C26A37,0x0384D46E,0x0246BE59,0x0709A8DC,0x06CBC2EB,0x048D7CB2,0x054F1685, + 0x0E1351B8,0x0FD13B8F,0x0D9785D6,0x0C55EFE1,0x091AF964,0x08D89353,0x0A9E2D0A,0x0B5C473D, + 0x1C26A370,0x1DE4C947,0x1FA2771E,0x1E601D29,0x1B2F0BAC,0x1AED619B,0x18ABDFC2,0x1969B5F5, + 0x1235F2C8,0x13F798FF,0x11B126A6,0x10734C91,0x153C5A14,0x14FE3023,0x16B88E7A,0x177AE44D, + 0x384D46E0,0x398F2CD7,0x3BC9928E,0x3A0BF8B9,0x3F44EE3C,0x3E86840B,0x3CC03A52,0x3D025065, + 0x365E1758,0x379C7D6F,0x35DAC336,0x3418A901,0x3157BF84,0x3095D5B3,0x32D36BEA,0x331101DD, + 0x246BE590,0x25A98FA7,0x27EF31FE,0x262D5BC9,0x23624D4C,0x22A0277B,0x20E69922,0x2124F315, + 0x2A78B428,0x2BBADE1F,0x29FC6046,0x283E0A71,0x2D711CF4,0x2CB376C3,0x2EF5C89A,0x2F37A2AD, + 0x709A8DC0,0x7158E7F7,0x731E59AE,0x72DC3399,0x7793251C,0x76514F2B,0x7417F172,0x75D59B45, + 0x7E89DC78,0x7F4BB64F,0x7D0D0816,0x7CCF6221,0x798074A4,0x78421E93,0x7A04A0CA,0x7BC6CAFD, + 0x6CBC2EB0,0x6D7E4487,0x6F38FADE,0x6EFA90E9,0x6BB5866C,0x6A77EC5B,0x68315202,0x69F33835, + 0x62AF7F08,0x636D153F,0x612BAB66,0x60E9C151,0x65A6D7D4,0x6464BDE3,0x662203BA,0x67E0698D, + 0x48D7CB20,0x4915A117,0x4B531F4E,0x4A917579,0x4FDE63FC,0x4E1C09CB,0x4C5AB792,0x4D98DDA5, + 0x46C49A98,0x4706F0AF,0x45404EF6,0x448224C1,0x41CD3244,0x400F5873,0x4249E62A,0x438B8C1D, + 0x54F16850,0x55330267,0x5775BC3E,0x56B7D609,0x53F8C08C,0x523AAABB,0x507C14E2,0x51BE7ED5, + 0x5AE239E8,0x5B2053DF,0x5966ED86,0x58A487B1,0x5DEB9134,0x5C29FB03,0x5E6F455A,0x5FAD2F6D, + 0xE1351B80,0xE0F771B7,0xE2B1CFEE,0xE373A5D9,0xE63CB35C,0xE7FED96B,0xE5B86732,0xE47A0D05, + 0xEF264A38,0xEEE4200F,0xECA29E56,0xED60F461,0xE82FE2E4,0xE9ED88D3,0xEBAB368A,0xEA695CBD, + 0xFD13B8F0,0xFCD1D2C7,0xFE976C9E,0xFF5506A9,0xFA1A102C,0xFBD87A1B,0xF99EC442,0xF85CAE75, + 
0xF300E948,0xF2C2837F,0xF0843D26,0xF1465711,0xF4094194,0xF5CB2BA3,0xF78D95FA,0xF64FFFCD, + 0xD9785D60,0xD8BA3757,0xDAFC890E,0xDB3EE339,0xDE71F5BC,0xDFB39F8B,0xDDF521D2,0xDC374BE5, + 0xD76B0CD8,0xD6A966EF,0xD4EFD8B6,0xD52DB281,0xD062A404,0xD1A0CE33,0xD3E6706A,0xD2241A5D, + 0xC55EFE10,0xC49C9427,0xC6DA2A7E,0xC7184049,0xC25756CC,0xC3953CFB,0xC1D382A2,0xC011E895, + 0xCB4DAFA8,0xCA8FC59F,0xC8C97BC6,0xC90B11F1,0xCC440774,0xCD866D43,0xCFC0D31A,0xCE02B92D, + 0x91AF9640,0x906DFC77,0x922B422E,0x93E92819,0x96A63E9C,0x976454AB,0x9522EAF2,0x94E080C5, + 0x9FBCC7F8,0x9E7EADCF,0x9C381396,0x9DFA79A1,0x98B56F24,0x99770513,0x9B31BB4A,0x9AF3D17D, + 0x8D893530,0x8C4B5F07,0x8E0DE15E,0x8FCF8B69,0x8A809DEC,0x8B42F7DB,0x89044982,0x88C623B5, + 0x839A6488,0x82580EBF,0x801EB0E6,0x81DCDAD1,0x8493CC54,0x8551A663,0x8717183A,0x86D5720D, + 0xA9E2D0A0,0xA820BA97,0xAA6604CE,0xABA46EF9,0xAEEB787C,0xAF29124B,0xAD6FAC12,0xACADC625, + 0xA7F18118,0xA633EB2F,0xA4755576,0xA5B73F41,0xA0F829C4,0xA13A43F3,0xA37CFDAA,0xA2BE979D, + 0xB5C473D0,0xB40619E7,0xB640A7BE,0xB782CD89,0xB2CDDB0C,0xB30FB13B,0xB1490F62,0xB08B6555, + 0xBBD72268,0xBA15485F,0xB853F606,0xB9919C31,0xBCDE8AB4,0xBD1CE083,0xBF5A5EDA,0xBE9834ED, + }, + + { + 0x00000000,0xB8BC6765,0xAA09C88B,0x12B5AFEE,0x8F629757,0x37DEF032,0x256B5FDC,0x9DD738B9, + 0xC5B428EF,0x7D084F8A,0x6FBDE064,0xD7018701,0x4AD6BFB8,0xF26AD8DD,0xE0DF7733,0x58631056, + 0x5019579F,0xE8A530FA,0xFA109F14,0x42ACF871,0xDF7BC0C8,0x67C7A7AD,0x75720843,0xCDCE6F26, + 0x95AD7F70,0x2D111815,0x3FA4B7FB,0x8718D09E,0x1ACFE827,0xA2738F42,0xB0C620AC,0x087A47C9, + 0xA032AF3E,0x188EC85B,0x0A3B67B5,0xB28700D0,0x2F503869,0x97EC5F0C,0x8559F0E2,0x3DE59787, + 0x658687D1,0xDD3AE0B4,0xCF8F4F5A,0x7733283F,0xEAE41086,0x525877E3,0x40EDD80D,0xF851BF68, + 0xF02BF8A1,0x48979FC4,0x5A22302A,0xE29E574F,0x7F496FF6,0xC7F50893,0xD540A77D,0x6DFCC018, + 0x359FD04E,0x8D23B72B,0x9F9618C5,0x272A7FA0,0xBAFD4719,0x0241207C,0x10F48F92,0xA848E8F7, + 
0x9B14583D,0x23A83F58,0x311D90B6,0x89A1F7D3,0x1476CF6A,0xACCAA80F,0xBE7F07E1,0x06C36084, + 0x5EA070D2,0xE61C17B7,0xF4A9B859,0x4C15DF3C,0xD1C2E785,0x697E80E0,0x7BCB2F0E,0xC377486B, + 0xCB0D0FA2,0x73B168C7,0x6104C729,0xD9B8A04C,0x446F98F5,0xFCD3FF90,0xEE66507E,0x56DA371B, + 0x0EB9274D,0xB6054028,0xA4B0EFC6,0x1C0C88A3,0x81DBB01A,0x3967D77F,0x2BD27891,0x936E1FF4, + 0x3B26F703,0x839A9066,0x912F3F88,0x299358ED,0xB4446054,0x0CF80731,0x1E4DA8DF,0xA6F1CFBA, + 0xFE92DFEC,0x462EB889,0x549B1767,0xEC277002,0x71F048BB,0xC94C2FDE,0xDBF98030,0x6345E755, + 0x6B3FA09C,0xD383C7F9,0xC1366817,0x798A0F72,0xE45D37CB,0x5CE150AE,0x4E54FF40,0xF6E89825, + 0xAE8B8873,0x1637EF16,0x048240F8,0xBC3E279D,0x21E91F24,0x99557841,0x8BE0D7AF,0x335CB0CA, + 0xED59B63B,0x55E5D15E,0x47507EB0,0xFFEC19D5,0x623B216C,0xDA874609,0xC832E9E7,0x708E8E82, + 0x28ED9ED4,0x9051F9B1,0x82E4565F,0x3A58313A,0xA78F0983,0x1F336EE6,0x0D86C108,0xB53AA66D, + 0xBD40E1A4,0x05FC86C1,0x1749292F,0xAFF54E4A,0x322276F3,0x8A9E1196,0x982BBE78,0x2097D91D, + 0x78F4C94B,0xC048AE2E,0xD2FD01C0,0x6A4166A5,0xF7965E1C,0x4F2A3979,0x5D9F9697,0xE523F1F2, + 0x4D6B1905,0xF5D77E60,0xE762D18E,0x5FDEB6EB,0xC2098E52,0x7AB5E937,0x680046D9,0xD0BC21BC, + 0x88DF31EA,0x3063568F,0x22D6F961,0x9A6A9E04,0x07BDA6BD,0xBF01C1D8,0xADB46E36,0x15080953, + 0x1D724E9A,0xA5CE29FF,0xB77B8611,0x0FC7E174,0x9210D9CD,0x2AACBEA8,0x38191146,0x80A57623, + 0xD8C66675,0x607A0110,0x72CFAEFE,0xCA73C99B,0x57A4F122,0xEF189647,0xFDAD39A9,0x45115ECC, + 0x764DEE06,0xCEF18963,0xDC44268D,0x64F841E8,0xF92F7951,0x41931E34,0x5326B1DA,0xEB9AD6BF, + 0xB3F9C6E9,0x0B45A18C,0x19F00E62,0xA14C6907,0x3C9B51BE,0x842736DB,0x96929935,0x2E2EFE50, + 0x2654B999,0x9EE8DEFC,0x8C5D7112,0x34E11677,0xA9362ECE,0x118A49AB,0x033FE645,0xBB838120, + 0xE3E09176,0x5B5CF613,0x49E959FD,0xF1553E98,0x6C820621,0xD43E6144,0xC68BCEAA,0x7E37A9CF, + 0xD67F4138,0x6EC3265D,0x7C7689B3,0xC4CAEED6,0x591DD66F,0xE1A1B10A,0xF3141EE4,0x4BA87981, + 0x13CB69D7,0xAB770EB2,0xB9C2A15C,0x017EC639,0x9CA9FE80,0x241599E5,0x36A0360B,0x8E1C516E, 
+ 0x866616A7,0x3EDA71C2,0x2C6FDE2C,0x94D3B949,0x090481F0,0xB1B8E695,0xA30D497B,0x1BB12E1E, + 0x43D23E48,0xFB6E592D,0xE9DBF6C3,0x516791A6,0xCCB0A91F,0x740CCE7A,0x66B96194,0xDE0506F1, + } +#endif // defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16) +#if defined (CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) || defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_16) + // beyond this point only relevant for Slicing-by-8 and Slicing-by-16 + ,{ + 0x00000000,0x3D6029B0,0x7AC05360,0x47A07AD0,0xF580A6C0,0xC8E08F70,0x8F40F5A0,0xB220DC10, + 0x30704BC1,0x0D106271,0x4AB018A1,0x77D03111,0xC5F0ED01,0xF890C4B1,0xBF30BE61,0x825097D1, + 0x60E09782,0x5D80BE32,0x1A20C4E2,0x2740ED52,0x95603142,0xA80018F2,0xEFA06222,0xD2C04B92, + 0x5090DC43,0x6DF0F5F3,0x2A508F23,0x1730A693,0xA5107A83,0x98705333,0xDFD029E3,0xE2B00053, + 0xC1C12F04,0xFCA106B4,0xBB017C64,0x866155D4,0x344189C4,0x0921A074,0x4E81DAA4,0x73E1F314, + 0xF1B164C5,0xCCD14D75,0x8B7137A5,0xB6111E15,0x0431C205,0x3951EBB5,0x7EF19165,0x4391B8D5, + 0xA121B886,0x9C419136,0xDBE1EBE6,0xE681C256,0x54A11E46,0x69C137F6,0x2E614D26,0x13016496, + 0x9151F347,0xAC31DAF7,0xEB91A027,0xD6F18997,0x64D15587,0x59B17C37,0x1E1106E7,0x23712F57, + 0x58F35849,0x659371F9,0x22330B29,0x1F532299,0xAD73FE89,0x9013D739,0xD7B3ADE9,0xEAD38459, + 0x68831388,0x55E33A38,0x124340E8,0x2F236958,0x9D03B548,0xA0639CF8,0xE7C3E628,0xDAA3CF98, + 0x3813CFCB,0x0573E67B,0x42D39CAB,0x7FB3B51B,0xCD93690B,0xF0F340BB,0xB7533A6B,0x8A3313DB, + 0x0863840A,0x3503ADBA,0x72A3D76A,0x4FC3FEDA,0xFDE322CA,0xC0830B7A,0x872371AA,0xBA43581A, + 0x9932774D,0xA4525EFD,0xE3F2242D,0xDE920D9D,0x6CB2D18D,0x51D2F83D,0x167282ED,0x2B12AB5D, + 0xA9423C8C,0x9422153C,0xD3826FEC,0xEEE2465C,0x5CC29A4C,0x61A2B3FC,0x2602C92C,0x1B62E09C, + 0xF9D2E0CF,0xC4B2C97F,0x8312B3AF,0xBE729A1F,0x0C52460F,0x31326FBF,0x7692156F,0x4BF23CDF, + 0xC9A2AB0E,0xF4C282BE,0xB362F86E,0x8E02D1DE,0x3C220DCE,0x0142247E,0x46E25EAE,0x7B82771E, + 
0xB1E6B092,0x8C869922,0xCB26E3F2,0xF646CA42,0x44661652,0x79063FE2,0x3EA64532,0x03C66C82, + 0x8196FB53,0xBCF6D2E3,0xFB56A833,0xC6368183,0x74165D93,0x49767423,0x0ED60EF3,0x33B62743, + 0xD1062710,0xEC660EA0,0xABC67470,0x96A65DC0,0x248681D0,0x19E6A860,0x5E46D2B0,0x6326FB00, + 0xE1766CD1,0xDC164561,0x9BB63FB1,0xA6D61601,0x14F6CA11,0x2996E3A1,0x6E369971,0x5356B0C1, + 0x70279F96,0x4D47B626,0x0AE7CCF6,0x3787E546,0x85A73956,0xB8C710E6,0xFF676A36,0xC2074386, + 0x4057D457,0x7D37FDE7,0x3A978737,0x07F7AE87,0xB5D77297,0x88B75B27,0xCF1721F7,0xF2770847, + 0x10C70814,0x2DA721A4,0x6A075B74,0x576772C4,0xE547AED4,0xD8278764,0x9F87FDB4,0xA2E7D404, + 0x20B743D5,0x1DD76A65,0x5A7710B5,0x67173905,0xD537E515,0xE857CCA5,0xAFF7B675,0x92979FC5, + 0xE915E8DB,0xD475C16B,0x93D5BBBB,0xAEB5920B,0x1C954E1B,0x21F567AB,0x66551D7B,0x5B3534CB, + 0xD965A31A,0xE4058AAA,0xA3A5F07A,0x9EC5D9CA,0x2CE505DA,0x11852C6A,0x562556BA,0x6B457F0A, + 0x89F57F59,0xB49556E9,0xF3352C39,0xCE550589,0x7C75D999,0x4115F029,0x06B58AF9,0x3BD5A349, + 0xB9853498,0x84E51D28,0xC34567F8,0xFE254E48,0x4C059258,0x7165BBE8,0x36C5C138,0x0BA5E888, + 0x28D4C7DF,0x15B4EE6F,0x521494BF,0x6F74BD0F,0xDD54611F,0xE03448AF,0xA794327F,0x9AF41BCF, + 0x18A48C1E,0x25C4A5AE,0x6264DF7E,0x5F04F6CE,0xED242ADE,0xD044036E,0x97E479BE,0xAA84500E, + 0x4834505D,0x755479ED,0x32F4033D,0x0F942A8D,0xBDB4F69D,0x80D4DF2D,0xC774A5FD,0xFA148C4D, + 0x78441B9C,0x4524322C,0x028448FC,0x3FE4614C,0x8DC4BD5C,0xB0A494EC,0xF704EE3C,0xCA64C78C, + }, + + { + 0x00000000,0xCB5CD3A5,0x4DC8A10B,0x869472AE,0x9B914216,0x50CD91B3,0xD659E31D,0x1D0530B8, + 0xEC53826D,0x270F51C8,0xA19B2366,0x6AC7F0C3,0x77C2C07B,0xBC9E13DE,0x3A0A6170,0xF156B2D5, + 0x03D6029B,0xC88AD13E,0x4E1EA390,0x85427035,0x9847408D,0x531B9328,0xD58FE186,0x1ED33223, + 0xEF8580F6,0x24D95353,0xA24D21FD,0x6911F258,0x7414C2E0,0xBF481145,0x39DC63EB,0xF280B04E, + 0x07AC0536,0xCCF0D693,0x4A64A43D,0x81387798,0x9C3D4720,0x57619485,0xD1F5E62B,0x1AA9358E, + 
0xEBFF875B,0x20A354FE,0xA6372650,0x6D6BF5F5,0x706EC54D,0xBB3216E8,0x3DA66446,0xF6FAB7E3, + 0x047A07AD,0xCF26D408,0x49B2A6A6,0x82EE7503,0x9FEB45BB,0x54B7961E,0xD223E4B0,0x197F3715, + 0xE82985C0,0x23755665,0xA5E124CB,0x6EBDF76E,0x73B8C7D6,0xB8E41473,0x3E7066DD,0xF52CB578, + 0x0F580A6C,0xC404D9C9,0x4290AB67,0x89CC78C2,0x94C9487A,0x5F959BDF,0xD901E971,0x125D3AD4, + 0xE30B8801,0x28575BA4,0xAEC3290A,0x659FFAAF,0x789ACA17,0xB3C619B2,0x35526B1C,0xFE0EB8B9, + 0x0C8E08F7,0xC7D2DB52,0x4146A9FC,0x8A1A7A59,0x971F4AE1,0x5C439944,0xDAD7EBEA,0x118B384F, + 0xE0DD8A9A,0x2B81593F,0xAD152B91,0x6649F834,0x7B4CC88C,0xB0101B29,0x36846987,0xFDD8BA22, + 0x08F40F5A,0xC3A8DCFF,0x453CAE51,0x8E607DF4,0x93654D4C,0x58399EE9,0xDEADEC47,0x15F13FE2, + 0xE4A78D37,0x2FFB5E92,0xA96F2C3C,0x6233FF99,0x7F36CF21,0xB46A1C84,0x32FE6E2A,0xF9A2BD8F, + 0x0B220DC1,0xC07EDE64,0x46EAACCA,0x8DB67F6F,0x90B34FD7,0x5BEF9C72,0xDD7BEEDC,0x16273D79, + 0xE7718FAC,0x2C2D5C09,0xAAB92EA7,0x61E5FD02,0x7CE0CDBA,0xB7BC1E1F,0x31286CB1,0xFA74BF14, + 0x1EB014D8,0xD5ECC77D,0x5378B5D3,0x98246676,0x852156CE,0x4E7D856B,0xC8E9F7C5,0x03B52460, + 0xF2E396B5,0x39BF4510,0xBF2B37BE,0x7477E41B,0x6972D4A3,0xA22E0706,0x24BA75A8,0xEFE6A60D, + 0x1D661643,0xD63AC5E6,0x50AEB748,0x9BF264ED,0x86F75455,0x4DAB87F0,0xCB3FF55E,0x006326FB, + 0xF135942E,0x3A69478B,0xBCFD3525,0x77A1E680,0x6AA4D638,0xA1F8059D,0x276C7733,0xEC30A496, + 0x191C11EE,0xD240C24B,0x54D4B0E5,0x9F886340,0x828D53F8,0x49D1805D,0xCF45F2F3,0x04192156, + 0xF54F9383,0x3E134026,0xB8873288,0x73DBE12D,0x6EDED195,0xA5820230,0x2316709E,0xE84AA33B, + 0x1ACA1375,0xD196C0D0,0x5702B27E,0x9C5E61DB,0x815B5163,0x4A0782C6,0xCC93F068,0x07CF23CD, + 0xF6999118,0x3DC542BD,0xBB513013,0x700DE3B6,0x6D08D30E,0xA65400AB,0x20C07205,0xEB9CA1A0, + 0x11E81EB4,0xDAB4CD11,0x5C20BFBF,0x977C6C1A,0x8A795CA2,0x41258F07,0xC7B1FDA9,0x0CED2E0C, + 0xFDBB9CD9,0x36E74F7C,0xB0733DD2,0x7B2FEE77,0x662ADECF,0xAD760D6A,0x2BE27FC4,0xE0BEAC61, + 0x123E1C2F,0xD962CF8A,0x5FF6BD24,0x94AA6E81,0x89AF5E39,0x42F38D9C,0xC467FF32,0x0F3B2C97, 
+ 0xFE6D9E42,0x35314DE7,0xB3A53F49,0x78F9ECEC,0x65FCDC54,0xAEA00FF1,0x28347D5F,0xE368AEFA, + 0x16441B82,0xDD18C827,0x5B8CBA89,0x90D0692C,0x8DD55994,0x46898A31,0xC01DF89F,0x0B412B3A, + 0xFA1799EF,0x314B4A4A,0xB7DF38E4,0x7C83EB41,0x6186DBF9,0xAADA085C,0x2C4E7AF2,0xE712A957, + 0x15921919,0xDECECABC,0x585AB812,0x93066BB7,0x8E035B0F,0x455F88AA,0xC3CBFA04,0x089729A1, + 0xF9C19B74,0x329D48D1,0xB4093A7F,0x7F55E9DA,0x6250D962,0xA90C0AC7,0x2F987869,0xE4C4ABCC, + }, + + { + 0x00000000,0xA6770BB4,0x979F1129,0x31E81A9D,0xF44F2413,0x52382FA7,0x63D0353A,0xC5A73E8E, + 0x33EF4E67,0x959845D3,0xA4705F4E,0x020754FA,0xC7A06A74,0x61D761C0,0x503F7B5D,0xF64870E9, + 0x67DE9CCE,0xC1A9977A,0xF0418DE7,0x56368653,0x9391B8DD,0x35E6B369,0x040EA9F4,0xA279A240, + 0x5431D2A9,0xF246D91D,0xC3AEC380,0x65D9C834,0xA07EF6BA,0x0609FD0E,0x37E1E793,0x9196EC27, + 0xCFBD399C,0x69CA3228,0x582228B5,0xFE552301,0x3BF21D8F,0x9D85163B,0xAC6D0CA6,0x0A1A0712, + 0xFC5277FB,0x5A257C4F,0x6BCD66D2,0xCDBA6D66,0x081D53E8,0xAE6A585C,0x9F8242C1,0x39F54975, + 0xA863A552,0x0E14AEE6,0x3FFCB47B,0x998BBFCF,0x5C2C8141,0xFA5B8AF5,0xCBB39068,0x6DC49BDC, + 0x9B8CEB35,0x3DFBE081,0x0C13FA1C,0xAA64F1A8,0x6FC3CF26,0xC9B4C492,0xF85CDE0F,0x5E2BD5BB, + 0x440B7579,0xE27C7ECD,0xD3946450,0x75E36FE4,0xB044516A,0x16335ADE,0x27DB4043,0x81AC4BF7, + 0x77E43B1E,0xD19330AA,0xE07B2A37,0x460C2183,0x83AB1F0D,0x25DC14B9,0x14340E24,0xB2430590, + 0x23D5E9B7,0x85A2E203,0xB44AF89E,0x123DF32A,0xD79ACDA4,0x71EDC610,0x4005DC8D,0xE672D739, + 0x103AA7D0,0xB64DAC64,0x87A5B6F9,0x21D2BD4D,0xE47583C3,0x42028877,0x73EA92EA,0xD59D995E, + 0x8BB64CE5,0x2DC14751,0x1C295DCC,0xBA5E5678,0x7FF968F6,0xD98E6342,0xE86679DF,0x4E11726B, + 0xB8590282,0x1E2E0936,0x2FC613AB,0x89B1181F,0x4C162691,0xEA612D25,0xDB8937B8,0x7DFE3C0C, + 0xEC68D02B,0x4A1FDB9F,0x7BF7C102,0xDD80CAB6,0x1827F438,0xBE50FF8C,0x8FB8E511,0x29CFEEA5, + 0xDF879E4C,0x79F095F8,0x48188F65,0xEE6F84D1,0x2BC8BA5F,0x8DBFB1EB,0xBC57AB76,0x1A20A0C2, + 
0x8816EAF2,0x2E61E146,0x1F89FBDB,0xB9FEF06F,0x7C59CEE1,0xDA2EC555,0xEBC6DFC8,0x4DB1D47C, + 0xBBF9A495,0x1D8EAF21,0x2C66B5BC,0x8A11BE08,0x4FB68086,0xE9C18B32,0xD82991AF,0x7E5E9A1B, + 0xEFC8763C,0x49BF7D88,0x78576715,0xDE206CA1,0x1B87522F,0xBDF0599B,0x8C184306,0x2A6F48B2, + 0xDC27385B,0x7A5033EF,0x4BB82972,0xEDCF22C6,0x28681C48,0x8E1F17FC,0xBFF70D61,0x198006D5, + 0x47ABD36E,0xE1DCD8DA,0xD034C247,0x7643C9F3,0xB3E4F77D,0x1593FCC9,0x247BE654,0x820CEDE0, + 0x74449D09,0xD23396BD,0xE3DB8C20,0x45AC8794,0x800BB91A,0x267CB2AE,0x1794A833,0xB1E3A387, + 0x20754FA0,0x86024414,0xB7EA5E89,0x119D553D,0xD43A6BB3,0x724D6007,0x43A57A9A,0xE5D2712E, + 0x139A01C7,0xB5ED0A73,0x840510EE,0x22721B5A,0xE7D525D4,0x41A22E60,0x704A34FD,0xD63D3F49, + 0xCC1D9F8B,0x6A6A943F,0x5B828EA2,0xFDF58516,0x3852BB98,0x9E25B02C,0xAFCDAAB1,0x09BAA105, + 0xFFF2D1EC,0x5985DA58,0x686DC0C5,0xCE1ACB71,0x0BBDF5FF,0xADCAFE4B,0x9C22E4D6,0x3A55EF62, + 0xABC30345,0x0DB408F1,0x3C5C126C,0x9A2B19D8,0x5F8C2756,0xF9FB2CE2,0xC813367F,0x6E643DCB, + 0x982C4D22,0x3E5B4696,0x0FB35C0B,0xA9C457BF,0x6C636931,0xCA146285,0xFBFC7818,0x5D8B73AC, + 0x03A0A617,0xA5D7ADA3,0x943FB73E,0x3248BC8A,0xF7EF8204,0x519889B0,0x6070932D,0xC6079899, + 0x304FE870,0x9638E3C4,0xA7D0F959,0x01A7F2ED,0xC400CC63,0x6277C7D7,0x539FDD4A,0xF5E8D6FE, + 0x647E3AD9,0xC209316D,0xF3E12BF0,0x55962044,0x90311ECA,0x3646157E,0x07AE0FE3,0xA1D90457, + 0x579174BE,0xF1E67F0A,0xC00E6597,0x66796E23,0xA3DE50AD,0x05A95B19,0x34414184,0x92364A30, + }, + + { + 0x00000000,0xCCAA009E,0x4225077D,0x8E8F07E3,0x844A0EFA,0x48E00E64,0xC66F0987,0x0AC50919, + 0xD3E51BB5,0x1F4F1B2B,0x91C01CC8,0x5D6A1C56,0x57AF154F,0x9B0515D1,0x158A1232,0xD92012AC, + 0x7CBB312B,0xB01131B5,0x3E9E3656,0xF23436C8,0xF8F13FD1,0x345B3F4F,0xBAD438AC,0x767E3832, + 0xAF5E2A9E,0x63F42A00,0xED7B2DE3,0x21D12D7D,0x2B142464,0xE7BE24FA,0x69312319,0xA59B2387, + 0xF9766256,0x35DC62C8,0xBB53652B,0x77F965B5,0x7D3C6CAC,0xB1966C32,0x3F196BD1,0xF3B36B4F, + 
0x2A9379E3,0xE639797D,0x68B67E9E,0xA41C7E00,0xAED97719,0x62737787,0xECFC7064,0x205670FA, + 0x85CD537D,0x496753E3,0xC7E85400,0x0B42549E,0x01875D87,0xCD2D5D19,0x43A25AFA,0x8F085A64, + 0x562848C8,0x9A824856,0x140D4FB5,0xD8A74F2B,0xD2624632,0x1EC846AC,0x9047414F,0x5CED41D1, + 0x299DC2ED,0xE537C273,0x6BB8C590,0xA712C50E,0xADD7CC17,0x617DCC89,0xEFF2CB6A,0x2358CBF4, + 0xFA78D958,0x36D2D9C6,0xB85DDE25,0x74F7DEBB,0x7E32D7A2,0xB298D73C,0x3C17D0DF,0xF0BDD041, + 0x5526F3C6,0x998CF358,0x1703F4BB,0xDBA9F425,0xD16CFD3C,0x1DC6FDA2,0x9349FA41,0x5FE3FADF, + 0x86C3E873,0x4A69E8ED,0xC4E6EF0E,0x084CEF90,0x0289E689,0xCE23E617,0x40ACE1F4,0x8C06E16A, + 0xD0EBA0BB,0x1C41A025,0x92CEA7C6,0x5E64A758,0x54A1AE41,0x980BAEDF,0x1684A93C,0xDA2EA9A2, + 0x030EBB0E,0xCFA4BB90,0x412BBC73,0x8D81BCED,0x8744B5F4,0x4BEEB56A,0xC561B289,0x09CBB217, + 0xAC509190,0x60FA910E,0xEE7596ED,0x22DF9673,0x281A9F6A,0xE4B09FF4,0x6A3F9817,0xA6959889, + 0x7FB58A25,0xB31F8ABB,0x3D908D58,0xF13A8DC6,0xFBFF84DF,0x37558441,0xB9DA83A2,0x7570833C, + 0x533B85DA,0x9F918544,0x111E82A7,0xDDB48239,0xD7718B20,0x1BDB8BBE,0x95548C5D,0x59FE8CC3, + 0x80DE9E6F,0x4C749EF1,0xC2FB9912,0x0E51998C,0x04949095,0xC83E900B,0x46B197E8,0x8A1B9776, + 0x2F80B4F1,0xE32AB46F,0x6DA5B38C,0xA10FB312,0xABCABA0B,0x6760BA95,0xE9EFBD76,0x2545BDE8, + 0xFC65AF44,0x30CFAFDA,0xBE40A839,0x72EAA8A7,0x782FA1BE,0xB485A120,0x3A0AA6C3,0xF6A0A65D, + 0xAA4DE78C,0x66E7E712,0xE868E0F1,0x24C2E06F,0x2E07E976,0xE2ADE9E8,0x6C22EE0B,0xA088EE95, + 0x79A8FC39,0xB502FCA7,0x3B8DFB44,0xF727FBDA,0xFDE2F2C3,0x3148F25D,0xBFC7F5BE,0x736DF520, + 0xD6F6D6A7,0x1A5CD639,0x94D3D1DA,0x5879D144,0x52BCD85D,0x9E16D8C3,0x1099DF20,0xDC33DFBE, + 0x0513CD12,0xC9B9CD8C,0x4736CA6F,0x8B9CCAF1,0x8159C3E8,0x4DF3C376,0xC37CC495,0x0FD6C40B, + 0x7AA64737,0xB60C47A9,0x3883404A,0xF42940D4,0xFEEC49CD,0x32464953,0xBCC94EB0,0x70634E2E, + 0xA9435C82,0x65E95C1C,0xEB665BFF,0x27CC5B61,0x2D095278,0xE1A352E6,0x6F2C5505,0xA386559B, + 0x061D761C,0xCAB77682,0x44387161,0x889271FF,0x825778E6,0x4EFD7878,0xC0727F9B,0x0CD87F05, 
+ 0xD5F86DA9,0x19526D37,0x97DD6AD4,0x5B776A4A,0x51B26353,0x9D1863CD,0x1397642E,0xDF3D64B0, + 0x83D02561,0x4F7A25FF,0xC1F5221C,0x0D5F2282,0x079A2B9B,0xCB302B05,0x45BF2CE6,0x89152C78, + 0x50353ED4,0x9C9F3E4A,0x121039A9,0xDEBA3937,0xD47F302E,0x18D530B0,0x965A3753,0x5AF037CD, + 0xFF6B144A,0x33C114D4,0xBD4E1337,0x71E413A9,0x7B211AB0,0xB78B1A2E,0x39041DCD,0xF5AE1D53, + 0x2C8E0FFF,0xE0240F61,0x6EAB0882,0xA201081C,0xA8C40105,0x646E019B,0xEAE10678,0x264B06E6, + } +#endif // CRC32_USE_LOOKUP_TABLE_SLICING_BY_8 || CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +#ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 + // beyond this point only relevant for Slicing-by-16 + ,{ + 0x00000000,0x177B1443,0x2EF62886,0x398D3CC5,0x5DEC510C,0x4A97454F,0x731A798A,0x64616DC9, + 0xBBD8A218,0xACA3B65B,0x952E8A9E,0x82559EDD,0xE634F314,0xF14FE757,0xC8C2DB92,0xDFB9CFD1, + 0xACC04271,0xBBBB5632,0x82366AF7,0x954D7EB4,0xF12C137D,0xE657073E,0xDFDA3BFB,0xC8A12FB8, + 0x1718E069,0x0063F42A,0x39EEC8EF,0x2E95DCAC,0x4AF4B165,0x5D8FA526,0x640299E3,0x73798DA0, + 0x82F182A3,0x958A96E0,0xAC07AA25,0xBB7CBE66,0xDF1DD3AF,0xC866C7EC,0xF1EBFB29,0xE690EF6A, + 0x392920BB,0x2E5234F8,0x17DF083D,0x00A41C7E,0x64C571B7,0x73BE65F4,0x4A335931,0x5D484D72, + 0x2E31C0D2,0x394AD491,0x00C7E854,0x17BCFC17,0x73DD91DE,0x64A6859D,0x5D2BB958,0x4A50AD1B, + 0x95E962CA,0x82927689,0xBB1F4A4C,0xAC645E0F,0xC80533C6,0xDF7E2785,0xE6F31B40,0xF1880F03, + 0xDE920307,0xC9E91744,0xF0642B81,0xE71F3FC2,0x837E520B,0x94054648,0xAD887A8D,0xBAF36ECE, + 0x654AA11F,0x7231B55C,0x4BBC8999,0x5CC79DDA,0x38A6F013,0x2FDDE450,0x1650D895,0x012BCCD6, + 0x72524176,0x65295535,0x5CA469F0,0x4BDF7DB3,0x2FBE107A,0x38C50439,0x014838FC,0x16332CBF, + 0xC98AE36E,0xDEF1F72D,0xE77CCBE8,0xF007DFAB,0x9466B262,0x831DA621,0xBA909AE4,0xADEB8EA7, + 0x5C6381A4,0x4B1895E7,0x7295A922,0x65EEBD61,0x018FD0A8,0x16F4C4EB,0x2F79F82E,0x3802EC6D, + 0xE7BB23BC,0xF0C037FF,0xC94D0B3A,0xDE361F79,0xBA5772B0,0xAD2C66F3,0x94A15A36,0x83DA4E75, + 
0xF0A3C3D5,0xE7D8D796,0xDE55EB53,0xC92EFF10,0xAD4F92D9,0xBA34869A,0x83B9BA5F,0x94C2AE1C, + 0x4B7B61CD,0x5C00758E,0x658D494B,0x72F65D08,0x169730C1,0x01EC2482,0x38611847,0x2F1A0C04, + 0x6655004F,0x712E140C,0x48A328C9,0x5FD83C8A,0x3BB95143,0x2CC24500,0x154F79C5,0x02346D86, + 0xDD8DA257,0xCAF6B614,0xF37B8AD1,0xE4009E92,0x8061F35B,0x971AE718,0xAE97DBDD,0xB9ECCF9E, + 0xCA95423E,0xDDEE567D,0xE4636AB8,0xF3187EFB,0x97791332,0x80020771,0xB98F3BB4,0xAEF42FF7, + 0x714DE026,0x6636F465,0x5FBBC8A0,0x48C0DCE3,0x2CA1B12A,0x3BDAA569,0x025799AC,0x152C8DEF, + 0xE4A482EC,0xF3DF96AF,0xCA52AA6A,0xDD29BE29,0xB948D3E0,0xAE33C7A3,0x97BEFB66,0x80C5EF25, + 0x5F7C20F4,0x480734B7,0x718A0872,0x66F11C31,0x029071F8,0x15EB65BB,0x2C66597E,0x3B1D4D3D, + 0x4864C09D,0x5F1FD4DE,0x6692E81B,0x71E9FC58,0x15889191,0x02F385D2,0x3B7EB917,0x2C05AD54, + 0xF3BC6285,0xE4C776C6,0xDD4A4A03,0xCA315E40,0xAE503389,0xB92B27CA,0x80A61B0F,0x97DD0F4C, + 0xB8C70348,0xAFBC170B,0x96312BCE,0x814A3F8D,0xE52B5244,0xF2504607,0xCBDD7AC2,0xDCA66E81, + 0x031FA150,0x1464B513,0x2DE989D6,0x3A929D95,0x5EF3F05C,0x4988E41F,0x7005D8DA,0x677ECC99, + 0x14074139,0x037C557A,0x3AF169BF,0x2D8A7DFC,0x49EB1035,0x5E900476,0x671D38B3,0x70662CF0, + 0xAFDFE321,0xB8A4F762,0x8129CBA7,0x9652DFE4,0xF233B22D,0xE548A66E,0xDCC59AAB,0xCBBE8EE8, + 0x3A3681EB,0x2D4D95A8,0x14C0A96D,0x03BBBD2E,0x67DAD0E7,0x70A1C4A4,0x492CF861,0x5E57EC22, + 0x81EE23F3,0x969537B0,0xAF180B75,0xB8631F36,0xDC0272FF,0xCB7966BC,0xF2F45A79,0xE58F4E3A, + 0x96F6C39A,0x818DD7D9,0xB800EB1C,0xAF7BFF5F,0xCB1A9296,0xDC6186D5,0xE5ECBA10,0xF297AE53, + 0x2D2E6182,0x3A5575C1,0x03D84904,0x14A35D47,0x70C2308E,0x67B924CD,0x5E341808,0x494F0C4B, + }, + + { + 0x00000000,0xEFC26B3E,0x04F5D03D,0xEB37BB03,0x09EBA07A,0xE629CB44,0x0D1E7047,0xE2DC1B79, + 0x13D740F4,0xFC152BCA,0x172290C9,0xF8E0FBF7,0x1A3CE08E,0xF5FE8BB0,0x1EC930B3,0xF10B5B8D, + 0x27AE81E8,0xC86CEAD6,0x235B51D5,0xCC993AEB,0x2E452192,0xC1874AAC,0x2AB0F1AF,0xC5729A91, + 
0x3479C11C,0xDBBBAA22,0x308C1121,0xDF4E7A1F,0x3D926166,0xD2500A58,0x3967B15B,0xD6A5DA65, + 0x4F5D03D0,0xA09F68EE,0x4BA8D3ED,0xA46AB8D3,0x46B6A3AA,0xA974C894,0x42437397,0xAD8118A9, + 0x5C8A4324,0xB348281A,0x587F9319,0xB7BDF827,0x5561E35E,0xBAA38860,0x51943363,0xBE56585D, + 0x68F38238,0x8731E906,0x6C065205,0x83C4393B,0x61182242,0x8EDA497C,0x65EDF27F,0x8A2F9941, + 0x7B24C2CC,0x94E6A9F2,0x7FD112F1,0x901379CF,0x72CF62B6,0x9D0D0988,0x763AB28B,0x99F8D9B5, + 0x9EBA07A0,0x71786C9E,0x9A4FD79D,0x758DBCA3,0x9751A7DA,0x7893CCE4,0x93A477E7,0x7C661CD9, + 0x8D6D4754,0x62AF2C6A,0x89989769,0x665AFC57,0x8486E72E,0x6B448C10,0x80733713,0x6FB15C2D, + 0xB9148648,0x56D6ED76,0xBDE15675,0x52233D4B,0xB0FF2632,0x5F3D4D0C,0xB40AF60F,0x5BC89D31, + 0xAAC3C6BC,0x4501AD82,0xAE361681,0x41F47DBF,0xA32866C6,0x4CEA0DF8,0xA7DDB6FB,0x481FDDC5, + 0xD1E70470,0x3E256F4E,0xD512D44D,0x3AD0BF73,0xD80CA40A,0x37CECF34,0xDCF97437,0x333B1F09, + 0xC2304484,0x2DF22FBA,0xC6C594B9,0x2907FF87,0xCBDBE4FE,0x24198FC0,0xCF2E34C3,0x20EC5FFD, + 0xF6498598,0x198BEEA6,0xF2BC55A5,0x1D7E3E9B,0xFFA225E2,0x10604EDC,0xFB57F5DF,0x14959EE1, + 0xE59EC56C,0x0A5CAE52,0xE16B1551,0x0EA97E6F,0xEC756516,0x03B70E28,0xE880B52B,0x0742DE15, + 0xE6050901,0x09C7623F,0xE2F0D93C,0x0D32B202,0xEFEEA97B,0x002CC245,0xEB1B7946,0x04D91278, + 0xF5D249F5,0x1A1022CB,0xF12799C8,0x1EE5F2F6,0xFC39E98F,0x13FB82B1,0xF8CC39B2,0x170E528C, + 0xC1AB88E9,0x2E69E3D7,0xC55E58D4,0x2A9C33EA,0xC8402893,0x278243AD,0xCCB5F8AE,0x23779390, + 0xD27CC81D,0x3DBEA323,0xD6891820,0x394B731E,0xDB976867,0x34550359,0xDF62B85A,0x30A0D364, + 0xA9580AD1,0x469A61EF,0xADADDAEC,0x426FB1D2,0xA0B3AAAB,0x4F71C195,0xA4467A96,0x4B8411A8, + 0xBA8F4A25,0x554D211B,0xBE7A9A18,0x51B8F126,0xB364EA5F,0x5CA68161,0xB7913A62,0x5853515C, + 0x8EF68B39,0x6134E007,0x8A035B04,0x65C1303A,0x871D2B43,0x68DF407D,0x83E8FB7E,0x6C2A9040, + 0x9D21CBCD,0x72E3A0F3,0x99D41BF0,0x761670CE,0x94CA6BB7,0x7B080089,0x903FBB8A,0x7FFDD0B4, + 0x78BF0EA1,0x977D659F,0x7C4ADE9C,0x9388B5A2,0x7154AEDB,0x9E96C5E5,0x75A17EE6,0x9A6315D8, 
+ 0x6B684E55,0x84AA256B,0x6F9D9E68,0x805FF556,0x6283EE2F,0x8D418511,0x66763E12,0x89B4552C, + 0x5F118F49,0xB0D3E477,0x5BE45F74,0xB426344A,0x56FA2F33,0xB938440D,0x520FFF0E,0xBDCD9430, + 0x4CC6CFBD,0xA304A483,0x48331F80,0xA7F174BE,0x452D6FC7,0xAAEF04F9,0x41D8BFFA,0xAE1AD4C4, + 0x37E20D71,0xD820664F,0x3317DD4C,0xDCD5B672,0x3E09AD0B,0xD1CBC635,0x3AFC7D36,0xD53E1608, + 0x24354D85,0xCBF726BB,0x20C09DB8,0xCF02F686,0x2DDEEDFF,0xC21C86C1,0x292B3DC2,0xC6E956FC, + 0x104C8C99,0xFF8EE7A7,0x14B95CA4,0xFB7B379A,0x19A72CE3,0xF66547DD,0x1D52FCDE,0xF29097E0, + 0x039BCC6D,0xEC59A753,0x076E1C50,0xE8AC776E,0x0A706C17,0xE5B20729,0x0E85BC2A,0xE147D714, + }, + + { + 0x00000000,0xC18EDFC0,0x586CB9C1,0x99E26601,0xB0D97382,0x7157AC42,0xE8B5CA43,0x293B1583, + 0xBAC3E145,0x7B4D3E85,0xE2AF5884,0x23218744,0x0A1A92C7,0xCB944D07,0x52762B06,0x93F8F4C6, + 0xAEF6C4CB,0x6F781B0B,0xF69A7D0A,0x3714A2CA,0x1E2FB749,0xDFA16889,0x46430E88,0x87CDD148, + 0x1435258E,0xD5BBFA4E,0x4C599C4F,0x8DD7438F,0xA4EC560C,0x656289CC,0xFC80EFCD,0x3D0E300D, + 0x869C8FD7,0x47125017,0xDEF03616,0x1F7EE9D6,0x3645FC55,0xF7CB2395,0x6E294594,0xAFA79A54, + 0x3C5F6E92,0xFDD1B152,0x6433D753,0xA5BD0893,0x8C861D10,0x4D08C2D0,0xD4EAA4D1,0x15647B11, + 0x286A4B1C,0xE9E494DC,0x7006F2DD,0xB1882D1D,0x98B3389E,0x593DE75E,0xC0DF815F,0x01515E9F, + 0x92A9AA59,0x53277599,0xCAC51398,0x0B4BCC58,0x2270D9DB,0xE3FE061B,0x7A1C601A,0xBB92BFDA, + 0xD64819EF,0x17C6C62F,0x8E24A02E,0x4FAA7FEE,0x66916A6D,0xA71FB5AD,0x3EFDD3AC,0xFF730C6C, + 0x6C8BF8AA,0xAD05276A,0x34E7416B,0xF5699EAB,0xDC528B28,0x1DDC54E8,0x843E32E9,0x45B0ED29, + 0x78BEDD24,0xB93002E4,0x20D264E5,0xE15CBB25,0xC867AEA6,0x09E97166,0x900B1767,0x5185C8A7, + 0xC27D3C61,0x03F3E3A1,0x9A1185A0,0x5B9F5A60,0x72A44FE3,0xB32A9023,0x2AC8F622,0xEB4629E2, + 0x50D49638,0x915A49F8,0x08B82FF9,0xC936F039,0xE00DE5BA,0x21833A7A,0xB8615C7B,0x79EF83BB, + 0xEA17777D,0x2B99A8BD,0xB27BCEBC,0x73F5117C,0x5ACE04FF,0x9B40DB3F,0x02A2BD3E,0xC32C62FE, + 
0xFE2252F3,0x3FAC8D33,0xA64EEB32,0x67C034F2,0x4EFB2171,0x8F75FEB1,0x169798B0,0xD7194770, + 0x44E1B3B6,0x856F6C76,0x1C8D0A77,0xDD03D5B7,0xF438C034,0x35B61FF4,0xAC5479F5,0x6DDAA635, + 0x77E1359F,0xB66FEA5F,0x2F8D8C5E,0xEE03539E,0xC738461D,0x06B699DD,0x9F54FFDC,0x5EDA201C, + 0xCD22D4DA,0x0CAC0B1A,0x954E6D1B,0x54C0B2DB,0x7DFBA758,0xBC757898,0x25971E99,0xE419C159, + 0xD917F154,0x18992E94,0x817B4895,0x40F59755,0x69CE82D6,0xA8405D16,0x31A23B17,0xF02CE4D7, + 0x63D41011,0xA25ACFD1,0x3BB8A9D0,0xFA367610,0xD30D6393,0x1283BC53,0x8B61DA52,0x4AEF0592, + 0xF17DBA48,0x30F36588,0xA9110389,0x689FDC49,0x41A4C9CA,0x802A160A,0x19C8700B,0xD846AFCB, + 0x4BBE5B0D,0x8A3084CD,0x13D2E2CC,0xD25C3D0C,0xFB67288F,0x3AE9F74F,0xA30B914E,0x62854E8E, + 0x5F8B7E83,0x9E05A143,0x07E7C742,0xC6691882,0xEF520D01,0x2EDCD2C1,0xB73EB4C0,0x76B06B00, + 0xE5489FC6,0x24C64006,0xBD242607,0x7CAAF9C7,0x5591EC44,0x941F3384,0x0DFD5585,0xCC738A45, + 0xA1A92C70,0x6027F3B0,0xF9C595B1,0x384B4A71,0x11705FF2,0xD0FE8032,0x491CE633,0x889239F3, + 0x1B6ACD35,0xDAE412F5,0x430674F4,0x8288AB34,0xABB3BEB7,0x6A3D6177,0xF3DF0776,0x3251D8B6, + 0x0F5FE8BB,0xCED1377B,0x5733517A,0x96BD8EBA,0xBF869B39,0x7E0844F9,0xE7EA22F8,0x2664FD38, + 0xB59C09FE,0x7412D63E,0xEDF0B03F,0x2C7E6FFF,0x05457A7C,0xC4CBA5BC,0x5D29C3BD,0x9CA71C7D, + 0x2735A3A7,0xE6BB7C67,0x7F591A66,0xBED7C5A6,0x97ECD025,0x56620FE5,0xCF8069E4,0x0E0EB624, + 0x9DF642E2,0x5C789D22,0xC59AFB23,0x041424E3,0x2D2F3160,0xECA1EEA0,0x754388A1,0xB4CD5761, + 0x89C3676C,0x484DB8AC,0xD1AFDEAD,0x1021016D,0x391A14EE,0xF894CB2E,0x6176AD2F,0xA0F872EF, + 0x33008629,0xF28E59E9,0x6B6C3FE8,0xAAE2E028,0x83D9F5AB,0x42572A6B,0xDBB54C6A,0x1A3B93AA, + }, + + { + 0x00000000,0x9BA54C6F,0xEC3B9E9F,0x779ED2F0,0x03063B7F,0x98A37710,0xEF3DA5E0,0x7498E98F, + 0x060C76FE,0x9DA93A91,0xEA37E861,0x7192A40E,0x050A4D81,0x9EAF01EE,0xE931D31E,0x72949F71, + 0x0C18EDFC,0x97BDA193,0xE0237363,0x7B863F0C,0x0F1ED683,0x94BB9AEC,0xE325481C,0x78800473, + 
0x0A149B02,0x91B1D76D,0xE62F059D,0x7D8A49F2,0x0912A07D,0x92B7EC12,0xE5293EE2,0x7E8C728D, + 0x1831DBF8,0x83949797,0xF40A4567,0x6FAF0908,0x1B37E087,0x8092ACE8,0xF70C7E18,0x6CA93277, + 0x1E3DAD06,0x8598E169,0xF2063399,0x69A37FF6,0x1D3B9679,0x869EDA16,0xF10008E6,0x6AA54489, + 0x14293604,0x8F8C7A6B,0xF812A89B,0x63B7E4F4,0x172F0D7B,0x8C8A4114,0xFB1493E4,0x60B1DF8B, + 0x122540FA,0x89800C95,0xFE1EDE65,0x65BB920A,0x11237B85,0x8A8637EA,0xFD18E51A,0x66BDA975, + 0x3063B7F0,0xABC6FB9F,0xDC58296F,0x47FD6500,0x33658C8F,0xA8C0C0E0,0xDF5E1210,0x44FB5E7F, + 0x366FC10E,0xADCA8D61,0xDA545F91,0x41F113FE,0x3569FA71,0xAECCB61E,0xD95264EE,0x42F72881, + 0x3C7B5A0C,0xA7DE1663,0xD040C493,0x4BE588FC,0x3F7D6173,0xA4D82D1C,0xD346FFEC,0x48E3B383, + 0x3A772CF2,0xA1D2609D,0xD64CB26D,0x4DE9FE02,0x3971178D,0xA2D45BE2,0xD54A8912,0x4EEFC57D, + 0x28526C08,0xB3F72067,0xC469F297,0x5FCCBEF8,0x2B545777,0xB0F11B18,0xC76FC9E8,0x5CCA8587, + 0x2E5E1AF6,0xB5FB5699,0xC2658469,0x59C0C806,0x2D582189,0xB6FD6DE6,0xC163BF16,0x5AC6F379, + 0x244A81F4,0xBFEFCD9B,0xC8711F6B,0x53D45304,0x274CBA8B,0xBCE9F6E4,0xCB772414,0x50D2687B, + 0x2246F70A,0xB9E3BB65,0xCE7D6995,0x55D825FA,0x2140CC75,0xBAE5801A,0xCD7B52EA,0x56DE1E85, + 0x60C76FE0,0xFB62238F,0x8CFCF17F,0x1759BD10,0x63C1549F,0xF86418F0,0x8FFACA00,0x145F866F, + 0x66CB191E,0xFD6E5571,0x8AF08781,0x1155CBEE,0x65CD2261,0xFE686E0E,0x89F6BCFE,0x1253F091, + 0x6CDF821C,0xF77ACE73,0x80E41C83,0x1B4150EC,0x6FD9B963,0xF47CF50C,0x83E227FC,0x18476B93, + 0x6AD3F4E2,0xF176B88D,0x86E86A7D,0x1D4D2612,0x69D5CF9D,0xF27083F2,0x85EE5102,0x1E4B1D6D, + 0x78F6B418,0xE353F877,0x94CD2A87,0x0F6866E8,0x7BF08F67,0xE055C308,0x97CB11F8,0x0C6E5D97, + 0x7EFAC2E6,0xE55F8E89,0x92C15C79,0x09641016,0x7DFCF999,0xE659B5F6,0x91C76706,0x0A622B69, + 0x74EE59E4,0xEF4B158B,0x98D5C77B,0x03708B14,0x77E8629B,0xEC4D2EF4,0x9BD3FC04,0x0076B06B, + 0x72E22F1A,0xE9476375,0x9ED9B185,0x057CFDEA,0x71E41465,0xEA41580A,0x9DDF8AFA,0x067AC695, + 0x50A4D810,0xCB01947F,0xBC9F468F,0x273A0AE0,0x53A2E36F,0xC807AF00,0xBF997DF0,0x243C319F, 
+ 0x56A8AEEE,0xCD0DE281,0xBA933071,0x21367C1E,0x55AE9591,0xCE0BD9FE,0xB9950B0E,0x22304761, + 0x5CBC35EC,0xC7197983,0xB087AB73,0x2B22E71C,0x5FBA0E93,0xC41F42FC,0xB381900C,0x2824DC63, + 0x5AB04312,0xC1150F7D,0xB68BDD8D,0x2D2E91E2,0x59B6786D,0xC2133402,0xB58DE6F2,0x2E28AA9D, + 0x489503E8,0xD3304F87,0xA4AE9D77,0x3F0BD118,0x4B933897,0xD03674F8,0xA7A8A608,0x3C0DEA67, + 0x4E997516,0xD53C3979,0xA2A2EB89,0x3907A7E6,0x4D9F4E69,0xD63A0206,0xA1A4D0F6,0x3A019C99, + 0x448DEE14,0xDF28A27B,0xA8B6708B,0x33133CE4,0x478BD56B,0xDC2E9904,0xABB04BF4,0x3015079B, + 0x428198EA,0xD924D485,0xAEBA0675,0x351F4A1A,0x4187A395,0xDA22EFFA,0xADBC3D0A,0x36197165, + }, + + { + 0x00000000,0xDD96D985,0x605CB54B,0xBDCA6CCE,0xC0B96A96,0x1D2FB313,0xA0E5DFDD,0x7D730658, + 0x5A03D36D,0x87950AE8,0x3A5F6626,0xE7C9BFA3,0x9ABAB9FB,0x472C607E,0xFAE60CB0,0x2770D535, + 0xB407A6DA,0x69917F5F,0xD45B1391,0x09CDCA14,0x74BECC4C,0xA92815C9,0x14E27907,0xC974A082, + 0xEE0475B7,0x3392AC32,0x8E58C0FC,0x53CE1979,0x2EBD1F21,0xF32BC6A4,0x4EE1AA6A,0x937773EF, + 0xB37E4BF5,0x6EE89270,0xD322FEBE,0x0EB4273B,0x73C72163,0xAE51F8E6,0x139B9428,0xCE0D4DAD, + 0xE97D9898,0x34EB411D,0x89212DD3,0x54B7F456,0x29C4F20E,0xF4522B8B,0x49984745,0x940E9EC0, + 0x0779ED2F,0xDAEF34AA,0x67255864,0xBAB381E1,0xC7C087B9,0x1A565E3C,0xA79C32F2,0x7A0AEB77, + 0x5D7A3E42,0x80ECE7C7,0x3D268B09,0xE0B0528C,0x9DC354D4,0x40558D51,0xFD9FE19F,0x2009381A, + 0xBD8D91AB,0x601B482E,0xDDD124E0,0x0047FD65,0x7D34FB3D,0xA0A222B8,0x1D684E76,0xC0FE97F3, + 0xE78E42C6,0x3A189B43,0x87D2F78D,0x5A442E08,0x27372850,0xFAA1F1D5,0x476B9D1B,0x9AFD449E, + 0x098A3771,0xD41CEEF4,0x69D6823A,0xB4405BBF,0xC9335DE7,0x14A58462,0xA96FE8AC,0x74F93129, + 0x5389E41C,0x8E1F3D99,0x33D55157,0xEE4388D2,0x93308E8A,0x4EA6570F,0xF36C3BC1,0x2EFAE244, + 0x0EF3DA5E,0xD36503DB,0x6EAF6F15,0xB339B690,0xCE4AB0C8,0x13DC694D,0xAE160583,0x7380DC06, + 0x54F00933,0x8966D0B6,0x34ACBC78,0xE93A65FD,0x944963A5,0x49DFBA20,0xF415D6EE,0x29830F6B, + 
0xBAF47C84,0x6762A501,0xDAA8C9CF,0x073E104A,0x7A4D1612,0xA7DBCF97,0x1A11A359,0xC7877ADC, + 0xE0F7AFE9,0x3D61766C,0x80AB1AA2,0x5D3DC327,0x204EC57F,0xFDD81CFA,0x40127034,0x9D84A9B1, + 0xA06A2517,0x7DFCFC92,0xC036905C,0x1DA049D9,0x60D34F81,0xBD459604,0x008FFACA,0xDD19234F, + 0xFA69F67A,0x27FF2FFF,0x9A354331,0x47A39AB4,0x3AD09CEC,0xE7464569,0x5A8C29A7,0x871AF022, + 0x146D83CD,0xC9FB5A48,0x74313686,0xA9A7EF03,0xD4D4E95B,0x094230DE,0xB4885C10,0x691E8595, + 0x4E6E50A0,0x93F88925,0x2E32E5EB,0xF3A43C6E,0x8ED73A36,0x5341E3B3,0xEE8B8F7D,0x331D56F8, + 0x13146EE2,0xCE82B767,0x7348DBA9,0xAEDE022C,0xD3AD0474,0x0E3BDDF1,0xB3F1B13F,0x6E6768BA, + 0x4917BD8F,0x9481640A,0x294B08C4,0xF4DDD141,0x89AED719,0x54380E9C,0xE9F26252,0x3464BBD7, + 0xA713C838,0x7A8511BD,0xC74F7D73,0x1AD9A4F6,0x67AAA2AE,0xBA3C7B2B,0x07F617E5,0xDA60CE60, + 0xFD101B55,0x2086C2D0,0x9D4CAE1E,0x40DA779B,0x3DA971C3,0xE03FA846,0x5DF5C488,0x80631D0D, + 0x1DE7B4BC,0xC0716D39,0x7DBB01F7,0xA02DD872,0xDD5EDE2A,0x00C807AF,0xBD026B61,0x6094B2E4, + 0x47E467D1,0x9A72BE54,0x27B8D29A,0xFA2E0B1F,0x875D0D47,0x5ACBD4C2,0xE701B80C,0x3A976189, + 0xA9E01266,0x7476CBE3,0xC9BCA72D,0x142A7EA8,0x695978F0,0xB4CFA175,0x0905CDBB,0xD493143E, + 0xF3E3C10B,0x2E75188E,0x93BF7440,0x4E29ADC5,0x335AAB9D,0xEECC7218,0x53061ED6,0x8E90C753, + 0xAE99FF49,0x730F26CC,0xCEC54A02,0x13539387,0x6E2095DF,0xB3B64C5A,0x0E7C2094,0xD3EAF911, + 0xF49A2C24,0x290CF5A1,0x94C6996F,0x495040EA,0x342346B2,0xE9B59F37,0x547FF3F9,0x89E92A7C, + 0x1A9E5993,0xC7088016,0x7AC2ECD8,0xA754355D,0xDA273305,0x07B1EA80,0xBA7B864E,0x67ED5FCB, + 0x409D8AFE,0x9D0B537B,0x20C13FB5,0xFD57E630,0x8024E068,0x5DB239ED,0xE0785523,0x3DEE8CA6, + }, + + { + 0x00000000,0x9D0FE176,0xE16EC4AD,0x7C6125DB,0x19AC8F1B,0x84A36E6D,0xF8C24BB6,0x65CDAAC0, + 0x33591E36,0xAE56FF40,0xD237DA9B,0x4F383BED,0x2AF5912D,0xB7FA705B,0xCB9B5580,0x5694B4F6, + 0x66B23C6C,0xFBBDDD1A,0x87DCF8C1,0x1AD319B7,0x7F1EB377,0xE2115201,0x9E7077DA,0x037F96AC, + 
0x55EB225A,0xC8E4C32C,0xB485E6F7,0x298A0781,0x4C47AD41,0xD1484C37,0xAD2969EC,0x3026889A, + 0xCD6478D8,0x506B99AE,0x2C0ABC75,0xB1055D03,0xD4C8F7C3,0x49C716B5,0x35A6336E,0xA8A9D218, + 0xFE3D66EE,0x63328798,0x1F53A243,0x825C4335,0xE791E9F5,0x7A9E0883,0x06FF2D58,0x9BF0CC2E, + 0xABD644B4,0x36D9A5C2,0x4AB88019,0xD7B7616F,0xB27ACBAF,0x2F752AD9,0x53140F02,0xCE1BEE74, + 0x988F5A82,0x0580BBF4,0x79E19E2F,0xE4EE7F59,0x8123D599,0x1C2C34EF,0x604D1134,0xFD42F042, + 0x41B9F7F1,0xDCB61687,0xA0D7335C,0x3DD8D22A,0x581578EA,0xC51A999C,0xB97BBC47,0x24745D31, + 0x72E0E9C7,0xEFEF08B1,0x938E2D6A,0x0E81CC1C,0x6B4C66DC,0xF64387AA,0x8A22A271,0x172D4307, + 0x270BCB9D,0xBA042AEB,0xC6650F30,0x5B6AEE46,0x3EA74486,0xA3A8A5F0,0xDFC9802B,0x42C6615D, + 0x1452D5AB,0x895D34DD,0xF53C1106,0x6833F070,0x0DFE5AB0,0x90F1BBC6,0xEC909E1D,0x719F7F6B, + 0x8CDD8F29,0x11D26E5F,0x6DB34B84,0xF0BCAAF2,0x95710032,0x087EE144,0x741FC49F,0xE91025E9, + 0xBF84911F,0x228B7069,0x5EEA55B2,0xC3E5B4C4,0xA6281E04,0x3B27FF72,0x4746DAA9,0xDA493BDF, + 0xEA6FB345,0x77605233,0x0B0177E8,0x960E969E,0xF3C33C5E,0x6ECCDD28,0x12ADF8F3,0x8FA21985, + 0xD936AD73,0x44394C05,0x385869DE,0xA55788A8,0xC09A2268,0x5D95C31E,0x21F4E6C5,0xBCFB07B3, + 0x8373EFE2,0x1E7C0E94,0x621D2B4F,0xFF12CA39,0x9ADF60F9,0x07D0818F,0x7BB1A454,0xE6BE4522, + 0xB02AF1D4,0x2D2510A2,0x51443579,0xCC4BD40F,0xA9867ECF,0x34899FB9,0x48E8BA62,0xD5E75B14, + 0xE5C1D38E,0x78CE32F8,0x04AF1723,0x99A0F655,0xFC6D5C95,0x6162BDE3,0x1D039838,0x800C794E, + 0xD698CDB8,0x4B972CCE,0x37F60915,0xAAF9E863,0xCF3442A3,0x523BA3D5,0x2E5A860E,0xB3556778, + 0x4E17973A,0xD318764C,0xAF795397,0x3276B2E1,0x57BB1821,0xCAB4F957,0xB6D5DC8C,0x2BDA3DFA, + 0x7D4E890C,0xE041687A,0x9C204DA1,0x012FACD7,0x64E20617,0xF9EDE761,0x858CC2BA,0x188323CC, + 0x28A5AB56,0xB5AA4A20,0xC9CB6FFB,0x54C48E8D,0x3109244D,0xAC06C53B,0xD067E0E0,0x4D680196, + 0x1BFCB560,0x86F35416,0xFA9271CD,0x679D90BB,0x02503A7B,0x9F5FDB0D,0xE33EFED6,0x7E311FA0, + 0xC2CA1813,0x5FC5F965,0x23A4DCBE,0xBEAB3DC8,0xDB669708,0x4669767E,0x3A0853A5,0xA707B2D3, 
+ 0xF1930625,0x6C9CE753,0x10FDC288,0x8DF223FE,0xE83F893E,0x75306848,0x09514D93,0x945EACE5, + 0xA478247F,0x3977C509,0x4516E0D2,0xD81901A4,0xBDD4AB64,0x20DB4A12,0x5CBA6FC9,0xC1B58EBF, + 0x97213A49,0x0A2EDB3F,0x764FFEE4,0xEB401F92,0x8E8DB552,0x13825424,0x6FE371FF,0xF2EC9089, + 0x0FAE60CB,0x92A181BD,0xEEC0A466,0x73CF4510,0x1602EFD0,0x8B0D0EA6,0xF76C2B7D,0x6A63CA0B, + 0x3CF77EFD,0xA1F89F8B,0xDD99BA50,0x40965B26,0x255BF1E6,0xB8541090,0xC435354B,0x593AD43D, + 0x691C5CA7,0xF413BDD1,0x8872980A,0x157D797C,0x70B0D3BC,0xEDBF32CA,0x91DE1711,0x0CD1F667, + 0x5A454291,0xC74AA3E7,0xBB2B863C,0x2624674A,0x43E9CD8A,0xDEE62CFC,0xA2870927,0x3F88E851, + }, + + { + 0x00000000,0xB9FBDBE8,0xA886B191,0x117D6A79,0x8A7C6563,0x3387BE8B,0x22FAD4F2,0x9B010F1A, + 0xCF89CC87,0x7672176F,0x670F7D16,0xDEF4A6FE,0x45F5A9E4,0xFC0E720C,0xED731875,0x5488C39D, + 0x44629F4F,0xFD9944A7,0xECE42EDE,0x551FF536,0xCE1EFA2C,0x77E521C4,0x66984BBD,0xDF639055, + 0x8BEB53C8,0x32108820,0x236DE259,0x9A9639B1,0x019736AB,0xB86CED43,0xA911873A,0x10EA5CD2, + 0x88C53E9E,0x313EE576,0x20438F0F,0x99B854E7,0x02B95BFD,0xBB428015,0xAA3FEA6C,0x13C43184, + 0x474CF219,0xFEB729F1,0xEFCA4388,0x56319860,0xCD30977A,0x74CB4C92,0x65B626EB,0xDC4DFD03, + 0xCCA7A1D1,0x755C7A39,0x64211040,0xDDDACBA8,0x46DBC4B2,0xFF201F5A,0xEE5D7523,0x57A6AECB, + 0x032E6D56,0xBAD5B6BE,0xABA8DCC7,0x1253072F,0x89520835,0x30A9D3DD,0x21D4B9A4,0x982F624C, + 0xCAFB7B7D,0x7300A095,0x627DCAEC,0xDB861104,0x40871E1E,0xF97CC5F6,0xE801AF8F,0x51FA7467, + 0x0572B7FA,0xBC896C12,0xADF4066B,0x140FDD83,0x8F0ED299,0x36F50971,0x27886308,0x9E73B8E0, + 0x8E99E432,0x37623FDA,0x261F55A3,0x9FE48E4B,0x04E58151,0xBD1E5AB9,0xAC6330C0,0x1598EB28, + 0x411028B5,0xF8EBF35D,0xE9969924,0x506D42CC,0xCB6C4DD6,0x7297963E,0x63EAFC47,0xDA1127AF, + 0x423E45E3,0xFBC59E0B,0xEAB8F472,0x53432F9A,0xC8422080,0x71B9FB68,0x60C49111,0xD93F4AF9, + 0x8DB78964,0x344C528C,0x253138F5,0x9CCAE31D,0x07CBEC07,0xBE3037EF,0xAF4D5D96,0x16B6867E, + 
0x065CDAAC,0xBFA70144,0xAEDA6B3D,0x1721B0D5,0x8C20BFCF,0x35DB6427,0x24A60E5E,0x9D5DD5B6, + 0xC9D5162B,0x702ECDC3,0x6153A7BA,0xD8A87C52,0x43A97348,0xFA52A8A0,0xEB2FC2D9,0x52D41931, + 0x4E87F0BB,0xF77C2B53,0xE601412A,0x5FFA9AC2,0xC4FB95D8,0x7D004E30,0x6C7D2449,0xD586FFA1, + 0x810E3C3C,0x38F5E7D4,0x29888DAD,0x90735645,0x0B72595F,0xB28982B7,0xA3F4E8CE,0x1A0F3326, + 0x0AE56FF4,0xB31EB41C,0xA263DE65,0x1B98058D,0x80990A97,0x3962D17F,0x281FBB06,0x91E460EE, + 0xC56CA373,0x7C97789B,0x6DEA12E2,0xD411C90A,0x4F10C610,0xF6EB1DF8,0xE7967781,0x5E6DAC69, + 0xC642CE25,0x7FB915CD,0x6EC47FB4,0xD73FA45C,0x4C3EAB46,0xF5C570AE,0xE4B81AD7,0x5D43C13F, + 0x09CB02A2,0xB030D94A,0xA14DB333,0x18B668DB,0x83B767C1,0x3A4CBC29,0x2B31D650,0x92CA0DB8, + 0x8220516A,0x3BDB8A82,0x2AA6E0FB,0x935D3B13,0x085C3409,0xB1A7EFE1,0xA0DA8598,0x19215E70, + 0x4DA99DED,0xF4524605,0xE52F2C7C,0x5CD4F794,0xC7D5F88E,0x7E2E2366,0x6F53491F,0xD6A892F7, + 0x847C8BC6,0x3D87502E,0x2CFA3A57,0x9501E1BF,0x0E00EEA5,0xB7FB354D,0xA6865F34,0x1F7D84DC, + 0x4BF54741,0xF20E9CA9,0xE373F6D0,0x5A882D38,0xC1892222,0x7872F9CA,0x690F93B3,0xD0F4485B, + 0xC01E1489,0x79E5CF61,0x6898A518,0xD1637EF0,0x4A6271EA,0xF399AA02,0xE2E4C07B,0x5B1F1B93, + 0x0F97D80E,0xB66C03E6,0xA711699F,0x1EEAB277,0x85EBBD6D,0x3C106685,0x2D6D0CFC,0x9496D714, + 0x0CB9B558,0xB5426EB0,0xA43F04C9,0x1DC4DF21,0x86C5D03B,0x3F3E0BD3,0x2E4361AA,0x97B8BA42, + 0xC33079DF,0x7ACBA237,0x6BB6C84E,0xD24D13A6,0x494C1CBC,0xF0B7C754,0xE1CAAD2D,0x583176C5, + 0x48DB2A17,0xF120F1FF,0xE05D9B86,0x59A6406E,0xC2A74F74,0x7B5C949C,0x6A21FEE5,0xD3DA250D, + 0x8752E690,0x3EA93D78,0x2FD45701,0x962F8CE9,0x0D2E83F3,0xB4D5581B,0xA5A83262,0x1C53E98A, + }, + + { + 0x00000000,0xAE689191,0x87A02563,0x29C8B4F2,0xD4314C87,0x7A59DD16,0x539169E4,0xFDF9F875, + 0x73139F4F,0xDD7B0EDE,0xF4B3BA2C,0x5ADB2BBD,0xA722D3C8,0x094A4259,0x2082F6AB,0x8EEA673A, + 0xE6273E9E,0x484FAF0F,0x61871BFD,0xCFEF8A6C,0x32167219,0x9C7EE388,0xB5B6577A,0x1BDEC6EB, + 
0x9534A1D1,0x3B5C3040,0x129484B2,0xBCFC1523,0x4105ED56,0xEF6D7CC7,0xC6A5C835,0x68CD59A4, + 0x173F7B7D,0xB957EAEC,0x909F5E1E,0x3EF7CF8F,0xC30E37FA,0x6D66A66B,0x44AE1299,0xEAC68308, + 0x642CE432,0xCA4475A3,0xE38CC151,0x4DE450C0,0xB01DA8B5,0x1E753924,0x37BD8DD6,0x99D51C47, + 0xF11845E3,0x5F70D472,0x76B86080,0xD8D0F111,0x25290964,0x8B4198F5,0xA2892C07,0x0CE1BD96, + 0x820BDAAC,0x2C634B3D,0x05ABFFCF,0xABC36E5E,0x563A962B,0xF85207BA,0xD19AB348,0x7FF222D9, + 0x2E7EF6FA,0x8016676B,0xA9DED399,0x07B64208,0xFA4FBA7D,0x54272BEC,0x7DEF9F1E,0xD3870E8F, + 0x5D6D69B5,0xF305F824,0xDACD4CD6,0x74A5DD47,0x895C2532,0x2734B4A3,0x0EFC0051,0xA09491C0, + 0xC859C864,0x663159F5,0x4FF9ED07,0xE1917C96,0x1C6884E3,0xB2001572,0x9BC8A180,0x35A03011, + 0xBB4A572B,0x1522C6BA,0x3CEA7248,0x9282E3D9,0x6F7B1BAC,0xC1138A3D,0xE8DB3ECF,0x46B3AF5E, + 0x39418D87,0x97291C16,0xBEE1A8E4,0x10893975,0xED70C100,0x43185091,0x6AD0E463,0xC4B875F2, + 0x4A5212C8,0xE43A8359,0xCDF237AB,0x639AA63A,0x9E635E4F,0x300BCFDE,0x19C37B2C,0xB7ABEABD, + 0xDF66B319,0x710E2288,0x58C6967A,0xF6AE07EB,0x0B57FF9E,0xA53F6E0F,0x8CF7DAFD,0x229F4B6C, + 0xAC752C56,0x021DBDC7,0x2BD50935,0x85BD98A4,0x784460D1,0xD62CF140,0xFFE445B2,0x518CD423, + 0x5CFDEDF4,0xF2957C65,0xDB5DC897,0x75355906,0x88CCA173,0x26A430E2,0x0F6C8410,0xA1041581, + 0x2FEE72BB,0x8186E32A,0xA84E57D8,0x0626C649,0xFBDF3E3C,0x55B7AFAD,0x7C7F1B5F,0xD2178ACE, + 0xBADAD36A,0x14B242FB,0x3D7AF609,0x93126798,0x6EEB9FED,0xC0830E7C,0xE94BBA8E,0x47232B1F, + 0xC9C94C25,0x67A1DDB4,0x4E696946,0xE001F8D7,0x1DF800A2,0xB3909133,0x9A5825C1,0x3430B450, + 0x4BC29689,0xE5AA0718,0xCC62B3EA,0x620A227B,0x9FF3DA0E,0x319B4B9F,0x1853FF6D,0xB63B6EFC, + 0x38D109C6,0x96B99857,0xBF712CA5,0x1119BD34,0xECE04541,0x4288D4D0,0x6B406022,0xC528F1B3, + 0xADE5A817,0x038D3986,0x2A458D74,0x842D1CE5,0x79D4E490,0xD7BC7501,0xFE74C1F3,0x501C5062, + 0xDEF63758,0x709EA6C9,0x5956123B,0xF73E83AA,0x0AC77BDF,0xA4AFEA4E,0x8D675EBC,0x230FCF2D, + 0x72831B0E,0xDCEB8A9F,0xF5233E6D,0x5B4BAFFC,0xA6B25789,0x08DAC618,0x211272EA,0x8F7AE37B, 
+ 0x01908441,0xAFF815D0,0x8630A122,0x285830B3,0xD5A1C8C6,0x7BC95957,0x5201EDA5,0xFC697C34, + 0x94A42590,0x3ACCB401,0x130400F3,0xBD6C9162,0x40956917,0xEEFDF886,0xC7354C74,0x695DDDE5, + 0xE7B7BADF,0x49DF2B4E,0x60179FBC,0xCE7F0E2D,0x3386F658,0x9DEE67C9,0xB426D33B,0x1A4E42AA, + 0x65BC6073,0xCBD4F1E2,0xE21C4510,0x4C74D481,0xB18D2CF4,0x1FE5BD65,0x362D0997,0x98459806, + 0x16AFFF3C,0xB8C76EAD,0x910FDA5F,0x3F674BCE,0xC29EB3BB,0x6CF6222A,0x453E96D8,0xEB560749, + 0x839B5EED,0x2DF3CF7C,0x043B7B8E,0xAA53EA1F,0x57AA126A,0xF9C283FB,0xD00A3709,0x7E62A698, + 0xF088C1A2,0x5EE05033,0x7728E4C1,0xD9407550,0x24B98D25,0x8AD11CB4,0xA319A846,0x0D7139D7, + } +#endif // CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 +}; +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/cpuinfo.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/cpuinfo.h new file mode 100644 index 0000000000000000000000000000000000000000..dfb535f1c9e25d133e98253370c917a306c57119 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/cpuinfo.h @@ -0,0 +1,1956 @@ +#pragma once +#ifndef CPUINFO_H +#define CPUINFO_H + +#ifndef __cplusplus + #include +#endif + +#ifdef __APPLE__ + #include +#endif + +#include + +/* Identify architecture and define corresponding macro */ + +#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(_M_IX86) + #define CPUINFO_ARCH_X86 1 +#endif + +#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) + #define CPUINFO_ARCH_X86_64 1 +#endif + +#if defined(__arm__) || defined(_M_ARM) + #define CPUINFO_ARCH_ARM 1 +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) + #define CPUINFO_ARCH_ARM64 1 +#endif + +#if defined(__PPC64__) || defined(__powerpc64__) || defined(_ARCH_PPC64) + #define CPUINFO_ARCH_PPC64 1 +#endif + +#if defined(__asmjs__) + #define CPUINFO_ARCH_ASMJS 1 +#endif + +#if defined(__wasm__) + #if 
defined(__wasm_simd128__) + #define CPUINFO_ARCH_WASMSIMD 1 + #else + #define CPUINFO_ARCH_WASM 1 + #endif +#endif + +/* Define other architecture-specific macros as 0 */ + +#ifndef CPUINFO_ARCH_X86 + #define CPUINFO_ARCH_X86 0 +#endif + +#ifndef CPUINFO_ARCH_X86_64 + #define CPUINFO_ARCH_X86_64 0 +#endif + +#ifndef CPUINFO_ARCH_ARM + #define CPUINFO_ARCH_ARM 0 +#endif + +#ifndef CPUINFO_ARCH_ARM64 + #define CPUINFO_ARCH_ARM64 0 +#endif + +#ifndef CPUINFO_ARCH_PPC64 + #define CPUINFO_ARCH_PPC64 0 +#endif + +#ifndef CPUINFO_ARCH_ASMJS + #define CPUINFO_ARCH_ASMJS 0 +#endif + +#ifndef CPUINFO_ARCH_WASM + #define CPUINFO_ARCH_WASM 0 +#endif + +#ifndef CPUINFO_ARCH_WASMSIMD + #define CPUINFO_ARCH_WASMSIMD 0 +#endif + +#if CPUINFO_ARCH_X86 && defined(_MSC_VER) + #define CPUINFO_ABI __cdecl +#elif CPUINFO_ARCH_X86 && defined(__GNUC__) + #define CPUINFO_ABI __attribute__((__cdecl__)) +#else + #define CPUINFO_ABI +#endif + +#define CPUINFO_CACHE_UNIFIED 0x00000001 +#define CPUINFO_CACHE_INCLUSIVE 0x00000002 +#define CPUINFO_CACHE_COMPLEX_INDEXING 0x00000004 + +struct cpuinfo_cache { + /** Cache size in bytes */ + uint32_t size; + /** Number of ways of associativity */ + uint32_t associativity; + /** Number of sets */ + uint32_t sets; + /** Number of partitions */ + uint32_t partitions; + /** Line size in bytes */ + uint32_t line_size; + /** + * Binary characteristics of the cache (unified cache, inclusive cache, cache with complex indexing). 
+ * + * @see CPUINFO_CACHE_UNIFIED, CPUINFO_CACHE_INCLUSIVE, CPUINFO_CACHE_COMPLEX_INDEXING + */ + uint32_t flags; + /** Index of the first logical processor that shares this cache */ + uint32_t processor_start; + /** Number of logical processors that share this cache */ + uint32_t processor_count; +}; + +struct cpuinfo_trace_cache { + uint32_t uops; + uint32_t associativity; +}; + +#define CPUINFO_PAGE_SIZE_4KB 0x1000 +#define CPUINFO_PAGE_SIZE_1MB 0x100000 +#define CPUINFO_PAGE_SIZE_2MB 0x200000 +#define CPUINFO_PAGE_SIZE_4MB 0x400000 +#define CPUINFO_PAGE_SIZE_16MB 0x1000000 +#define CPUINFO_PAGE_SIZE_1GB 0x40000000 + +struct cpuinfo_tlb { + uint32_t entries; + uint32_t associativity; + uint64_t pages; +}; + +/** Vendor of processor core design */ +enum cpuinfo_vendor { + /** Processor vendor is not known to the library, or the library failed to get vendor information from the OS. */ + cpuinfo_vendor_unknown = 0, + + /* Active vendors of modern CPUs */ + + /** + * Intel Corporation. Vendor of x86, x86-64, IA64, and ARM processor microarchitectures. + * + * Sold its ARM design subsidiary in 2006. The last ARM processor design was released in 2004. + */ + cpuinfo_vendor_intel = 1, + /** Advanced Micro Devices, Inc. Vendor of x86 and x86-64 processor microarchitectures. */ + cpuinfo_vendor_amd = 2, + /** ARM Holdings plc. Vendor of ARM and ARM64 processor microarchitectures. */ + cpuinfo_vendor_arm = 3, + /** Qualcomm Incorporated. Vendor of ARM and ARM64 processor microarchitectures. */ + cpuinfo_vendor_qualcomm = 4, + /** Apple Inc. Vendor of ARM and ARM64 processor microarchitectures. */ + cpuinfo_vendor_apple = 5, + /** Samsung Electronics Co., Ltd. Vendir if ARM64 processor microarchitectures. */ + cpuinfo_vendor_samsung = 6, + /** Nvidia Corporation. Vendor of ARM64-compatible processor microarchitectures. */ + cpuinfo_vendor_nvidia = 7, + /** MIPS Technologies, Inc. Vendor of MIPS processor microarchitectures. 
*/ + cpuinfo_vendor_mips = 8, + /** International Business Machines Corporation. Vendor of PowerPC processor microarchitectures. */ + cpuinfo_vendor_ibm = 9, + /** Ingenic Semiconductor. Vendor of MIPS processor microarchitectures. */ + cpuinfo_vendor_ingenic = 10, + /** + * VIA Technologies, Inc. Vendor of x86 and x86-64 processor microarchitectures. + * + * Processors are designed by Centaur Technology, a subsidiary of VIA Technologies. + */ + cpuinfo_vendor_via = 11, + /** Cavium, Inc. Vendor of ARM64 processor microarchitectures. */ + cpuinfo_vendor_cavium = 12, + /** Broadcom, Inc. Vendor of ARM processor microarchitectures. */ + cpuinfo_vendor_broadcom = 13, + /** Applied Micro Circuits Corporation (APM). Vendor of ARM64 processor microarchitectures. */ + cpuinfo_vendor_apm = 14, + /** + * Huawei Technologies Co., Ltd. Vendor of ARM64 processor microarchitectures. + * + * Processors are designed by HiSilicon, a subsidiary of Huawei. + */ + cpuinfo_vendor_huawei = 15, + /** + * Hygon (Chengdu Haiguang Integrated Circuit Design Co., Ltd), Vendor of x86-64 processor microarchitectures. + * + * Processors are variants of AMD cores. + */ + cpuinfo_vendor_hygon = 16, + + /* Active vendors of embedded CPUs */ + + /** Texas Instruments Inc. Vendor of ARM processor microarchitectures. */ + cpuinfo_vendor_texas_instruments = 30, + /** Marvell Technology Group Ltd. Vendor of ARM processor microarchitectures. */ + cpuinfo_vendor_marvell = 31, + /** RDC Semiconductor Co., Ltd. Vendor of x86 processor microarchitectures. */ + cpuinfo_vendor_rdc = 32, + /** DM&P Electronics Inc. Vendor of x86 processor microarchitectures. */ + cpuinfo_vendor_dmp = 33, + /** Motorola, Inc. Vendor of PowerPC and ARM processor microarchitectures. */ + cpuinfo_vendor_motorola = 34, + + /* Defunct CPU vendors */ + + /** + * Transmeta Corporation. Vendor of x86 processor microarchitectures. + * + * Now defunct. The last processor design was released in 2004. 
+ * Transmeta processors implemented VLIW ISA and used binary translation to execute x86 code. + */ + cpuinfo_vendor_transmeta = 50, + /** + * Cyrix Corporation. Vendor of x86 processor microarchitectures. + * + * Now defunct. The last processor design was released in 1996. + */ + cpuinfo_vendor_cyrix = 51, + /** + * Rise Technology. Vendor of x86 processor microarchitectures. + * + * Now defunct. The last processor design was released in 1999. + */ + cpuinfo_vendor_rise = 52, + /** + * National Semiconductor. Vendor of x86 processor microarchitectures. + * + * Sold its x86 design subsidiary in 1999. The last processor design was released in 1998. + */ + cpuinfo_vendor_nsc = 53, + /** + * Silicon Integrated Systems. Vendor of x86 processor microarchitectures. + * + * Sold its x86 design subsidiary in 2001. The last processor design was released in 2001. + */ + cpuinfo_vendor_sis = 54, + /** + * NexGen. Vendor of x86 processor microarchitectures. + * + * Now defunct. The last processor design was released in 1994. + * NexGen designed the first x86 microarchitecture which decomposed x86 instructions into simple microoperations. + */ + cpuinfo_vendor_nexgen = 55, + /** + * United Microelectronics Corporation. Vendor of x86 processor microarchitectures. + * + * Ceased x86 in the early 1990s. The last processor design was released in 1991. + * Designed U5C and U5D processors. Both are 486 level. + */ + cpuinfo_vendor_umc = 56, + /** + * Digital Equipment Corporation. Vendor of ARM processor microarchitecture. + * + * Sold its ARM designs in 1997. The last processor design was released in 1997. + */ + cpuinfo_vendor_dec = 57, +}; + +/** + * Processor microarchitecture + * + * Processors with different microarchitectures often have different instruction performance characteristics, + * and may have dramatically different pipeline organization. 
+ */ +enum cpuinfo_uarch { + /** Microarchitecture is unknown, or the library failed to get information about the microarchitecture from OS */ + cpuinfo_uarch_unknown = 0, + + /** Pentium and Pentium MMX microarchitecture. */ + cpuinfo_uarch_p5 = 0x00100100, + /** Intel Quark microarchitecture. */ + cpuinfo_uarch_quark = 0x00100101, + + /** Pentium Pro, Pentium II, and Pentium III. */ + cpuinfo_uarch_p6 = 0x00100200, + /** Pentium M. */ + cpuinfo_uarch_dothan = 0x00100201, + /** Intel Core microarchitecture. */ + cpuinfo_uarch_yonah = 0x00100202, + /** Intel Core 2 microarchitecture on 65 nm process. */ + cpuinfo_uarch_conroe = 0x00100203, + /** Intel Core 2 microarchitecture on 45 nm process. */ + cpuinfo_uarch_penryn = 0x00100204, + /** Intel Nehalem and Westmere microarchitectures (Core i3/i5/i7 1st gen). */ + cpuinfo_uarch_nehalem = 0x00100205, + /** Intel Sandy Bridge microarchitecture (Core i3/i5/i7 2nd gen). */ + cpuinfo_uarch_sandy_bridge = 0x00100206, + /** Intel Ivy Bridge microarchitecture (Core i3/i5/i7 3rd gen). */ + cpuinfo_uarch_ivy_bridge = 0x00100207, + /** Intel Haswell microarchitecture (Core i3/i5/i7 4th gen). */ + cpuinfo_uarch_haswell = 0x00100208, + /** Intel Broadwell microarchitecture. */ + cpuinfo_uarch_broadwell = 0x00100209, + /** Intel Sky Lake microarchitecture (14 nm, including Kaby/Coffee/Whiskey/Amber/Comet/Cascade/Cooper Lake). */ + cpuinfo_uarch_sky_lake = 0x0010020A, + /** DEPRECATED (Intel Kaby Lake microarchitecture). */ + cpuinfo_uarch_kaby_lake = 0x0010020A, + /** Intel Palm Cove microarchitecture (10 nm, Cannon Lake). */ + cpuinfo_uarch_palm_cove = 0x0010020B, + /** Intel Sunny Cove microarchitecture (10 nm, Ice Lake). */ + cpuinfo_uarch_sunny_cove = 0x0010020C, + + /** Pentium 4 with Willamette, Northwood, or Foster cores. */ + cpuinfo_uarch_willamette = 0x00100300, + /** Pentium 4 with Prescott and later cores. */ + cpuinfo_uarch_prescott = 0x00100301, + + /** Intel Atom on 45 nm process. 
*/ + cpuinfo_uarch_bonnell = 0x00100400, + /** Intel Atom on 32 nm process. */ + cpuinfo_uarch_saltwell = 0x00100401, + /** Intel Silvermont microarchitecture (22 nm out-of-order Atom). */ + cpuinfo_uarch_silvermont = 0x00100402, + /** Intel Airmont microarchitecture (14 nm out-of-order Atom). */ + cpuinfo_uarch_airmont = 0x00100403, + /** Intel Goldmont microarchitecture (Denverton, Apollo Lake). */ + cpuinfo_uarch_goldmont = 0x00100404, + /** Intel Goldmont Plus microarchitecture (Gemini Lake). */ + cpuinfo_uarch_goldmont_plus = 0x00100405, + + /** Intel Knights Ferry HPC boards. */ + cpuinfo_uarch_knights_ferry = 0x00100500, + /** Intel Knights Corner HPC boards (aka Xeon Phi). */ + cpuinfo_uarch_knights_corner = 0x00100501, + /** Intel Knights Landing microarchitecture (second-gen MIC). */ + cpuinfo_uarch_knights_landing = 0x00100502, + /** Intel Knights Hill microarchitecture (third-gen MIC). */ + cpuinfo_uarch_knights_hill = 0x00100503, + /** Intel Knights Mill Xeon Phi. */ + cpuinfo_uarch_knights_mill = 0x00100504, + + /** Intel/Marvell XScale series. */ + cpuinfo_uarch_xscale = 0x00100600, + + /** AMD K5. */ + cpuinfo_uarch_k5 = 0x00200100, + /** AMD K6 and alike. */ + cpuinfo_uarch_k6 = 0x00200101, + /** AMD Athlon and Duron. */ + cpuinfo_uarch_k7 = 0x00200102, + /** AMD Athlon 64, Opteron 64. */ + cpuinfo_uarch_k8 = 0x00200103, + /** AMD Family 10h (Barcelona, Istambul, Magny-Cours). */ + cpuinfo_uarch_k10 = 0x00200104, + /** + * AMD Bulldozer microarchitecture + * Zambezi FX-series CPUs, Zurich, Valencia and Interlagos Opteron CPUs. + */ + cpuinfo_uarch_bulldozer = 0x00200105, + /** + * AMD Piledriver microarchitecture + * Vishera FX-series CPUs, Trinity and Richland APUs, Delhi, Seoul, Abu Dhabi Opteron CPUs. + */ + cpuinfo_uarch_piledriver = 0x00200106, + /** AMD Steamroller microarchitecture (Kaveri APUs). */ + cpuinfo_uarch_steamroller = 0x00200107, + /** AMD Excavator microarchitecture (Carizzo APUs). 
*/ + cpuinfo_uarch_excavator = 0x00200108, + /** AMD Zen microarchitecture (12/14 nm Ryzen and EPYC CPUs). */ + cpuinfo_uarch_zen = 0x00200109, + /** AMD Zen 2 microarchitecture (7 nm Ryzen and EPYC CPUs). */ + cpuinfo_uarch_zen2 = 0x0020010A, + /** AMD Zen 3 microarchitecture. */ + cpuinfo_uarch_zen3 = 0x0020010B, + /** AMD Zen 4 microarchitecture. */ + cpuinfo_uarch_zen4 = 0x0020010C, + + /** NSC Geode and AMD Geode GX and LX. */ + cpuinfo_uarch_geode = 0x00200200, + /** AMD Bobcat mobile microarchitecture. */ + cpuinfo_uarch_bobcat = 0x00200201, + /** AMD Jaguar mobile microarchitecture. */ + cpuinfo_uarch_jaguar = 0x00200202, + /** AMD Puma mobile microarchitecture. */ + cpuinfo_uarch_puma = 0x00200203, + + /** ARM7 series. */ + cpuinfo_uarch_arm7 = 0x00300100, + /** ARM9 series. */ + cpuinfo_uarch_arm9 = 0x00300101, + /** ARM 1136, ARM 1156, ARM 1176, or ARM 11MPCore. */ + cpuinfo_uarch_arm11 = 0x00300102, + + /** ARM Cortex-A5. */ + cpuinfo_uarch_cortex_a5 = 0x00300205, + /** ARM Cortex-A7. */ + cpuinfo_uarch_cortex_a7 = 0x00300207, + /** ARM Cortex-A8. */ + cpuinfo_uarch_cortex_a8 = 0x00300208, + /** ARM Cortex-A9. */ + cpuinfo_uarch_cortex_a9 = 0x00300209, + /** ARM Cortex-A12. */ + cpuinfo_uarch_cortex_a12 = 0x00300212, + /** ARM Cortex-A15. */ + cpuinfo_uarch_cortex_a15 = 0x00300215, + /** ARM Cortex-A17. */ + cpuinfo_uarch_cortex_a17 = 0x00300217, + + /** ARM Cortex-A32. */ + cpuinfo_uarch_cortex_a32 = 0x00300332, + /** ARM Cortex-A35. */ + cpuinfo_uarch_cortex_a35 = 0x00300335, + /** ARM Cortex-A53. */ + cpuinfo_uarch_cortex_a53 = 0x00300353, + /** ARM Cortex-A55 revision 0 (restricted dual-issue capabilities compared to revision 1+). */ + cpuinfo_uarch_cortex_a55r0 = 0x00300354, + /** ARM Cortex-A55. */ + cpuinfo_uarch_cortex_a55 = 0x00300355, + /** ARM Cortex-A57. */ + cpuinfo_uarch_cortex_a57 = 0x00300357, + /** ARM Cortex-A65. */ + cpuinfo_uarch_cortex_a65 = 0x00300365, + /** ARM Cortex-A72. 
*/ + cpuinfo_uarch_cortex_a72 = 0x00300372, + /** ARM Cortex-A73. */ + cpuinfo_uarch_cortex_a73 = 0x00300373, + /** ARM Cortex-A75. */ + cpuinfo_uarch_cortex_a75 = 0x00300375, + /** ARM Cortex-A76. */ + cpuinfo_uarch_cortex_a76 = 0x00300376, + /** ARM Cortex-A77. */ + cpuinfo_uarch_cortex_a77 = 0x00300377, + /** ARM Cortex-A78. */ + cpuinfo_uarch_cortex_a78 = 0x00300378, + + /** ARM Neoverse N1. */ + cpuinfo_uarch_neoverse_n1 = 0x00300400, + /** ARM Neoverse E1. */ + cpuinfo_uarch_neoverse_e1 = 0x00300401, + /** ARM Neoverse V1. */ + cpuinfo_uarch_neoverse_v1 = 0x00300402, + /** ARM Neoverse N2. */ + cpuinfo_uarch_neoverse_n2 = 0x00300403, + /** ARM Neoverse V2. */ + cpuinfo_uarch_neoverse_v2 = 0x00300404, + + /** ARM Cortex-X1. */ + cpuinfo_uarch_cortex_x1 = 0x00300501, + /** ARM Cortex-X2. */ + cpuinfo_uarch_cortex_x2 = 0x00300502, + /** ARM Cortex-X3. */ + cpuinfo_uarch_cortex_x3 = 0x00300503, + + /** ARM Cortex-A510. */ + cpuinfo_uarch_cortex_a510 = 0x00300551, + /** ARM Cortex-A710. */ + cpuinfo_uarch_cortex_a710 = 0x00300571, + /** ARM Cortex-A715. */ + cpuinfo_uarch_cortex_a715 = 0x00300572, + + /** Qualcomm Scorpion. */ + cpuinfo_uarch_scorpion = 0x00400100, + /** Qualcomm Krait. */ + cpuinfo_uarch_krait = 0x00400101, + /** Qualcomm Kryo. */ + cpuinfo_uarch_kryo = 0x00400102, + /** Qualcomm Falkor. */ + cpuinfo_uarch_falkor = 0x00400103, + /** Qualcomm Saphira. */ + cpuinfo_uarch_saphira = 0x00400104, + + /** Nvidia Denver. */ + cpuinfo_uarch_denver = 0x00500100, + /** Nvidia Denver 2. */ + cpuinfo_uarch_denver2 = 0x00500101, + /** Nvidia Carmel. */ + cpuinfo_uarch_carmel = 0x00500102, + + /** Samsung Exynos M1 (Exynos 8890 big cores). */ + cpuinfo_uarch_exynos_m1 = 0x00600100, + /** Samsung Exynos M2 (Exynos 8895 big cores). */ + cpuinfo_uarch_exynos_m2 = 0x00600101, + /** Samsung Exynos M3 (Exynos 9810 big cores). */ + cpuinfo_uarch_exynos_m3 = 0x00600102, + /** Samsung Exynos M4 (Exynos 9820 big cores). 
*/ + cpuinfo_uarch_exynos_m4 = 0x00600103, + /** Samsung Exynos M5 (Exynos 9830 big cores). */ + cpuinfo_uarch_exynos_m5 = 0x00600104, + + /* Deprecated synonym for Cortex-A76 */ + cpuinfo_uarch_cortex_a76ae = 0x00300376, + /* Deprecated names for Exynos. */ + cpuinfo_uarch_mongoose_m1 = 0x00600100, + cpuinfo_uarch_mongoose_m2 = 0x00600101, + cpuinfo_uarch_meerkat_m3 = 0x00600102, + cpuinfo_uarch_meerkat_m4 = 0x00600103, + + /** Apple A6 and A6X processors. */ + cpuinfo_uarch_swift = 0x00700100, + /** Apple A7 processor. */ + cpuinfo_uarch_cyclone = 0x00700101, + /** Apple A8 and A8X processor. */ + cpuinfo_uarch_typhoon = 0x00700102, + /** Apple A9 and A9X processor. */ + cpuinfo_uarch_twister = 0x00700103, + /** Apple A10 and A10X processor. */ + cpuinfo_uarch_hurricane = 0x00700104, + /** Apple A11 processor (big cores). */ + cpuinfo_uarch_monsoon = 0x00700105, + /** Apple A11 processor (little cores). */ + cpuinfo_uarch_mistral = 0x00700106, + /** Apple A12 processor (big cores). */ + cpuinfo_uarch_vortex = 0x00700107, + /** Apple A12 processor (little cores). */ + cpuinfo_uarch_tempest = 0x00700108, + /** Apple A13 processor (big cores). */ + cpuinfo_uarch_lightning = 0x00700109, + /** Apple A13 processor (little cores). */ + cpuinfo_uarch_thunder = 0x0070010A, + /** Apple A14 / M1 processor (big cores). */ + cpuinfo_uarch_firestorm = 0x0070010B, + /** Apple A14 / M1 processor (little cores). */ + cpuinfo_uarch_icestorm = 0x0070010C, + /** Apple A15 / M2 processor (big cores). */ + cpuinfo_uarch_avalanche = 0x0070010D, + /** Apple A15 / M2 processor (little cores). */ + cpuinfo_uarch_blizzard = 0x0070010E, + + /** Cavium ThunderX. */ + cpuinfo_uarch_thunderx = 0x00800100, + /** Cavium ThunderX2 (originally Broadcom Vulkan). */ + cpuinfo_uarch_thunderx2 = 0x00800200, + + /** Marvell PJ4. */ + cpuinfo_uarch_pj4 = 0x00900100, + + /** Broadcom Brahma B15. */ + cpuinfo_uarch_brahma_b15 = 0x00A00100, + /** Broadcom Brahma B53. 
*/ + cpuinfo_uarch_brahma_b53 = 0x00A00101, + + /** Applied Micro X-Gene. */ + cpuinfo_uarch_xgene = 0x00B00100, + + /* Hygon Dhyana (a modification of AMD Zen for Chinese market). */ + cpuinfo_uarch_dhyana = 0x01000100, + + /** HiSilicon TaiShan v110 (Huawei Kunpeng 920 series processors). */ + cpuinfo_uarch_taishan_v110 = 0x00C00100, +}; + +struct cpuinfo_processor { + /** SMT (hyperthread) ID within a core */ + uint32_t smt_id; + /** Core containing this logical processor */ + const struct cpuinfo_core* core; + /** Cluster of cores containing this logical processor */ + const struct cpuinfo_cluster* cluster; + /** Physical package containing this logical processor */ + const struct cpuinfo_package* package; +#if defined(__linux__) + /** + * Linux-specific ID for the logical processor: + * - Linux kernel exposes information about this logical processor in /sys/devices/system/cpu/cpu/ + * - Bit in the cpu_set_t identifies this logical processor + */ + int linux_id; +#endif +#if defined(_WIN32) || defined(__CYGWIN__) + /** Windows-specific ID for the group containing the logical processor. */ + uint16_t windows_group_id; + /** + * Windows-specific ID of the logical processor within its group: + * - Bit in the KAFFINITY mask identifies this logical processor within its group. + */ + uint16_t windows_processor_id; +#endif +#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + /** APIC ID (unique x86-specific ID of the logical processor) */ + uint32_t apic_id; +#endif + struct { + /** Level 1 instruction cache */ + const struct cpuinfo_cache* l1i; + /** Level 1 data cache */ + const struct cpuinfo_cache* l1d; + /** Level 2 unified or data cache */ + const struct cpuinfo_cache* l2; + /** Level 3 unified or data cache */ + const struct cpuinfo_cache* l3; + /** Level 4 unified or data cache */ + const struct cpuinfo_cache* l4; + } cache; +}; + +struct cpuinfo_core { + /** Index of the first logical processor on this core. 
*/ + uint32_t processor_start; + /** Number of logical processors on this core */ + uint32_t processor_count; + /** Core ID within a package */ + uint32_t core_id; + /** Cluster containing this core */ + const struct cpuinfo_cluster* cluster; + /** Physical package containing this core. */ + const struct cpuinfo_package* package; + /** Vendor of the CPU microarchitecture for this core */ + enum cpuinfo_vendor vendor; + /** CPU microarchitecture for this core */ + enum cpuinfo_uarch uarch; +#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + /** Value of CPUID leaf 1 EAX register for this core */ + uint32_t cpuid; +#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + /** Value of Main ID Register (MIDR) for this core */ + uint32_t midr; +#endif + /** Clock rate (non-Turbo) of the core, in Hz */ + uint64_t frequency; +}; + +struct cpuinfo_cluster { + /** Index of the first logical processor in the cluster */ + uint32_t processor_start; + /** Number of logical processors in the cluster */ + uint32_t processor_count; + /** Index of the first core in the cluster */ + uint32_t core_start; + /** Number of cores on the cluster */ + uint32_t core_count; + /** Cluster ID within a package */ + uint32_t cluster_id; + /** Physical package containing the cluster */ + const struct cpuinfo_package* package; + /** CPU microarchitecture vendor of the cores in the cluster */ + enum cpuinfo_vendor vendor; + /** CPU microarchitecture of the cores in the cluster */ + enum cpuinfo_uarch uarch; +#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + /** Value of CPUID leaf 1 EAX register of the cores in the cluster */ + uint32_t cpuid; +#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + /** Value of Main ID Register (MIDR) of the cores in the cluster */ + uint32_t midr; +#endif + /** Clock rate (non-Turbo) of the cores in the cluster, in Hz */ + uint64_t frequency; +}; + +#define CPUINFO_PACKAGE_NAME_MAX 48 + +struct cpuinfo_package { + /** SoC or processor chip model name */ + char 
name[CPUINFO_PACKAGE_NAME_MAX]; + /** Index of the first logical processor on this physical package */ + uint32_t processor_start; + /** Number of logical processors on this physical package */ + uint32_t processor_count; + /** Index of the first core on this physical package */ + uint32_t core_start; + /** Number of cores on this physical package */ + uint32_t core_count; + /** Index of the first cluster of cores on this physical package */ + uint32_t cluster_start; + /** Number of clusters of cores on this physical package */ + uint32_t cluster_count; +}; + +struct cpuinfo_uarch_info { + /** Type of CPU microarchitecture */ + enum cpuinfo_uarch uarch; +#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + /** Value of CPUID leaf 1 EAX register for the microarchitecture */ + uint32_t cpuid; +#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + /** Value of Main ID Register (MIDR) for the microarchitecture */ + uint32_t midr; +#endif + /** Number of logical processors with the microarchitecture */ + uint32_t processor_count; + /** Number of cores with the microarchitecture */ + uint32_t core_count; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +bool CPUINFO_ABI cpuinfo_initialize(void); + +void CPUINFO_ABI cpuinfo_deinitialize(void); + +#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + /* This structure is not a part of stable API. Use cpuinfo_has_x86_* functions instead. 
*/ + struct cpuinfo_x86_isa { + #if CPUINFO_ARCH_X86 + bool rdtsc; + #endif + bool rdtscp; + bool rdpid; + bool sysenter; + #if CPUINFO_ARCH_X86 + bool syscall; + #endif + bool msr; + bool clzero; + bool clflush; + bool clflushopt; + bool mwait; + bool mwaitx; + #if CPUINFO_ARCH_X86 + bool emmx; + #endif + bool fxsave; + bool xsave; + #if CPUINFO_ARCH_X86 + bool fpu; + bool mmx; + bool mmx_plus; + #endif + bool three_d_now; + bool three_d_now_plus; + #if CPUINFO_ARCH_X86 + bool three_d_now_geode; + #endif + bool prefetch; + bool prefetchw; + bool prefetchwt1; + #if CPUINFO_ARCH_X86 + bool daz; + bool sse; + bool sse2; + #endif + bool sse3; + bool ssse3; + bool sse4_1; + bool sse4_2; + bool sse4a; + bool misaligned_sse; + bool avx; + bool avxvnni; + bool fma3; + bool fma4; + bool xop; + bool f16c; + bool avx2; + bool avx512f; + bool avx512pf; + bool avx512er; + bool avx512cd; + bool avx512dq; + bool avx512bw; + bool avx512vl; + bool avx512ifma; + bool avx512vbmi; + bool avx512vbmi2; + bool avx512bitalg; + bool avx512vpopcntdq; + bool avx512vnni; + bool avx512bf16; + bool avx512fp16; + bool avx512vp2intersect; + bool avx512_4vnniw; + bool avx512_4fmaps; + bool hle; + bool rtm; + bool xtest; + bool mpx; + #if CPUINFO_ARCH_X86 + bool cmov; + bool cmpxchg8b; + #endif + bool cmpxchg16b; + bool clwb; + bool movbe; + #if CPUINFO_ARCH_X86_64 + bool lahf_sahf; + #endif + bool fs_gs_base; + bool lzcnt; + bool popcnt; + bool tbm; + bool bmi; + bool bmi2; + bool adx; + bool aes; + bool vaes; + bool pclmulqdq; + bool vpclmulqdq; + bool gfni; + bool rdrand; + bool rdseed; + bool sha; + bool rng; + bool ace; + bool ace2; + bool phe; + bool pmm; + bool lwp; + }; + + extern struct cpuinfo_x86_isa cpuinfo_isa; +#endif + +static inline bool cpuinfo_has_x86_rdtsc(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.rdtsc; + #endif + #else + return false; + #endif +} + +static inline bool 
cpuinfo_has_x86_rdtscp(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.rdtscp; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_rdpid(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.rdpid; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_clzero(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.clzero; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_mwait(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.mwait; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_mwaitx(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.mwaitx; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_fxsave(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.fxsave; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_xsave(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.xsave; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_fpu(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.fpu; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_mmx(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.mmx; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_mmx_plus(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.mmx_plus; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_3dnow(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.three_d_now; + #else + return false; + 
#endif +} + +static inline bool cpuinfo_has_x86_3dnow_plus(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.three_d_now_plus; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_3dnow_geode(void) { + #if CPUINFO_ARCH_X86_64 + return false; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return false; + #else + return cpuinfo_isa.three_d_now_geode; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_prefetch(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.prefetch; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_prefetchw(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.prefetchw; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_prefetchwt1(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.prefetchwt1; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_daz(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.daz; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sse(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.sse; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sse2(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.sse2; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sse3(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.sse3; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_ssse3(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + #if 
defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.ssse3; + #endif + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sse4_1(void) { + #if CPUINFO_ARCH_X86_64 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.sse4_1; + #endif + #elif CPUINFO_ARCH_X86 + return cpuinfo_isa.sse4_1; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sse4_2(void) { + #if CPUINFO_ARCH_X86_64 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.sse4_2; + #endif + #elif CPUINFO_ARCH_X86 + return cpuinfo_isa.sse4_2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sse4a(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.sse4a; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_misaligned_sse(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.misaligned_sse; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avxvnni(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avxvnni; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_fma3(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.fma3; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_fma4(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.fma4; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_xop(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.xop; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_f16c(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.f16c; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx2(void) { + #if 
CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512f(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512f; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512pf(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512pf; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512er(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512er; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512cd(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512cd; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512dq(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512dq; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512bw(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512bw; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512vl(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512vl; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512ifma(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512ifma; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512vbmi(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512vbmi; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512vbmi2(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512vbmi2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512bitalg(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512bitalg; + #else + return false; + #endif +} + +static inline bool 
cpuinfo_has_x86_avx512vpopcntdq(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512vpopcntdq; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512vnni(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512vnni; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512bf16(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512bf16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512fp16(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512fp16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512vp2intersect(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512vp2intersect; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512_4vnniw(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512_4vnniw; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_avx512_4fmaps(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.avx512_4fmaps; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_hle(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.hle; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_rtm(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.rtm; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_xtest(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.xtest; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_mpx(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.mpx; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_cmov(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + return cpuinfo_isa.cmov; 
+ #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_cmpxchg8b(void) { + #if CPUINFO_ARCH_X86_64 + return true; + #elif CPUINFO_ARCH_X86 + return cpuinfo_isa.cmpxchg8b; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_cmpxchg16b(void) { + #if CPUINFO_ARCH_X86_64 + return cpuinfo_isa.cmpxchg16b; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_clwb(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.clwb; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_movbe(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.movbe; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_lahf_sahf(void) { + #if CPUINFO_ARCH_X86 + return true; + #elif CPUINFO_ARCH_X86_64 + return cpuinfo_isa.lahf_sahf; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_lzcnt(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.lzcnt; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_popcnt(void) { + #if CPUINFO_ARCH_X86_64 + #if defined(__ANDROID__) + return true; + #else + return cpuinfo_isa.popcnt; + #endif + #elif CPUINFO_ARCH_X86 + return cpuinfo_isa.popcnt; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_tbm(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.tbm; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_bmi(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.bmi; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_bmi2(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.bmi2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_adx(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.adx; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_aes(void) { + #if 
CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.aes; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_vaes(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.vaes; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_pclmulqdq(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.pclmulqdq; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_vpclmulqdq(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.vpclmulqdq; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_gfni(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.gfni; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_rdrand(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.rdrand; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_rdseed(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.rdseed; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_x86_sha(void) { + #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 + return cpuinfo_isa.sha; + #else + return false; + #endif +} + +#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + /* This structure is not a part of stable API. Use cpuinfo_has_arm_* functions instead. 
*/ + struct cpuinfo_arm_isa { + #if CPUINFO_ARCH_ARM + bool thumb; + bool thumb2; + bool thumbee; + bool jazelle; + bool armv5e; + bool armv6; + bool armv6k; + bool armv7; + bool armv7mp; + bool armv8; + bool idiv; + + bool vfpv2; + bool vfpv3; + bool d32; + bool fp16; + bool fma; + + bool wmmx; + bool wmmx2; + bool neon; + #endif + #if CPUINFO_ARCH_ARM64 + bool atomics; + bool bf16; + bool sve; + bool sve2; + bool i8mm; + #endif + bool rdm; + bool fp16arith; + bool dot; + bool jscvt; + bool fcma; + bool fhm; + + bool aes; + bool sha1; + bool sha2; + bool pmull; + bool crc32; + }; + + extern struct cpuinfo_arm_isa cpuinfo_isa; +#endif + +static inline bool cpuinfo_has_arm_thumb(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.thumb; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_thumb2(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.thumb2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_v5e(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.armv5e; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_v6(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.armv6; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_v6k(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.armv6k; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_v7(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.armv7; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_v7mp(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.armv7mp; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_v8(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.armv8; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_idiv(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.idiv; + #else + return false; + #endif +} + +static inline bool 
cpuinfo_has_arm_vfpv2(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_vfpv3(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv3; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_vfpv3_d32(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv3 && cpuinfo_isa.d32; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_vfpv3_fp16(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv3 && cpuinfo_isa.fp16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_vfpv3_fp16_d32(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv3 && cpuinfo_isa.fp16 && cpuinfo_isa.d32; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_vfpv4(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv3 && cpuinfo_isa.fma; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_vfpv4_d32(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.vfpv3 && cpuinfo_isa.fma && cpuinfo_isa.d32; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_fp16_arith(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.fp16arith; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_bf16(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.bf16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_wmmx(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.wmmx; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_wmmx2(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.wmmx2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon(void) { + 
#if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.neon; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_fp16(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.neon && cpuinfo_isa.fp16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_fma(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.neon && cpuinfo_isa.fma; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_v8(void) { + #if CPUINFO_ARCH_ARM64 + return true; + #elif CPUINFO_ARCH_ARM + return cpuinfo_isa.neon && cpuinfo_isa.armv8; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_atomics(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.atomics; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_rdm(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.rdm; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_fp16_arith(void) { + #if CPUINFO_ARCH_ARM + return cpuinfo_isa.neon && cpuinfo_isa.fp16arith; + #elif CPUINFO_ARCH_ARM64 + return cpuinfo_isa.fp16arith; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_fhm(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.fhm; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_dot(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.dot; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_neon_bf16(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.bf16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_jscvt(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.jscvt; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_fcma(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + 
return cpuinfo_isa.fcma; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_i8mm(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.i8mm; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_aes(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.aes; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_sha1(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.sha1; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_sha2(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.sha2; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_pmull(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.pmull; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_crc32(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.crc32; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_sve(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.sve; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_sve_bf16(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.sve && cpuinfo_isa.bf16; + #else + return false; + #endif +} + +static inline bool cpuinfo_has_arm_sve2(void) { + #if CPUINFO_ARCH_ARM64 + return cpuinfo_isa.sve2; + #else + return false; + #endif +} + +const struct cpuinfo_processor* CPUINFO_ABI cpuinfo_get_processors(void); +const struct cpuinfo_core* CPUINFO_ABI cpuinfo_get_cores(void); +const struct cpuinfo_cluster* CPUINFO_ABI cpuinfo_get_clusters(void); +const struct cpuinfo_package* CPUINFO_ABI cpuinfo_get_packages(void); +const struct cpuinfo_uarch_info* CPUINFO_ABI cpuinfo_get_uarchs(void); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1i_caches(void); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1d_caches(void); +const struct cpuinfo_cache* CPUINFO_ABI 
cpuinfo_get_l2_caches(void); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l3_caches(void); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l4_caches(void); + +const struct cpuinfo_processor* CPUINFO_ABI cpuinfo_get_processor(uint32_t index); +const struct cpuinfo_core* CPUINFO_ABI cpuinfo_get_core(uint32_t index); +const struct cpuinfo_cluster* CPUINFO_ABI cpuinfo_get_cluster(uint32_t index); +const struct cpuinfo_package* CPUINFO_ABI cpuinfo_get_package(uint32_t index); +const struct cpuinfo_uarch_info* CPUINFO_ABI cpuinfo_get_uarch(uint32_t index); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1i_cache(uint32_t index); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1d_cache(uint32_t index); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l2_cache(uint32_t index); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l3_cache(uint32_t index); +const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l4_cache(uint32_t index); + +uint32_t CPUINFO_ABI cpuinfo_get_processors_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_cores_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_clusters_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_packages_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_uarchs_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_l1i_caches_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_l1d_caches_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_l2_caches_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_l3_caches_count(void); +uint32_t CPUINFO_ABI cpuinfo_get_l4_caches_count(void); + +/** + * Returns upper bound on cache size. + */ +uint32_t CPUINFO_ABI cpuinfo_get_max_cache_size(void); + +/** + * Identify the logical processor that executes the current thread. + * + * There is no guarantee that the thread will stay on the same logical processor for any time. + * Callers should treat the result as only a hint, and be prepared to handle NULL return value. 
+ */ +const struct cpuinfo_processor* CPUINFO_ABI cpuinfo_get_current_processor(void); + +/** + * Identify the core that executes the current thread. + * + * There is no guarantee that the thread will stay on the same core for any time. + * Callers should treat the result as only a hint, and be prepared to handle NULL return value. + */ +const struct cpuinfo_core* CPUINFO_ABI cpuinfo_get_current_core(void); + +/** + * Identify the microarchitecture index of the core that executes the current thread. + * If the system does not support such identification, the function returns 0. + * + * There is no guarantee that the thread will stay on the same type of core for any time. + * Callers should treat the result as only a hint. + */ +uint32_t CPUINFO_ABI cpuinfo_get_current_uarch_index(void); + +/** + * Identify the microarchitecture index of the core that executes the current thread. + * If the system does not support such identification, the function returns the user-specified default value. + * + * There is no guarantee that the thread will stay on the same type of core for any time. + * Callers should treat the result as only a hint. 
+ */ +uint32_t CPUINFO_ABI cpuinfo_get_current_uarch_index_with_default(uint32_t default_uarch_index); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CPUINFO_H */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/dnnl_ocl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/dnnl_ocl.h new file mode 100644 index 0000000000000000000000000000000000000000..ad731150b28babe7bd5a911acd8de70c57e85254 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/dnnl_ocl.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef DNNL_OCL_H +#define DNNL_OCL_H + +#include "oneapi/dnnl/dnnl_ocl.h" + +#endif /* DNNL_OCL_H */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/dnnl_version.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/dnnl_version.h new file mode 100644 index 0000000000000000000000000000000000000000..32a3d5cf839b1d593f069520febfd60b323730e9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/dnnl_version.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_VERSION_H +#define DNNL_VERSION_H + +#include "oneapi/dnnl/dnnl_version.h" + +#endif /* DNNL_VERSION_H */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/sleef.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/sleef.h new file mode 100644 index 0000000000000000000000000000000000000000..de36514f991a5f9b4774b232a1a6350c47c2c74c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/sleef.h @@ -0,0 +1,4459 @@ +// Copyright Naoki Shibata and contributors 2010 - 2020. 
+// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef __SLEEF_H__ +#define __SLEEF_H__ + +#define SLEEF_VERSION_MAJOR 3 +#define SLEEF_VERSION_MINOR 6 +#define SLEEF_VERSION_PATCHLEVEL 0 + +#include +#include + +#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +#define CONST const +#else +#define CONST +#endif + +#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) || defined(__zarch__) +#ifndef FP_FAST_FMA +#define FP_FAST_FMA +#endif +#ifndef FP_FAST_FMAF +#define FP_FAST_FMAF +#endif +#endif + +#if defined(_MSC_VER) && !defined(__STDC__) +#define __STDC__ 1 +#endif + +#if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS) +#ifdef IMPORT_IS_EXPORT +#define IMPORT __declspec(dllexport) +#else // #ifdef IMPORT_IS_EXPORT +#define IMPORT __declspec(dllimport) +#if (defined(_MSC_VER)) +#pragma comment(lib,"sleef.lib") +#endif // #if (defined(_MSC_VER)) +#endif // #ifdef IMPORT_IS_EXPORT +#else // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS) +#define IMPORT +#endif // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS) + +#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) +#include +#endif + +#if (defined(_MSC_VER)) +#include +#endif + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include +#endif + +#if defined(__ARM_FEATURE_SVE) +#include +#endif + +#if defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__) +#include +typedef __vector double SLEEF_VECTOR_DOUBLE; +typedef __vector float SLEEF_VECTOR_FLOAT; +typedef __vector int SLEEF_VECTOR_INT; +typedef __vector unsigned int SLEEF_VECTOR_UINT; +typedef __vector 
long long SLEEF_VECTOR_LONGLONG; +typedef __vector unsigned long long SLEEF_VECTOR_ULONGLONG; +#endif + +#if defined(__VX__) && defined(__VEC__) +#ifndef SLEEF_VECINTRIN_H_INCLUDED +#include +#define SLEEF_VECINTRIN_H_INCLUDED +#endif +typedef __vector double SLEEF_VECTOR_DOUBLE; +typedef __vector float SLEEF_VECTOR_FLOAT; +typedef __vector int SLEEF_VECTOR_INT; +typedef __vector unsigned int SLEEF_VECTOR_UINT; +typedef __vector long long SLEEF_VECTOR_LONGLONG; +typedef __vector unsigned long long SLEEF_VECTOR_ULONGLONG; +#endif + +// + +#ifndef SLEEF_FP_ILOGB0 +#define SLEEF_FP_ILOGB0 ((int)-2147483648) +#endif + +#ifndef SLEEF_FP_ILOGBNAN +#define SLEEF_FP_ILOGBNAN ((int)2147483647) +#endif + +// + +IMPORT void *Sleef_malloc(size_t z); +IMPORT void Sleef_free(void *ptr); +IMPORT uint64_t Sleef_currentTimeMicros(); + +#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) +IMPORT void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx); +#endif + +// + +#ifndef Sleef_double2_DEFINED +#define Sleef_double2_DEFINED +typedef struct { + double x, y; +} Sleef_double2; +#endif + +#ifndef Sleef_float2_DEFINED +#define Sleef_float2_DEFINED +typedef struct { + float x, y; +} Sleef_float2; +#endif + +#ifndef Sleef_longdouble2_DEFINED +#define Sleef_longdouble2_DEFINED +typedef struct { + long double x, y; +} Sleef_longdouble2; +#endif + +#if !defined(Sleef_quad_DEFINED) +#define Sleef_quad_DEFINED +#if defined(__SIZEOF_FLOAT128__) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8) +typedef __float128 Sleef_quad; +#define SLEEF_QUAD_C(x) (x ## Q) +//#elif defined(__SIZEOF_LONG_DOUBLE__) && defined(__aarch64__) +//typedef long double Sleef_quad; +//#define SLEEF_QUAD_C(x) (x ## L) +#else +typedef struct { uint64_t x, y; } Sleef_quad; +#endif +#endif + +#if !defined(Sleef_quad2_DEFINED) +#define Sleef_quad2_DEFINED +typedef union { + 
struct { + Sleef_quad x, y; + }; + Sleef_quad s[2]; +} Sleef_quad2; +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + +IMPORT CONST double Sleef_sin_u35(double); +IMPORT CONST double Sleef_cos_u35(double); +IMPORT CONST Sleef_double2 Sleef_sincos_u35(double); +IMPORT CONST double Sleef_tan_u35(double); +IMPORT CONST double Sleef_asin_u35(double); +IMPORT CONST double Sleef_acos_u35(double); +IMPORT CONST double Sleef_atan_u35(double); +IMPORT CONST double Sleef_atan2_u35(double, double); +IMPORT CONST double Sleef_log_u35(double); +IMPORT CONST double Sleef_cbrt_u35(double); +IMPORT CONST double Sleef_sin_u10(double); +IMPORT CONST double Sleef_cos_u10(double); +IMPORT CONST Sleef_double2 Sleef_sincos_u10(double); +IMPORT CONST double Sleef_tan_u10(double); +IMPORT CONST double Sleef_asin_u10(double); +IMPORT CONST double Sleef_acos_u10(double); +IMPORT CONST double Sleef_atan_u10(double); +IMPORT CONST double Sleef_atan2_u10(double, double); +IMPORT CONST double Sleef_log_u10(double); +IMPORT CONST double Sleef_cbrt_u10(double); +IMPORT CONST double Sleef_exp_u10(double); +IMPORT CONST double Sleef_pow_u10(double, double); +IMPORT CONST double Sleef_sinh_u10(double); +IMPORT CONST double Sleef_cosh_u10(double); +IMPORT CONST double Sleef_tanh_u10(double); +IMPORT CONST double Sleef_sinh_u35(double); +IMPORT CONST double Sleef_cosh_u35(double); +IMPORT CONST double Sleef_tanh_u35(double); +IMPORT CONST double Sleef_asinh_u10(double); +IMPORT CONST double Sleef_acosh_u10(double); +IMPORT CONST double Sleef_atanh_u10(double); +IMPORT CONST double Sleef_exp2_u10(double); +IMPORT CONST double Sleef_exp10_u10(double); +IMPORT CONST double Sleef_exp2_u35(double); +IMPORT CONST double Sleef_exp10_u35(double); +IMPORT CONST double Sleef_expm1_u10(double); +IMPORT CONST double Sleef_log10_u10(double); +IMPORT CONST double Sleef_log2_u10(double); +IMPORT CONST double Sleef_log2_u35(double); +IMPORT CONST double Sleef_log1p_u10(double); +IMPORT CONST Sleef_double2 
Sleef_sincospi_u05(double); +IMPORT CONST Sleef_double2 Sleef_sincospi_u35(double); +IMPORT CONST double Sleef_sinpi_u05(double); +IMPORT CONST double Sleef_cospi_u05(double); +IMPORT CONST double Sleef_ldexp(double, int); +IMPORT CONST int Sleef_ilogb(double); +IMPORT CONST double Sleef_fma(double, double, double); +IMPORT CONST double Sleef_sqrt(double); +IMPORT CONST double Sleef_sqrt_u05(double); +IMPORT CONST double Sleef_sqrt_u35(double); + +IMPORT CONST double Sleef_hypot_u05(double, double); +IMPORT CONST double Sleef_hypot_u35(double, double); + +IMPORT CONST double Sleef_fabs(double); +IMPORT CONST double Sleef_copysign(double, double); +IMPORT CONST double Sleef_fmax(double, double); +IMPORT CONST double Sleef_fmin(double, double); +IMPORT CONST double Sleef_fdim(double, double); +IMPORT CONST double Sleef_trunc(double); +IMPORT CONST double Sleef_floor(double); +IMPORT CONST double Sleef_ceil(double); +IMPORT CONST double Sleef_round(double); +IMPORT CONST double Sleef_rint(double); +IMPORT CONST double Sleef_nextafter(double, double); +IMPORT CONST double Sleef_frfrexp(double); +IMPORT CONST int Sleef_expfrexp(double); +IMPORT CONST double Sleef_fmod(double, double); +IMPORT CONST double Sleef_remainder(double, double); +IMPORT CONST Sleef_double2 Sleef_modf(double); + +IMPORT CONST double Sleef_lgamma_u10(double); +IMPORT CONST double Sleef_tgamma_u10(double); +IMPORT CONST double Sleef_erf_u10(double); +IMPORT CONST double Sleef_erfc_u15(double); + +IMPORT CONST float Sleef_sinf_u35(float); +IMPORT CONST float Sleef_cosf_u35(float); +IMPORT CONST Sleef_float2 Sleef_sincosf_u35(float); +IMPORT CONST float Sleef_tanf_u35(float); +IMPORT CONST float Sleef_asinf_u35(float); +IMPORT CONST float Sleef_acosf_u35(float); +IMPORT CONST float Sleef_atanf_u35(float); +IMPORT CONST float Sleef_atan2f_u35(float, float); +IMPORT CONST float Sleef_logf_u35(float); +IMPORT CONST float Sleef_cbrtf_u35(float); +IMPORT CONST float Sleef_sinf_u10(float); +IMPORT CONST 
float Sleef_cosf_u10(float); +IMPORT CONST Sleef_float2 Sleef_sincosf_u10(float); +IMPORT CONST float Sleef_fastsinf_u3500(float); +IMPORT CONST float Sleef_fastcosf_u3500(float); +IMPORT CONST float Sleef_tanf_u10(float); +IMPORT CONST float Sleef_asinf_u10(float); +IMPORT CONST float Sleef_acosf_u10(float); +IMPORT CONST float Sleef_atanf_u10(float); +IMPORT CONST float Sleef_atan2f_u10(float, float); +IMPORT CONST float Sleef_logf_u10(float); +IMPORT CONST float Sleef_cbrtf_u10(float); +IMPORT CONST float Sleef_expf_u10(float); +IMPORT CONST float Sleef_powf_u10(float, float); +IMPORT CONST float Sleef_fastpowf_u3500(float, float); +IMPORT CONST float Sleef_sinhf_u10(float); +IMPORT CONST float Sleef_coshf_u10(float); +IMPORT CONST float Sleef_tanhf_u10(float); +IMPORT CONST float Sleef_sinhf_u35(float); +IMPORT CONST float Sleef_coshf_u35(float); +IMPORT CONST float Sleef_tanhf_u35(float); +IMPORT CONST float Sleef_asinhf_u10(float); +IMPORT CONST float Sleef_acoshf_u10(float); +IMPORT CONST float Sleef_atanhf_u10(float); +IMPORT CONST float Sleef_exp2f_u10(float); +IMPORT CONST float Sleef_exp10f_u10(float); +IMPORT CONST float Sleef_exp2f_u35(float); +IMPORT CONST float Sleef_exp10f_u35(float); +IMPORT CONST float Sleef_expm1f_u10(float); +IMPORT CONST float Sleef_log10f_u10(float); +IMPORT CONST float Sleef_log2f_u10(float); +IMPORT CONST float Sleef_log2f_u35(float); +IMPORT CONST float Sleef_log1pf_u10(float); +IMPORT CONST Sleef_float2 Sleef_sincospif_u05(float); +IMPORT CONST Sleef_float2 Sleef_sincospif_u35(float); +IMPORT CONST float Sleef_sinpif_u05(float d); +IMPORT CONST float Sleef_cospif_u05(float d); +IMPORT CONST float Sleef_ldexpf(float, int); +IMPORT CONST int Sleef_ilogbf(float); +IMPORT CONST float Sleef_fmaf(float, float, float); +IMPORT CONST float Sleef_sqrtf(float); +IMPORT CONST float Sleef_sqrtf_u05(float); +IMPORT CONST float Sleef_sqrtf_u35(float); + +IMPORT CONST float Sleef_hypotf_u05(float, float); +IMPORT CONST float 
Sleef_hypotf_u35(float, float); + +IMPORT CONST float Sleef_fabsf(float); +IMPORT CONST float Sleef_copysignf(float, float); +IMPORT CONST float Sleef_fmaxf(float, float); +IMPORT CONST float Sleef_fminf(float, float); +IMPORT CONST float Sleef_fdimf(float, float); +IMPORT CONST float Sleef_truncf(float); +IMPORT CONST float Sleef_floorf(float); +IMPORT CONST float Sleef_ceilf(float); +IMPORT CONST float Sleef_roundf(float); +IMPORT CONST float Sleef_rintf(float); +IMPORT CONST float Sleef_nextafterf(float, float); +IMPORT CONST float Sleef_frfrexpf(float); +IMPORT CONST int Sleef_expfrexpf(float); +IMPORT CONST float Sleef_fmodf(float, float); +IMPORT CONST float Sleef_remainderf(float, float); +IMPORT CONST Sleef_float2 Sleef_modff(float); + +IMPORT CONST float Sleef_lgammaf_u10(float); +IMPORT CONST float Sleef_tgammaf_u10(float); +IMPORT CONST float Sleef_erff_u10(float); +IMPORT CONST float Sleef_erfcf_u15(float); + +IMPORT CONST Sleef_longdouble2 Sleef_sincospil_u05(long double); +IMPORT CONST Sleef_longdouble2 Sleef_sincospil_u35(long double); + +#if defined(Sleef_quad2_DEFINED) +IMPORT CONST Sleef_quad2 Sleef_sincospiq_u05(Sleef_quad); +IMPORT CONST Sleef_quad2 Sleef_sincospiq_u35(Sleef_quad); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + +IMPORT CONST __m128d Sleef_sind2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_sind2_u35(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_cosd2_u35(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u35(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u35(__m128d); +IMPORT CONST __m128d Sleef_tand2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_tand2_u35(__m128d); +IMPORT CONST __m128d Sleef_asind2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_asind2_u35(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u35(__m128d); +IMPORT CONST __m128d 
Sleef_cinz_acosd2_u35(__m128d); +IMPORT CONST __m128d Sleef_atand2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_atand2_u35(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u35(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_atan2d2_u35(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_logd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_cbrtd2_u35(__m128d); +IMPORT CONST __m128d Sleef_sind2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_sind2_u10(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_cosd2_u10(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u10(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u10(__m128d); +IMPORT CONST __m128d Sleef_tand2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_tand2_u10(__m128d); +IMPORT CONST __m128d Sleef_asind2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_asind2_u10(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_acosd2_u10(__m128d); +IMPORT CONST __m128d Sleef_atand2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_atand2_u10(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u10(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_atan2d2_u10(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_logd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_cbrtd2_u10(__m128d); +IMPORT CONST __m128d Sleef_expd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_expd2_u10(__m128d); +IMPORT CONST __m128d Sleef_powd2_u10(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_powd2_u10(__m128d, __m128d); +IMPORT CONST __m128d Sleef_sinhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_coshd2_u10(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u10(__m128d); 
+IMPORT CONST __m128d Sleef_cinz_tanhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_sinhd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinhd2_u35(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_coshd2_u35(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_tanhd2_u35(__m128d); +IMPORT CONST __m128d Sleef_fastsind2_u3500(__m128d); +IMPORT CONST __m128d Sleef_cinz_fastsind2_u3500(__m128d); +IMPORT CONST __m128d Sleef_fastcosd2_u3500(__m128d); +IMPORT CONST __m128d Sleef_cinz_fastcosd2_u3500(__m128d); +IMPORT CONST __m128d Sleef_fastpowd2_u3500(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fastpowd2_u3500(__m128d, __m128d); +IMPORT CONST __m128d Sleef_asinhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_asinhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_acoshd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_acoshd2_u10(__m128d); +IMPORT CONST __m128d Sleef_atanhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_atanhd2_u10(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp2d2_u10(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp2d2_u35(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp10d2_u10(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp10d2_u35(__m128d); +IMPORT CONST __m128d Sleef_expm1d2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_expm1d2_u10(__m128d); +IMPORT CONST __m128d Sleef_log10d2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_log10d2_u10(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_log2d2_u10(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_log2d2_u35(__m128d); +IMPORT CONST __m128d Sleef_log1pd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_log1pd2_u10(__m128d); +IMPORT CONST 
Sleef___m128d_2 Sleef_sincospid2_u05(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u05(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u35(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u35(__m128d); +IMPORT CONST __m128d Sleef_sinpid2_u05(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinpid2_u05(__m128d); +IMPORT CONST __m128d Sleef_cospid2_u05(__m128d); +IMPORT CONST __m128d Sleef_cinz_cospid2_u05(__m128d); +IMPORT CONST __m128d Sleef_ldexpd2(__m128d, __m128i); +IMPORT CONST __m128d Sleef_cinz_ldexpd2(__m128d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd2(__m128d); +IMPORT CONST __m128i Sleef_cinz_ilogbd2(__m128d); +IMPORT CONST __m128d Sleef_fmad2(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmad2(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_sqrtd2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u05(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_u05(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u35(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_u35(__m128d); +IMPORT CONST __m128d Sleef_hypotd2_u05(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_hypotd2_u05(__m128d, __m128d); +IMPORT CONST __m128d Sleef_hypotd2_u35(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_hypotd2_u35(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fabsd2(__m128d); +IMPORT CONST __m128d Sleef_cinz_fabsd2(__m128d); +IMPORT CONST __m128d Sleef_copysignd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_copysignd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmaxd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmaxd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmind2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmind2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fdimd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fdimd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_truncd2(__m128d); +IMPORT CONST __m128d Sleef_cinz_truncd2(__m128d); +IMPORT 
CONST __m128d Sleef_floord2(__m128d); +IMPORT CONST __m128d Sleef_cinz_floord2(__m128d); +IMPORT CONST __m128d Sleef_ceild2(__m128d); +IMPORT CONST __m128d Sleef_cinz_ceild2(__m128d); +IMPORT CONST __m128d Sleef_roundd2(__m128d); +IMPORT CONST __m128d Sleef_cinz_roundd2(__m128d); +IMPORT CONST __m128d Sleef_rintd2(__m128d); +IMPORT CONST __m128d Sleef_cinz_rintd2(__m128d); +IMPORT CONST __m128d Sleef_nextafterd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_nextafterd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_frfrexpd2(__m128d); +IMPORT CONST __m128d Sleef_cinz_frfrexpd2(__m128d); +IMPORT CONST __m128i Sleef_expfrexpd2(__m128d); +IMPORT CONST __m128i Sleef_cinz_expfrexpd2(__m128d); +IMPORT CONST __m128d Sleef_fmodd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmodd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_remainderd2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_remainderd2(__m128d, __m128d); +IMPORT CONST Sleef___m128d_2 Sleef_modfd2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_modfd2(__m128d); +IMPORT CONST __m128d Sleef_lgammad2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_lgammad2_u10(__m128d); +IMPORT CONST __m128d Sleef_tgammad2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_tgammad2_u10(__m128d); +IMPORT CONST __m128d Sleef_erfd2_u10(__m128d); +IMPORT CONST __m128d Sleef_cinz_erfd2_u10(__m128d); +IMPORT CONST __m128d Sleef_erfcd2_u15(__m128d); +IMPORT CONST __m128d Sleef_cinz_erfcd2_u15(__m128d); +IMPORT CONST int Sleef_getIntd2(int); +IMPORT CONST void *Sleef_getPtrd2(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +IMPORT CONST __m128 Sleef_sinf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_sinf4_u35(__m128); +IMPORT CONST __m128 Sleef_cosf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_cosf4_u35(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u35(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincosf4_u35(__m128); +IMPORT 
CONST __m128 Sleef_tanf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_tanf4_u35(__m128); +IMPORT CONST __m128 Sleef_asinf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_asinf4_u35(__m128); +IMPORT CONST __m128 Sleef_acosf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_acosf4_u35(__m128); +IMPORT CONST __m128 Sleef_atanf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_atanf4_u35(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u35(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_atan2f4_u35(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_logf4_u35(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_cbrtf4_u35(__m128); +IMPORT CONST __m128 Sleef_sinf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_sinf4_u10(__m128); +IMPORT CONST __m128 Sleef_cosf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_cosf4_u10(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u10(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincosf4_u10(__m128); +IMPORT CONST __m128 Sleef_tanf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_tanf4_u10(__m128); +IMPORT CONST __m128 Sleef_asinf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_asinf4_u10(__m128); +IMPORT CONST __m128 Sleef_acosf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_acosf4_u10(__m128); +IMPORT CONST __m128 Sleef_atanf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_atanf4_u10(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u10(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_atan2f4_u10(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_logf4_u10(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_cbrtf4_u10(__m128); +IMPORT CONST __m128 Sleef_expf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_expf4_u10(__m128); +IMPORT CONST __m128 Sleef_powf4_u10(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_powf4_u10(__m128, __m128); +IMPORT CONST __m128 Sleef_sinhf4_u10(__m128); +IMPORT CONST __m128 
Sleef_cinz_sinhf4_u10(__m128); +IMPORT CONST __m128 Sleef_coshf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_coshf4_u10(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_tanhf4_u10(__m128); +IMPORT CONST __m128 Sleef_sinhf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_sinhf4_u35(__m128); +IMPORT CONST __m128 Sleef_coshf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_coshf4_u35(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_tanhf4_u35(__m128); +IMPORT CONST __m128 Sleef_fastsinf4_u3500(__m128); +IMPORT CONST __m128 Sleef_cinz_fastsinf4_u3500(__m128); +IMPORT CONST __m128 Sleef_fastcosf4_u3500(__m128); +IMPORT CONST __m128 Sleef_cinz_fastcosf4_u3500(__m128); +IMPORT CONST __m128 Sleef_fastpowf4_u3500(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fastpowf4_u3500(__m128, __m128); +IMPORT CONST __m128 Sleef_asinhf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_asinhf4_u10(__m128); +IMPORT CONST __m128 Sleef_acoshf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_acoshf4_u10(__m128); +IMPORT CONST __m128 Sleef_atanhf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_atanhf4_u10(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_exp2f4_u10(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_exp2f4_u35(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_exp10f4_u10(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_exp10f4_u35(__m128); +IMPORT CONST __m128 Sleef_expm1f4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_expm1f4_u10(__m128); +IMPORT CONST __m128 Sleef_log10f4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_log10f4_u10(__m128); +IMPORT CONST __m128 Sleef_log2f4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_log2f4_u10(__m128); +IMPORT CONST __m128 Sleef_log2f4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_log2f4_u35(__m128); +IMPORT CONST __m128 
Sleef_log1pf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_log1pf4_u10(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u05(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincospif4_u05(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u35(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincospif4_u35(__m128); +IMPORT CONST __m128 Sleef_sinpif4_u05(__m128); +IMPORT CONST __m128 Sleef_cinz_sinpif4_u05(__m128); +IMPORT CONST __m128 Sleef_cospif4_u05(__m128); +IMPORT CONST __m128 Sleef_cinz_cospif4_u05(__m128); +IMPORT CONST __m128 Sleef_fmaf4(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmaf4(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_sqrtf4(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u05(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_u05(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u35(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_u35(__m128); +IMPORT CONST __m128 Sleef_hypotf4_u05(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_hypotf4_u05(__m128, __m128); +IMPORT CONST __m128 Sleef_hypotf4_u35(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_hypotf4_u35(__m128, __m128); +IMPORT CONST __m128 Sleef_fabsf4(__m128); +IMPORT CONST __m128 Sleef_cinz_fabsf4(__m128); +IMPORT CONST __m128 Sleef_copysignf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_copysignf4(__m128, __m128); +IMPORT CONST __m128 Sleef_fmaxf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmaxf4(__m128, __m128); +IMPORT CONST __m128 Sleef_fminf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fminf4(__m128, __m128); +IMPORT CONST __m128 Sleef_fdimf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fdimf4(__m128, __m128); +IMPORT CONST __m128 Sleef_truncf4(__m128); +IMPORT CONST __m128 Sleef_cinz_truncf4(__m128); +IMPORT CONST __m128 Sleef_floorf4(__m128); +IMPORT CONST __m128 Sleef_cinz_floorf4(__m128); +IMPORT CONST __m128 Sleef_ceilf4(__m128); +IMPORT CONST __m128 Sleef_cinz_ceilf4(__m128); +IMPORT CONST __m128 
Sleef_roundf4(__m128); +IMPORT CONST __m128 Sleef_cinz_roundf4(__m128); +IMPORT CONST __m128 Sleef_rintf4(__m128); +IMPORT CONST __m128 Sleef_cinz_rintf4(__m128); +IMPORT CONST __m128 Sleef_nextafterf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_nextafterf4(__m128, __m128); +IMPORT CONST __m128 Sleef_frfrexpf4(__m128); +IMPORT CONST __m128 Sleef_cinz_frfrexpf4(__m128); +IMPORT CONST __m128 Sleef_fmodf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmodf4(__m128, __m128); +IMPORT CONST __m128 Sleef_remainderf4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_remainderf4(__m128, __m128); +IMPORT CONST Sleef___m128_2 Sleef_modff4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_modff4(__m128); +IMPORT CONST __m128 Sleef_lgammaf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_lgammaf4_u10(__m128); +IMPORT CONST __m128 Sleef_tgammaf4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_tgammaf4_u10(__m128); +IMPORT CONST __m128 Sleef_erff4_u10(__m128); +IMPORT CONST __m128 Sleef_cinz_erff4_u10(__m128); +IMPORT CONST __m128 Sleef_erfcf4_u15(__m128); +IMPORT CONST __m128 Sleef_cinz_erfcf4_u15(__m128); +IMPORT CONST int Sleef_getIntf4(int); +IMPORT CONST int Sleef_cinz_getIntf4(int); +IMPORT CONST void *Sleef_getPtrf4(int); +IMPORT CONST void *Sleef_cinz_getPtrf4(int); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + +IMPORT CONST __m128d Sleef_sind2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sind2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_cosd2_u35sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u35sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_tand2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_tand2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_asind2_u35sse2(__m128d); +IMPORT CONST __m128d 
Sleef_cinz_asind2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_acosd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_atand2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_atand2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u35sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_atan2d2_u35sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_logd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_cbrtd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_sind2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sind2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_cosd2_u10sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u10sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_tand2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_tand2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_asind2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_asind2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_acosd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_atand2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_atand2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u10sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_atan2d2_u10sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_logd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_cbrtd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_expd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_expd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_powd2_u10sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_powd2_u10sse2(__m128d, __m128d); +IMPORT 
CONST __m128d Sleef_sinhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_coshd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_tanhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_sinhd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinhd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_coshd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_tanhd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_fastsind2_u3500sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_fastsind2_u3500sse2(__m128d); +IMPORT CONST __m128d Sleef_fastcosd2_u3500sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_fastcosd2_u3500sse2(__m128d); +IMPORT CONST __m128d Sleef_fastpowd2_u3500sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fastpowd2_u3500sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_asinhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_asinhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_acoshd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_acoshd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_atanhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_atanhd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp2d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp2d2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp10d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp10d2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_expm1d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_expm1d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_log10d2_u10sse2(__m128d); 
+IMPORT CONST __m128d Sleef_cinz_log10d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_log2d2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_log2d2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_log1pd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_log1pd2_u10sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u05sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u05sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u35sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_sinpid2_u05sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinpid2_u05sse2(__m128d); +IMPORT CONST __m128d Sleef_cospid2_u05sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_cospid2_u05sse2(__m128d); +IMPORT CONST __m128d Sleef_ldexpd2_sse2(__m128d, __m128i); +IMPORT CONST __m128d Sleef_cinz_ldexpd2_sse2(__m128d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd2_sse2(__m128d); +IMPORT CONST __m128i Sleef_cinz_ilogbd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_fmad2_sse2(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmad2_sse2(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_sqrtd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u05sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_u05sse2(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_u35sse2(__m128d); +IMPORT CONST __m128d Sleef_hypotd2_u05sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_hypotd2_u05sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_hypotd2_u35sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_hypotd2_u35sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fabsd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_fabsd2_sse2(__m128d); +IMPORT CONST __m128d 
Sleef_copysignd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_copysignd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmaxd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmaxd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmind2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmind2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fdimd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fdimd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_truncd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_truncd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_floord2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_floord2_sse2(__m128d); +IMPORT CONST __m128d Sleef_ceild2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_ceild2_sse2(__m128d); +IMPORT CONST __m128d Sleef_roundd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_roundd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_rintd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_rintd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_nextafterd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_nextafterd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_frfrexpd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_frfrexpd2_sse2(__m128d); +IMPORT CONST __m128i Sleef_expfrexpd2_sse2(__m128d); +IMPORT CONST __m128i Sleef_cinz_expfrexpd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_fmodd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmodd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_remainderd2_sse2(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_remainderd2_sse2(__m128d, __m128d); +IMPORT CONST Sleef___m128d_2 Sleef_modfd2_sse2(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_modfd2_sse2(__m128d); +IMPORT CONST __m128d Sleef_lgammad2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_lgammad2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_tgammad2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_tgammad2_u10sse2(__m128d); +IMPORT CONST __m128d 
Sleef_erfd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_erfd2_u10sse2(__m128d); +IMPORT CONST __m128d Sleef_erfcd2_u15sse2(__m128d); +IMPORT CONST __m128d Sleef_cinz_erfcd2_u15sse2(__m128d); +IMPORT CONST int Sleef_getIntd2_sse2(int); +IMPORT CONST void *Sleef_getPtrd2_sse2(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +IMPORT CONST __m128 Sleef_sinf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sinf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cosf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_cosf4_u35sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u35sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincosf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_tanf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_tanf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_asinf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_asinf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_acosf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_acosf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_atanf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_atanf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u35sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_atan2f4_u35sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_logf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_cbrtf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_sinf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sinf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cosf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_cosf4_u10sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u10sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincosf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_tanf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_tanf4_u10sse2(__m128); +IMPORT CONST __m128 
Sleef_asinf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_asinf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_acosf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_acosf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_atanf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_atanf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u10sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_atan2f4_u10sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_logf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_cbrtf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_expf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_expf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_powf4_u10sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_powf4_u10sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_sinhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sinhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_coshf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_coshf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_tanhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_sinhf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sinhf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_coshf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_coshf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_tanhf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_fastsinf4_u3500sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_fastsinf4_u3500sse2(__m128); +IMPORT CONST __m128 Sleef_fastcosf4_u3500sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_fastcosf4_u3500sse2(__m128); +IMPORT CONST __m128 Sleef_fastpowf4_u3500sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fastpowf4_u3500sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_asinhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_asinhf4_u10sse2(__m128); +IMPORT CONST 
__m128 Sleef_acoshf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_acoshf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_atanhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_atanhf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_exp2f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_exp2f4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_exp10f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_exp10f4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_expm1f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_expm1f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_log10f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_log10f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_log2f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_log2f4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_log2f4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_log2f4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_log1pf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_log1pf4_u10sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u05sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincospif4_u05sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u35sse2(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincospif4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_sinpif4_u05sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sinpif4_u05sse2(__m128); +IMPORT CONST __m128 Sleef_cospif4_u05sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_cospif4_u05sse2(__m128); +IMPORT CONST __m128 Sleef_fmaf4_sse2(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmaf4_sse2(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_sqrtf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_sse2(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u05sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_u05sse2(__m128); 
+IMPORT CONST __m128 Sleef_sqrtf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_u35sse2(__m128); +IMPORT CONST __m128 Sleef_hypotf4_u05sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_hypotf4_u05sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_hypotf4_u35sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_hypotf4_u35sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_fabsf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_fabsf4_sse2(__m128); +IMPORT CONST __m128 Sleef_copysignf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_copysignf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_fmaxf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmaxf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_fminf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fminf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_fdimf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fdimf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_truncf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_truncf4_sse2(__m128); +IMPORT CONST __m128 Sleef_floorf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_floorf4_sse2(__m128); +IMPORT CONST __m128 Sleef_ceilf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_ceilf4_sse2(__m128); +IMPORT CONST __m128 Sleef_roundf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_roundf4_sse2(__m128); +IMPORT CONST __m128 Sleef_rintf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_rintf4_sse2(__m128); +IMPORT CONST __m128 Sleef_nextafterf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_nextafterf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_frfrexpf4_sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_frfrexpf4_sse2(__m128); +IMPORT CONST __m128 Sleef_fmodf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmodf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_remainderf4_sse2(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_remainderf4_sse2(__m128, __m128); +IMPORT CONST Sleef___m128_2 Sleef_modff4_sse2(__m128); +IMPORT CONST 
Sleef___m128_2 Sleef_cinz_modff4_sse2(__m128); +IMPORT CONST __m128 Sleef_lgammaf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_lgammaf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_tgammaf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_tgammaf4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_erff4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_erff4_u10sse2(__m128); +IMPORT CONST __m128 Sleef_erfcf4_u15sse2(__m128); +IMPORT CONST __m128 Sleef_cinz_erfcf4_u15sse2(__m128); +IMPORT CONST int Sleef_getIntf4_sse2(int); +IMPORT CONST int Sleef_cinz_getIntf4_sse2(int); +IMPORT CONST void *Sleef_getPtrf4_sse2(int); +IMPORT CONST void *Sleef_cinz_getPtrf4_sse2(int); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED +#endif + +IMPORT CONST __m128d Sleef_sind2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sind2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_cosd2_u35sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u35sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_tand2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_tand2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_asind2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_asind2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_acosd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_atand2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_atand2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u35sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_atan2d2_u35sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_logd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_cbrtd2_u35sse4(__m128d); +IMPORT CONST __m128d 
Sleef_sind2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sind2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_cosd2_u10sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u10sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincosd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_tand2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_tand2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_asind2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_asind2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_acosd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_atand2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_atand2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u10sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_atan2d2_u10sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_logd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_cbrtd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_expd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_expd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_powd2_u10sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_powd2_u10sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_sinhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_coshd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_tanhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_sinhd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinhd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_coshd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u35sse4(__m128d); +IMPORT CONST __m128d 
Sleef_cinz_tanhd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_fastsind2_u3500sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_fastsind2_u3500sse4(__m128d); +IMPORT CONST __m128d Sleef_fastcosd2_u3500sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_fastcosd2_u3500sse4(__m128d); +IMPORT CONST __m128d Sleef_fastpowd2_u3500sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fastpowd2_u3500sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_asinhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_asinhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_acoshd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_acoshd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_atanhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_atanhd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp2d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp2d2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp10d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_exp10d2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_expm1d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_expm1d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_log10d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_log10d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_log2d2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_log2d2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_log1pd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_log1pd2_u10sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u05sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_sincospid2_u05sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u35sse4(__m128d); +IMPORT CONST Sleef___m128d_2 
Sleef_cinz_sincospid2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_sinpid2_u05sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sinpid2_u05sse4(__m128d); +IMPORT CONST __m128d Sleef_cospid2_u05sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_cospid2_u05sse4(__m128d); +IMPORT CONST __m128d Sleef_ldexpd2_sse4(__m128d, __m128i); +IMPORT CONST __m128d Sleef_cinz_ldexpd2_sse4(__m128d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd2_sse4(__m128d); +IMPORT CONST __m128i Sleef_cinz_ilogbd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_fmad2_sse4(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmad2_sse4(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_sqrtd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u05sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_u05sse4(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_sqrtd2_u35sse4(__m128d); +IMPORT CONST __m128d Sleef_hypotd2_u05sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_hypotd2_u05sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_hypotd2_u35sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_hypotd2_u35sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fabsd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_fabsd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_copysignd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_copysignd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmaxd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmaxd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmind2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmind2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fdimd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fdimd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_truncd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_truncd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_floord2_sse4(__m128d); +IMPORT 
CONST __m128d Sleef_cinz_floord2_sse4(__m128d); +IMPORT CONST __m128d Sleef_ceild2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_ceild2_sse4(__m128d); +IMPORT CONST __m128d Sleef_roundd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_roundd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_rintd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_rintd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_nextafterd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_nextafterd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_frfrexpd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_frfrexpd2_sse4(__m128d); +IMPORT CONST __m128i Sleef_expfrexpd2_sse4(__m128d); +IMPORT CONST __m128i Sleef_cinz_expfrexpd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_fmodd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_fmodd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_remainderd2_sse4(__m128d, __m128d); +IMPORT CONST __m128d Sleef_cinz_remainderd2_sse4(__m128d, __m128d); +IMPORT CONST Sleef___m128d_2 Sleef_modfd2_sse4(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_cinz_modfd2_sse4(__m128d); +IMPORT CONST __m128d Sleef_lgammad2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_lgammad2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_tgammad2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_tgammad2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_erfd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_erfd2_u10sse4(__m128d); +IMPORT CONST __m128d Sleef_erfcd2_u15sse4(__m128d); +IMPORT CONST __m128d Sleef_cinz_erfcd2_u15sse4(__m128d); +IMPORT CONST int Sleef_getIntd2_sse4(int); +IMPORT CONST void *Sleef_getPtrd2_sse4(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +IMPORT CONST __m128 Sleef_sinf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sinf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cosf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_cosf4_u35sse4(__m128); +IMPORT CONST 
Sleef___m128_2 Sleef_sincosf4_u35sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincosf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_tanf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_tanf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_asinf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_asinf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_acosf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_acosf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_atanf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_atanf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u35sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_atan2f4_u35sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_logf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_cbrtf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_sinf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sinf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cosf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_cosf4_u10sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u10sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincosf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_tanf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_tanf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_asinf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_asinf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_acosf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_acosf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_atanf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_atanf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u10sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_atan2f4_u10sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_logf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_cbrtf4_u10sse4(__m128); +IMPORT CONST __m128 
Sleef_expf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_expf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_powf4_u10sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_powf4_u10sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_sinhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sinhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_coshf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_coshf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_tanhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_sinhf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sinhf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_coshf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_coshf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_tanhf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_fastsinf4_u3500sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_fastsinf4_u3500sse4(__m128); +IMPORT CONST __m128 Sleef_fastcosf4_u3500sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_fastcosf4_u3500sse4(__m128); +IMPORT CONST __m128 Sleef_fastpowf4_u3500sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fastpowf4_u3500sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_asinhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_asinhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_acoshf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_acoshf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_atanhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_atanhf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_exp2f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_exp2f4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_exp10f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_exp10f4_u35sse4(__m128); +IMPORT CONST __m128 
Sleef_expm1f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_expm1f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_log10f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_log10f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_log2f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_log2f4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_log2f4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_log2f4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_log1pf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_log1pf4_u10sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u05sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincospif4_u05sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u35sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_sincospif4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_sinpif4_u05sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sinpif4_u05sse4(__m128); +IMPORT CONST __m128 Sleef_cospif4_u05sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_cospif4_u05sse4(__m128); +IMPORT CONST __m128 Sleef_fmaf4_sse4(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmaf4_sse4(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_sqrtf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_sse4(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u05sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_u05sse4(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_sqrtf4_u35sse4(__m128); +IMPORT CONST __m128 Sleef_hypotf4_u05sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_hypotf4_u05sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_hypotf4_u35sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_hypotf4_u35sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_fabsf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_fabsf4_sse4(__m128); +IMPORT CONST __m128 Sleef_copysignf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_copysignf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_fmaxf4_sse4(__m128, __m128); +IMPORT CONST __m128 
Sleef_cinz_fmaxf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_fminf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fminf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_fdimf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fdimf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_truncf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_truncf4_sse4(__m128); +IMPORT CONST __m128 Sleef_floorf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_floorf4_sse4(__m128); +IMPORT CONST __m128 Sleef_ceilf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_ceilf4_sse4(__m128); +IMPORT CONST __m128 Sleef_roundf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_roundf4_sse4(__m128); +IMPORT CONST __m128 Sleef_rintf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_rintf4_sse4(__m128); +IMPORT CONST __m128 Sleef_nextafterf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_nextafterf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_frfrexpf4_sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_frfrexpf4_sse4(__m128); +IMPORT CONST __m128 Sleef_fmodf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_fmodf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_remainderf4_sse4(__m128, __m128); +IMPORT CONST __m128 Sleef_cinz_remainderf4_sse4(__m128, __m128); +IMPORT CONST Sleef___m128_2 Sleef_modff4_sse4(__m128); +IMPORT CONST Sleef___m128_2 Sleef_cinz_modff4_sse4(__m128); +IMPORT CONST __m128 Sleef_lgammaf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_lgammaf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_tgammaf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_tgammaf4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_erff4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_erff4_u10sse4(__m128); +IMPORT CONST __m128 Sleef_erfcf4_u15sse4(__m128); +IMPORT CONST __m128 Sleef_cinz_erfcf4_u15sse4(__m128); +IMPORT CONST int Sleef_getIntf4_sse4(int); +IMPORT CONST int Sleef_cinz_getIntf4_sse4(int); +IMPORT CONST void *Sleef_getPtrf4_sse4(int); +IMPORT CONST void 
*Sleef_cinz_getPtrf4_sse4(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +IMPORT CONST __m256d Sleef_sind4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_sind4_u35(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_cosd4_u35(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u35(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincosd4_u35(__m256d); +IMPORT CONST __m256d Sleef_tand4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_tand4_u35(__m256d); +IMPORT CONST __m256d Sleef_asind4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_asind4_u35(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_acosd4_u35(__m256d); +IMPORT CONST __m256d Sleef_atand4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_atand4_u35(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u35(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_atan2d4_u35(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_logd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_cbrtd4_u35(__m256d); +IMPORT CONST __m256d Sleef_sind4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_sind4_u10(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_cosd4_u10(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u10(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincosd4_u10(__m256d); +IMPORT CONST __m256d Sleef_tand4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_tand4_u10(__m256d); +IMPORT CONST __m256d Sleef_asind4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_asind4_u10(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_acosd4_u10(__m256d); +IMPORT CONST __m256d Sleef_atand4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_atand4_u10(__m256d); +IMPORT CONST 
__m256d Sleef_atan2d4_u10(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_atan2d4_u10(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_logd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_cbrtd4_u10(__m256d); +IMPORT CONST __m256d Sleef_expd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_expd4_u10(__m256d); +IMPORT CONST __m256d Sleef_powd4_u10(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_powd4_u10(__m256d, __m256d); +IMPORT CONST __m256d Sleef_sinhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_sinhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_coshd4_u10(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_tanhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_sinhd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_sinhd4_u35(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_coshd4_u35(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_tanhd4_u35(__m256d); +IMPORT CONST __m256d Sleef_fastsind4_u3500(__m256d); +IMPORT CONST __m256d Sleef_cinz_fastsind4_u3500(__m256d); +IMPORT CONST __m256d Sleef_fastcosd4_u3500(__m256d); +IMPORT CONST __m256d Sleef_cinz_fastcosd4_u3500(__m256d); +IMPORT CONST __m256d Sleef_fastpowd4_u3500(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fastpowd4_u3500(__m256d, __m256d); +IMPORT CONST __m256d Sleef_asinhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_asinhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_acoshd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_acoshd4_u10(__m256d); +IMPORT CONST __m256d Sleef_atanhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_atanhd4_u10(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp2d4_u10(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u35(__m256d); +IMPORT CONST __m256d 
Sleef_cinz_exp2d4_u35(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp10d4_u10(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp10d4_u35(__m256d); +IMPORT CONST __m256d Sleef_expm1d4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_expm1d4_u10(__m256d); +IMPORT CONST __m256d Sleef_log10d4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_log10d4_u10(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_log2d4_u10(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_log2d4_u35(__m256d); +IMPORT CONST __m256d Sleef_log1pd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_log1pd4_u10(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u05(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincospid4_u05(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u35(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincospid4_u35(__m256d); +IMPORT CONST __m256d Sleef_sinpid4_u05(__m256d); +IMPORT CONST __m256d Sleef_cinz_sinpid4_u05(__m256d); +IMPORT CONST __m256d Sleef_cospid4_u05(__m256d); +IMPORT CONST __m256d Sleef_cinz_cospid4_u05(__m256d); +IMPORT CONST __m256d Sleef_ldexpd4(__m256d, __m128i); +IMPORT CONST __m256d Sleef_cinz_ldexpd4(__m256d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd4(__m256d); +IMPORT CONST __m128i Sleef_cinz_ilogbd4(__m256d); +IMPORT CONST __m256d Sleef_fmad4(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmad4(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_sqrtd4(__m256d); +IMPORT CONST __m256d Sleef_cinz_sqrtd4(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u05(__m256d); +IMPORT CONST __m256d Sleef_cinz_sqrtd4_u05(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u35(__m256d); +IMPORT CONST __m256d Sleef_cinz_sqrtd4_u35(__m256d); +IMPORT CONST __m256d Sleef_hypotd4_u05(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_hypotd4_u05(__m256d, __m256d); +IMPORT 
CONST __m256d Sleef_hypotd4_u35(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_hypotd4_u35(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fabsd4(__m256d); +IMPORT CONST __m256d Sleef_cinz_fabsd4(__m256d); +IMPORT CONST __m256d Sleef_copysignd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_copysignd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmaxd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmaxd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmind4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmind4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fdimd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fdimd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_truncd4(__m256d); +IMPORT CONST __m256d Sleef_cinz_truncd4(__m256d); +IMPORT CONST __m256d Sleef_floord4(__m256d); +IMPORT CONST __m256d Sleef_cinz_floord4(__m256d); +IMPORT CONST __m256d Sleef_ceild4(__m256d); +IMPORT CONST __m256d Sleef_cinz_ceild4(__m256d); +IMPORT CONST __m256d Sleef_roundd4(__m256d); +IMPORT CONST __m256d Sleef_cinz_roundd4(__m256d); +IMPORT CONST __m256d Sleef_rintd4(__m256d); +IMPORT CONST __m256d Sleef_cinz_rintd4(__m256d); +IMPORT CONST __m256d Sleef_nextafterd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_nextafterd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_frfrexpd4(__m256d); +IMPORT CONST __m256d Sleef_cinz_frfrexpd4(__m256d); +IMPORT CONST __m128i Sleef_expfrexpd4(__m256d); +IMPORT CONST __m128i Sleef_cinz_expfrexpd4(__m256d); +IMPORT CONST __m256d Sleef_fmodd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmodd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_remainderd4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_remainderd4(__m256d, __m256d); +IMPORT CONST Sleef___m256d_2 Sleef_modfd4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_modfd4(__m256d); +IMPORT CONST __m256d Sleef_lgammad4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_lgammad4_u10(__m256d); +IMPORT CONST __m256d Sleef_tgammad4_u10(__m256d); +IMPORT CONST 
__m256d Sleef_cinz_tgammad4_u10(__m256d); +IMPORT CONST __m256d Sleef_erfd4_u10(__m256d); +IMPORT CONST __m256d Sleef_cinz_erfd4_u10(__m256d); +IMPORT CONST __m256d Sleef_erfcd4_u15(__m256d); +IMPORT CONST __m256d Sleef_cinz_erfcd4_u15(__m256d); +IMPORT CONST int Sleef_getIntd4(int); +IMPORT CONST void *Sleef_getPtrd4(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +IMPORT CONST __m256 Sleef_sinf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_sinf8_u35(__m256); +IMPORT CONST __m256 Sleef_cosf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_cosf8_u35(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u35(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincosf8_u35(__m256); +IMPORT CONST __m256 Sleef_tanf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_tanf8_u35(__m256); +IMPORT CONST __m256 Sleef_asinf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_asinf8_u35(__m256); +IMPORT CONST __m256 Sleef_acosf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_acosf8_u35(__m256); +IMPORT CONST __m256 Sleef_atanf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_atanf8_u35(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u35(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_atan2f8_u35(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_logf8_u35(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_cbrtf8_u35(__m256); +IMPORT CONST __m256 Sleef_sinf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_sinf8_u10(__m256); +IMPORT CONST __m256 Sleef_cosf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_cosf8_u10(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u10(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincosf8_u10(__m256); +IMPORT CONST __m256 Sleef_tanf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_tanf8_u10(__m256); +IMPORT CONST __m256 Sleef_asinf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_asinf8_u10(__m256); +IMPORT CONST 
__m256 Sleef_acosf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_acosf8_u10(__m256); +IMPORT CONST __m256 Sleef_atanf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_atanf8_u10(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u10(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_atan2f8_u10(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_logf8_u10(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_cbrtf8_u10(__m256); +IMPORT CONST __m256 Sleef_expf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_expf8_u10(__m256); +IMPORT CONST __m256 Sleef_powf8_u10(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_powf8_u10(__m256, __m256); +IMPORT CONST __m256 Sleef_sinhf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_sinhf8_u10(__m256); +IMPORT CONST __m256 Sleef_coshf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_coshf8_u10(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_tanhf8_u10(__m256); +IMPORT CONST __m256 Sleef_sinhf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_sinhf8_u35(__m256); +IMPORT CONST __m256 Sleef_coshf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_coshf8_u35(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_tanhf8_u35(__m256); +IMPORT CONST __m256 Sleef_fastsinf8_u3500(__m256); +IMPORT CONST __m256 Sleef_cinz_fastsinf8_u3500(__m256); +IMPORT CONST __m256 Sleef_fastcosf8_u3500(__m256); +IMPORT CONST __m256 Sleef_cinz_fastcosf8_u3500(__m256); +IMPORT CONST __m256 Sleef_fastpowf8_u3500(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fastpowf8_u3500(__m256, __m256); +IMPORT CONST __m256 Sleef_asinhf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_asinhf8_u10(__m256); +IMPORT CONST __m256 Sleef_acoshf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_acoshf8_u10(__m256); +IMPORT CONST __m256 Sleef_atanhf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_atanhf8_u10(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u10(__m256); 
+IMPORT CONST __m256 Sleef_cinz_exp2f8_u10(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_exp2f8_u35(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_exp10f8_u10(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_exp10f8_u35(__m256); +IMPORT CONST __m256 Sleef_expm1f8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_expm1f8_u10(__m256); +IMPORT CONST __m256 Sleef_log10f8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_log10f8_u10(__m256); +IMPORT CONST __m256 Sleef_log2f8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_log2f8_u10(__m256); +IMPORT CONST __m256 Sleef_log2f8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_log2f8_u35(__m256); +IMPORT CONST __m256 Sleef_log1pf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_log1pf8_u10(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u05(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincospif8_u05(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u35(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincospif8_u35(__m256); +IMPORT CONST __m256 Sleef_sinpif8_u05(__m256); +IMPORT CONST __m256 Sleef_cinz_sinpif8_u05(__m256); +IMPORT CONST __m256 Sleef_cospif8_u05(__m256); +IMPORT CONST __m256 Sleef_cinz_cospif8_u05(__m256); +IMPORT CONST __m256 Sleef_fmaf8(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fmaf8(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_sqrtf8(__m256); +IMPORT CONST __m256 Sleef_cinz_sqrtf8(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u05(__m256); +IMPORT CONST __m256 Sleef_cinz_sqrtf8_u05(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u35(__m256); +IMPORT CONST __m256 Sleef_cinz_sqrtf8_u35(__m256); +IMPORT CONST __m256 Sleef_hypotf8_u05(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_hypotf8_u05(__m256, __m256); +IMPORT CONST __m256 Sleef_hypotf8_u35(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_hypotf8_u35(__m256, __m256); +IMPORT CONST __m256 Sleef_fabsf8(__m256); +IMPORT 
CONST __m256 Sleef_cinz_fabsf8(__m256); +IMPORT CONST __m256 Sleef_copysignf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_copysignf8(__m256, __m256); +IMPORT CONST __m256 Sleef_fmaxf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fmaxf8(__m256, __m256); +IMPORT CONST __m256 Sleef_fminf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fminf8(__m256, __m256); +IMPORT CONST __m256 Sleef_fdimf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fdimf8(__m256, __m256); +IMPORT CONST __m256 Sleef_truncf8(__m256); +IMPORT CONST __m256 Sleef_cinz_truncf8(__m256); +IMPORT CONST __m256 Sleef_floorf8(__m256); +IMPORT CONST __m256 Sleef_cinz_floorf8(__m256); +IMPORT CONST __m256 Sleef_ceilf8(__m256); +IMPORT CONST __m256 Sleef_cinz_ceilf8(__m256); +IMPORT CONST __m256 Sleef_roundf8(__m256); +IMPORT CONST __m256 Sleef_cinz_roundf8(__m256); +IMPORT CONST __m256 Sleef_rintf8(__m256); +IMPORT CONST __m256 Sleef_cinz_rintf8(__m256); +IMPORT CONST __m256 Sleef_nextafterf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_nextafterf8(__m256, __m256); +IMPORT CONST __m256 Sleef_frfrexpf8(__m256); +IMPORT CONST __m256 Sleef_cinz_frfrexpf8(__m256); +IMPORT CONST __m256 Sleef_fmodf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fmodf8(__m256, __m256); +IMPORT CONST __m256 Sleef_remainderf8(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_remainderf8(__m256, __m256); +IMPORT CONST Sleef___m256_2 Sleef_modff8(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_modff8(__m256); +IMPORT CONST __m256 Sleef_lgammaf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_lgammaf8_u10(__m256); +IMPORT CONST __m256 Sleef_tgammaf8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_tgammaf8_u10(__m256); +IMPORT CONST __m256 Sleef_erff8_u10(__m256); +IMPORT CONST __m256 Sleef_cinz_erff8_u10(__m256); +IMPORT CONST __m256 Sleef_erfcf8_u15(__m256); +IMPORT CONST __m256 Sleef_cinz_erfcf8_u15(__m256); +IMPORT CONST int Sleef_getIntf8(int); +IMPORT CONST int Sleef_cinz_getIntf8(int); +IMPORT CONST void 
*Sleef_getPtrf8(int); +IMPORT CONST void *Sleef_cinz_getPtrf8(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +IMPORT CONST __m256d Sleef_sind4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sind4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_cosd4_u35avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u35avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincosd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_tand4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_tand4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_asind4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_asind4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_acosd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_atand4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_atand4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u35avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_atan2d4_u35avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_logd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_cbrtd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_sind4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sind4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_cosd4_u10avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u10avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincosd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_tand4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_tand4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_asind4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_asind4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u10avx(__m256d); +IMPORT CONST __m256d 
Sleef_cinz_acosd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_atand4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_atand4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u10avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_atan2d4_u10avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_logd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_cbrtd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_expd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_expd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_powd4_u10avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_powd4_u10avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_sinhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sinhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_coshd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_tanhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_sinhd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sinhd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_coshd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_tanhd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_fastsind4_u3500avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_fastsind4_u3500avx(__m256d); +IMPORT CONST __m256d Sleef_fastcosd4_u3500avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_fastcosd4_u3500avx(__m256d); +IMPORT CONST __m256d Sleef_fastpowd4_u3500avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fastpowd4_u3500avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_asinhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_asinhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_acoshd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_acoshd4_u10avx(__m256d); +IMPORT CONST __m256d 
Sleef_atanhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_atanhd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp2d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp2d4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp10d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_exp10d4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_expm1d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_expm1d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_log10d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_log10d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_log2d4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_log2d4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_log1pd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_log1pd4_u10avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u05avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincospid4_u05avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u35avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_sincospid4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_sinpid4_u05avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sinpid4_u05avx(__m256d); +IMPORT CONST __m256d Sleef_cospid4_u05avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_cospid4_u05avx(__m256d); +IMPORT CONST __m256d Sleef_ldexpd4_avx(__m256d, __m128i); +IMPORT CONST __m256d Sleef_cinz_ldexpd4_avx(__m256d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd4_avx(__m256d); +IMPORT CONST __m128i Sleef_cinz_ilogbd4_avx(__m256d); +IMPORT CONST __m256d Sleef_fmad4_avx(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmad4_avx(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_sqrtd4_avx(__m256d); +IMPORT CONST 
__m256d Sleef_cinz_sqrtd4_avx(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u05avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sqrtd4_u05avx(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_sqrtd4_u35avx(__m256d); +IMPORT CONST __m256d Sleef_hypotd4_u05avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_hypotd4_u05avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_hypotd4_u35avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_hypotd4_u35avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fabsd4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_fabsd4_avx(__m256d); +IMPORT CONST __m256d Sleef_copysignd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_copysignd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmaxd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmaxd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmind4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmind4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fdimd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fdimd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_truncd4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_truncd4_avx(__m256d); +IMPORT CONST __m256d Sleef_floord4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_floord4_avx(__m256d); +IMPORT CONST __m256d Sleef_ceild4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_ceild4_avx(__m256d); +IMPORT CONST __m256d Sleef_roundd4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_roundd4_avx(__m256d); +IMPORT CONST __m256d Sleef_rintd4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_rintd4_avx(__m256d); +IMPORT CONST __m256d Sleef_nextafterd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_nextafterd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_frfrexpd4_avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_frfrexpd4_avx(__m256d); +IMPORT CONST __m128i Sleef_expfrexpd4_avx(__m256d); +IMPORT CONST __m128i Sleef_cinz_expfrexpd4_avx(__m256d); +IMPORT CONST 
__m256d Sleef_fmodd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_fmodd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_remainderd4_avx(__m256d, __m256d); +IMPORT CONST __m256d Sleef_cinz_remainderd4_avx(__m256d, __m256d); +IMPORT CONST Sleef___m256d_2 Sleef_modfd4_avx(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_cinz_modfd4_avx(__m256d); +IMPORT CONST __m256d Sleef_lgammad4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_lgammad4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_tgammad4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_tgammad4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_erfd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_erfd4_u10avx(__m256d); +IMPORT CONST __m256d Sleef_erfcd4_u15avx(__m256d); +IMPORT CONST __m256d Sleef_cinz_erfcd4_u15avx(__m256d); +IMPORT CONST int Sleef_getIntd4_avx(int); +IMPORT CONST void *Sleef_getPtrd4_avx(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +IMPORT CONST __m256 Sleef_sinf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sinf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cosf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_cosf8_u35avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u35avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincosf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_tanf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_tanf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_asinf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_asinf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_acosf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_acosf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_atanf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_atanf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u35avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_atan2f8_u35avx(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_logf8_u35avx(__m256); +IMPORT 
CONST __m256 Sleef_cbrtf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_cbrtf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_sinf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sinf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cosf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_cosf8_u10avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u10avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincosf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_tanf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_tanf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_asinf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_asinf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_acosf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_acosf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_atanf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_atanf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u10avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_atan2f8_u10avx(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_logf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_cbrtf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_expf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_expf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_powf8_u10avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_powf8_u10avx(__m256, __m256); +IMPORT CONST __m256 Sleef_sinhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sinhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_coshf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_coshf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_tanhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_sinhf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sinhf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_coshf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_coshf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u35avx(__m256); +IMPORT CONST __m256 
Sleef_cinz_tanhf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_fastsinf8_u3500avx(__m256); +IMPORT CONST __m256 Sleef_cinz_fastsinf8_u3500avx(__m256); +IMPORT CONST __m256 Sleef_fastcosf8_u3500avx(__m256); +IMPORT CONST __m256 Sleef_cinz_fastcosf8_u3500avx(__m256); +IMPORT CONST __m256 Sleef_fastpowf8_u3500avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fastpowf8_u3500avx(__m256, __m256); +IMPORT CONST __m256 Sleef_asinhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_asinhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_acoshf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_acoshf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_atanhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_atanhf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_exp2f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_exp2f8_u35avx(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_exp10f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_exp10f8_u35avx(__m256); +IMPORT CONST __m256 Sleef_expm1f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_expm1f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_log10f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_log10f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_log2f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_log2f8_u10avx(__m256); +IMPORT CONST __m256 Sleef_log2f8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_log2f8_u35avx(__m256); +IMPORT CONST __m256 Sleef_log1pf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_log1pf8_u10avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u05avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincospif8_u05avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u35avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_sincospif8_u35avx(__m256); +IMPORT CONST __m256 Sleef_sinpif8_u05avx(__m256); +IMPORT CONST 
__m256 Sleef_cinz_sinpif8_u05avx(__m256); +IMPORT CONST __m256 Sleef_cospif8_u05avx(__m256); +IMPORT CONST __m256 Sleef_cinz_cospif8_u05avx(__m256); +IMPORT CONST __m256 Sleef_fmaf8_avx(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fmaf8_avx(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_sqrtf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sqrtf8_avx(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u05avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sqrtf8_u05avx(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_cinz_sqrtf8_u35avx(__m256); +IMPORT CONST __m256 Sleef_hypotf8_u05avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_hypotf8_u05avx(__m256, __m256); +IMPORT CONST __m256 Sleef_hypotf8_u35avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_hypotf8_u35avx(__m256, __m256); +IMPORT CONST __m256 Sleef_fabsf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_fabsf8_avx(__m256); +IMPORT CONST __m256 Sleef_copysignf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_copysignf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_fmaxf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fmaxf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_fminf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fminf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_fdimf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fdimf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_truncf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_truncf8_avx(__m256); +IMPORT CONST __m256 Sleef_floorf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_floorf8_avx(__m256); +IMPORT CONST __m256 Sleef_ceilf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_ceilf8_avx(__m256); +IMPORT CONST __m256 Sleef_roundf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_roundf8_avx(__m256); +IMPORT CONST __m256 Sleef_rintf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_rintf8_avx(__m256); +IMPORT CONST __m256 Sleef_nextafterf8_avx(__m256, __m256); +IMPORT CONST __m256 
Sleef_cinz_nextafterf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_frfrexpf8_avx(__m256); +IMPORT CONST __m256 Sleef_cinz_frfrexpf8_avx(__m256); +IMPORT CONST __m256 Sleef_fmodf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_fmodf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_remainderf8_avx(__m256, __m256); +IMPORT CONST __m256 Sleef_cinz_remainderf8_avx(__m256, __m256); +IMPORT CONST Sleef___m256_2 Sleef_modff8_avx(__m256); +IMPORT CONST Sleef___m256_2 Sleef_cinz_modff8_avx(__m256); +IMPORT CONST __m256 Sleef_lgammaf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_lgammaf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_tgammaf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_tgammaf8_u10avx(__m256); +IMPORT CONST __m256 Sleef_erff8_u10avx(__m256); +IMPORT CONST __m256 Sleef_cinz_erff8_u10avx(__m256); +IMPORT CONST __m256 Sleef_erfcf8_u15avx(__m256); +IMPORT CONST __m256 Sleef_cinz_erfcf8_u15avx(__m256); +IMPORT CONST int Sleef_getIntf8_avx(int); +IMPORT CONST int Sleef_cinz_getIntf8_avx(int); +IMPORT CONST void *Sleef_getPtrf8_avx(int); +IMPORT CONST void *Sleef_cinz_getPtrf8_avx(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +IMPORT CONST __m256d Sleef_sind4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sind4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_cosd4_u35fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u35fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincosd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_tand4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_tand4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_asind4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_asind4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_acosd4_u35fma4(__m256d); +IMPORT CONST __m256d 
Sleef_atand4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_atand4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u35fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_atan2d4_u35fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_logd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_cbrtd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_sind4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sind4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_cosd4_u10fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u10fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincosd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_tand4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_tand4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_asind4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_asind4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_acosd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_atand4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_atand4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u10fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_atan2d4_u10fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_logd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_cbrtd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_expd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_expd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_powd4_u10fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_powd4_u10fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_sinhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sinhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u10fma4(__m256d); +IMPORT CONST 
__m256d Sleef_finz_coshd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_tanhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_sinhd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sinhd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_coshd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_tanhd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_fastsind4_u3500fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_fastsind4_u3500fma4(__m256d); +IMPORT CONST __m256d Sleef_fastcosd4_u3500fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_fastcosd4_u3500fma4(__m256d); +IMPORT CONST __m256d Sleef_fastpowd4_u3500fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fastpowd4_u3500fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_asinhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_asinhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_acoshd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_acoshd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_atanhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_atanhd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_exp2d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_exp2d4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_exp10d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_exp10d4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_expm1d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_expm1d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_log10d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_log10d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_log2d4_u10fma4(__m256d); 
+IMPORT CONST __m256d Sleef_log2d4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_log2d4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_log1pd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_log1pd4_u10fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u05fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincospid4_u05fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u35fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincospid4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_sinpid4_u05fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sinpid4_u05fma4(__m256d); +IMPORT CONST __m256d Sleef_cospid4_u05fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_cospid4_u05fma4(__m256d); +IMPORT CONST __m256d Sleef_ldexpd4_fma4(__m256d, __m128i); +IMPORT CONST __m256d Sleef_finz_ldexpd4_fma4(__m256d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd4_fma4(__m256d); +IMPORT CONST __m128i Sleef_finz_ilogbd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_fmad4_fma4(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmad4_fma4(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_sqrtd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sqrtd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u05fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sqrtd4_u05fma4(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_sqrtd4_u35fma4(__m256d); +IMPORT CONST __m256d Sleef_hypotd4_u05fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_hypotd4_u05fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_hypotd4_u35fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_hypotd4_u35fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fabsd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_fabsd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_copysignd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_copysignd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmaxd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d 
Sleef_finz_fmaxd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmind4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmind4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fdimd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fdimd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_truncd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_truncd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_floord4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_floord4_fma4(__m256d); +IMPORT CONST __m256d Sleef_ceild4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_ceild4_fma4(__m256d); +IMPORT CONST __m256d Sleef_roundd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_roundd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_rintd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_rintd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_nextafterd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_nextafterd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_frfrexpd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_frfrexpd4_fma4(__m256d); +IMPORT CONST __m128i Sleef_expfrexpd4_fma4(__m256d); +IMPORT CONST __m128i Sleef_finz_expfrexpd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_fmodd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmodd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_remainderd4_fma4(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_remainderd4_fma4(__m256d, __m256d); +IMPORT CONST Sleef___m256d_2 Sleef_modfd4_fma4(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_modfd4_fma4(__m256d); +IMPORT CONST __m256d Sleef_lgammad4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_lgammad4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_tgammad4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_tgammad4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_erfd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_finz_erfd4_u10fma4(__m256d); +IMPORT CONST __m256d Sleef_erfcd4_u15fma4(__m256d); +IMPORT CONST __m256d 
Sleef_finz_erfcd4_u15fma4(__m256d); +IMPORT CONST int Sleef_getIntd4_fma4(int); +IMPORT CONST void *Sleef_getPtrd4_fma4(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +IMPORT CONST __m256 Sleef_sinf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sinf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_cosf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_cosf8_u35fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u35fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincosf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_tanf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_tanf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_asinf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_asinf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_acosf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_acosf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_atanf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_atanf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u35fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_atan2f8_u35fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_logf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_cbrtf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_sinf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sinf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_cosf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_cosf8_u10fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u10fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincosf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_tanf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_tanf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_asinf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_asinf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_acosf8_u10fma4(__m256); +IMPORT CONST __m256 
Sleef_finz_acosf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_atanf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_atanf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u10fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_atan2f8_u10fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_logf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_cbrtf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_expf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_expf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_powf8_u10fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_powf8_u10fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_sinhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sinhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_coshf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_coshf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_tanhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_sinhf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sinhf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_coshf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_coshf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_tanhf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_fastsinf8_u3500fma4(__m256); +IMPORT CONST __m256 Sleef_finz_fastsinf8_u3500fma4(__m256); +IMPORT CONST __m256 Sleef_fastcosf8_u3500fma4(__m256); +IMPORT CONST __m256 Sleef_finz_fastcosf8_u3500fma4(__m256); +IMPORT CONST __m256 Sleef_fastpowf8_u3500fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fastpowf8_u3500fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_asinhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_asinhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_acoshf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_acoshf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_atanhf8_u10fma4(__m256); +IMPORT CONST 
__m256 Sleef_finz_atanhf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_exp2f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_exp2f8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_exp10f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_exp10f8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_expm1f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_expm1f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_log10f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_log10f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_log2f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_log2f8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_log2f8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_log2f8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_log1pf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_log1pf8_u10fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u05fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincospif8_u05fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u35fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincospif8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_sinpif8_u05fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sinpif8_u05fma4(__m256); +IMPORT CONST __m256 Sleef_cospif8_u05fma4(__m256); +IMPORT CONST __m256 Sleef_finz_cospif8_u05fma4(__m256); +IMPORT CONST __m256 Sleef_fmaf8_fma4(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_finz_fmaf8_fma4(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_sqrtf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sqrtf8_fma4(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u05fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sqrtf8_u05fma4(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_finz_sqrtf8_u35fma4(__m256); +IMPORT CONST __m256 Sleef_hypotf8_u05fma4(__m256, 
__m256); +IMPORT CONST __m256 Sleef_finz_hypotf8_u05fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_hypotf8_u35fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_hypotf8_u35fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_fabsf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_fabsf8_fma4(__m256); +IMPORT CONST __m256 Sleef_copysignf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_copysignf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_fmaxf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fmaxf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_fminf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fminf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_fdimf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fdimf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_truncf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_truncf8_fma4(__m256); +IMPORT CONST __m256 Sleef_floorf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_floorf8_fma4(__m256); +IMPORT CONST __m256 Sleef_ceilf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_ceilf8_fma4(__m256); +IMPORT CONST __m256 Sleef_roundf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_roundf8_fma4(__m256); +IMPORT CONST __m256 Sleef_rintf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_rintf8_fma4(__m256); +IMPORT CONST __m256 Sleef_nextafterf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_nextafterf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_frfrexpf8_fma4(__m256); +IMPORT CONST __m256 Sleef_finz_frfrexpf8_fma4(__m256); +IMPORT CONST __m256 Sleef_fmodf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fmodf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_remainderf8_fma4(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_remainderf8_fma4(__m256, __m256); +IMPORT CONST Sleef___m256_2 Sleef_modff8_fma4(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_modff8_fma4(__m256); +IMPORT CONST __m256 Sleef_lgammaf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_lgammaf8_u10fma4(__m256); 
+IMPORT CONST __m256 Sleef_tgammaf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_tgammaf8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_erff8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_finz_erff8_u10fma4(__m256); +IMPORT CONST __m256 Sleef_erfcf8_u15fma4(__m256); +IMPORT CONST __m256 Sleef_finz_erfcf8_u15fma4(__m256); +IMPORT CONST int Sleef_getIntf8_fma4(int); +IMPORT CONST int Sleef_finz_getIntf8_fma4(int); +IMPORT CONST void *Sleef_getPtrf8_fma4(int); +IMPORT CONST void *Sleef_finz_getPtrf8_fma4(int); +#endif +#ifdef __AVX__ + +#ifndef Sleef___m256d_2_DEFINED +typedef struct { + __m256d x, y; +} Sleef___m256d_2; +#define Sleef___m256d_2_DEFINED +#endif + +IMPORT CONST __m256d Sleef_sind4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sind4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_cosd4_u35avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u35avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincosd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_tand4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_tand4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_asind4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_asind4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_acosd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_atand4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_atand4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u35avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_atan2d4_u35avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_logd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_cbrtd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_sind4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sind4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_cosd4_u10avx2(__m256d); +IMPORT CONST __m256d 
Sleef_finz_cosd4_u10avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincosd4_u10avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincosd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_tand4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_tand4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_asind4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_asind4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_acosd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_acosd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_atand4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_atand4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_atan2d4_u10avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_atan2d4_u10avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_logd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_logd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_cbrtd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_cbrtd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_expd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_expd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_powd4_u10avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_powd4_u10avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_sinhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sinhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_coshd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_tanhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_sinhd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sinhd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_coshd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_coshd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_tanhd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_tanhd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_fastsind4_u3500avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_fastsind4_u3500avx2(__m256d); +IMPORT CONST 
__m256d Sleef_fastcosd4_u3500avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_fastcosd4_u3500avx2(__m256d); +IMPORT CONST __m256d Sleef_fastpowd4_u3500avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fastpowd4_u3500avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_asinhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_asinhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_acoshd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_acoshd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_atanhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_atanhd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_exp2d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_exp2d4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_exp2d4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_exp10d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_exp10d4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_exp10d4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_expm1d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_expm1d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_log10d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_log10d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_log2d4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_log2d4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_log2d4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_log1pd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_log1pd4_u10avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u05avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincospid4_u05avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_sincospid4_u35avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_sincospid4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_sinpid4_u05avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sinpid4_u05avx2(__m256d); +IMPORT CONST __m256d 
Sleef_cospid4_u05avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_cospid4_u05avx2(__m256d); +IMPORT CONST __m256d Sleef_ldexpd4_avx2(__m256d, __m128i); +IMPORT CONST __m256d Sleef_finz_ldexpd4_avx2(__m256d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd4_avx2(__m256d); +IMPORT CONST __m128i Sleef_finz_ilogbd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_fmad4_avx2(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmad4_avx2(__m256d, __m256d, __m256d); +IMPORT CONST __m256d Sleef_sqrtd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sqrtd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u05avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sqrtd4_u05avx2(__m256d); +IMPORT CONST __m256d Sleef_sqrtd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_sqrtd4_u35avx2(__m256d); +IMPORT CONST __m256d Sleef_hypotd4_u05avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_hypotd4_u05avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_hypotd4_u35avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_hypotd4_u35avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fabsd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_fabsd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_copysignd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_copysignd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmaxd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmaxd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fmind4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmind4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_fdimd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fdimd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_truncd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_truncd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_floord4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_floord4_avx2(__m256d); +IMPORT CONST __m256d Sleef_ceild4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_ceild4_avx2(__m256d); +IMPORT CONST __m256d 
Sleef_roundd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_roundd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_rintd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_rintd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_nextafterd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_nextafterd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_frfrexpd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_frfrexpd4_avx2(__m256d); +IMPORT CONST __m128i Sleef_expfrexpd4_avx2(__m256d); +IMPORT CONST __m128i Sleef_finz_expfrexpd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_fmodd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_fmodd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_remainderd4_avx2(__m256d, __m256d); +IMPORT CONST __m256d Sleef_finz_remainderd4_avx2(__m256d, __m256d); +IMPORT CONST Sleef___m256d_2 Sleef_modfd4_avx2(__m256d); +IMPORT CONST Sleef___m256d_2 Sleef_finz_modfd4_avx2(__m256d); +IMPORT CONST __m256d Sleef_lgammad4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_lgammad4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_tgammad4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_tgammad4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_erfd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_erfd4_u10avx2(__m256d); +IMPORT CONST __m256d Sleef_erfcd4_u15avx2(__m256d); +IMPORT CONST __m256d Sleef_finz_erfcd4_u15avx2(__m256d); +IMPORT CONST int Sleef_getIntd4_avx2(int); +IMPORT CONST void *Sleef_getPtrd4_avx2(int); + +#ifndef Sleef___m256_2_DEFINED +typedef struct { + __m256 x, y; +} Sleef___m256_2; +#define Sleef___m256_2_DEFINED +#endif + +IMPORT CONST __m256 Sleef_sinf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sinf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_cosf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_cosf8_u35avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u35avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincosf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_tanf8_u35avx2(__m256); +IMPORT CONST __m256 
Sleef_finz_tanf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_asinf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_asinf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_acosf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_acosf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_atanf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_atanf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u35avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_atan2f8_u35avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_logf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_cbrtf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_sinf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sinf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_cosf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_cosf8_u10avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincosf8_u10avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincosf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_tanf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_tanf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_asinf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_asinf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_acosf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_acosf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_atanf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_atanf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_atan2f8_u10avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_atan2f8_u10avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_logf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_logf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_cbrtf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_cbrtf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_expf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_expf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_powf8_u10avx2(__m256, __m256); +IMPORT CONST __m256 
Sleef_finz_powf8_u10avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_sinhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sinhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_coshf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_coshf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_tanhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_sinhf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sinhf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_coshf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_coshf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_tanhf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_tanhf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_fastsinf8_u3500avx2(__m256); +IMPORT CONST __m256 Sleef_finz_fastsinf8_u3500avx2(__m256); +IMPORT CONST __m256 Sleef_fastcosf8_u3500avx2(__m256); +IMPORT CONST __m256 Sleef_finz_fastcosf8_u3500avx2(__m256); +IMPORT CONST __m256 Sleef_fastpowf8_u3500avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fastpowf8_u3500avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_asinhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_asinhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_acoshf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_acoshf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_atanhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_atanhf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_exp2f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_exp2f8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_exp2f8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_exp10f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_exp10f8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_exp10f8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_expm1f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_expm1f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_log10f8_u10avx2(__m256); +IMPORT CONST __m256 
Sleef_finz_log10f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_log2f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_log2f8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_log2f8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_log2f8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_log1pf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_log1pf8_u10avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u05avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincospif8_u05avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_sincospif8_u35avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_sincospif8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_sinpif8_u05avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sinpif8_u05avx2(__m256); +IMPORT CONST __m256 Sleef_cospif8_u05avx2(__m256); +IMPORT CONST __m256 Sleef_finz_cospif8_u05avx2(__m256); +IMPORT CONST __m256 Sleef_fmaf8_avx2(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_finz_fmaf8_avx2(__m256, __m256, __m256); +IMPORT CONST __m256 Sleef_sqrtf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sqrtf8_avx2(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u05avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sqrtf8_u05avx2(__m256); +IMPORT CONST __m256 Sleef_sqrtf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_finz_sqrtf8_u35avx2(__m256); +IMPORT CONST __m256 Sleef_hypotf8_u05avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_hypotf8_u05avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_hypotf8_u35avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_hypotf8_u35avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_fabsf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_fabsf8_avx2(__m256); +IMPORT CONST __m256 Sleef_copysignf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_copysignf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_fmaxf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fmaxf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_fminf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fminf8_avx2(__m256, __m256); 
+IMPORT CONST __m256 Sleef_fdimf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fdimf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_truncf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_truncf8_avx2(__m256); +IMPORT CONST __m256 Sleef_floorf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_floorf8_avx2(__m256); +IMPORT CONST __m256 Sleef_ceilf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_ceilf8_avx2(__m256); +IMPORT CONST __m256 Sleef_roundf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_roundf8_avx2(__m256); +IMPORT CONST __m256 Sleef_rintf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_rintf8_avx2(__m256); +IMPORT CONST __m256 Sleef_nextafterf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_nextafterf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_frfrexpf8_avx2(__m256); +IMPORT CONST __m256 Sleef_finz_frfrexpf8_avx2(__m256); +IMPORT CONST __m256 Sleef_fmodf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_fmodf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_remainderf8_avx2(__m256, __m256); +IMPORT CONST __m256 Sleef_finz_remainderf8_avx2(__m256, __m256); +IMPORT CONST Sleef___m256_2 Sleef_modff8_avx2(__m256); +IMPORT CONST Sleef___m256_2 Sleef_finz_modff8_avx2(__m256); +IMPORT CONST __m256 Sleef_lgammaf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_lgammaf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_tgammaf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_tgammaf8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_erff8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_finz_erff8_u10avx2(__m256); +IMPORT CONST __m256 Sleef_erfcf8_u15avx2(__m256); +IMPORT CONST __m256 Sleef_finz_erfcf8_u15avx2(__m256); +IMPORT CONST int Sleef_getIntf8_avx2(int); +IMPORT CONST int Sleef_finz_getIntf8_avx2(int); +IMPORT CONST void *Sleef_getPtrf8_avx2(int); +IMPORT CONST void *Sleef_finz_getPtrf8_avx2(int); +#endif +#ifdef __SSE2__ + +#ifndef Sleef___m128d_2_DEFINED +typedef struct { + __m128d x, y; +} Sleef___m128d_2; +#define Sleef___m128d_2_DEFINED 
+#endif + +IMPORT CONST __m128d Sleef_sind2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sind2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_cosd2_u35avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u35avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_finz_sincosd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_tand2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_tand2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_asind2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_asind2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_acosd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_atand2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_atand2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u35avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_atan2d2_u35avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_logd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_cbrtd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_sind2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sind2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_cosd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_cosd2_u10avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincosd2_u10avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_finz_sincosd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_tand2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_tand2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_asind2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_asind2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_acosd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_acosd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_atand2_u10avx2128(__m128d); +IMPORT CONST 
__m128d Sleef_finz_atand2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_atan2d2_u10avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_atan2d2_u10avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_logd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_logd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_cbrtd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_cbrtd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_expd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_expd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_powd2_u10avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_powd2_u10avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_sinhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sinhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_coshd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_tanhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_sinhd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sinhd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_coshd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_coshd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_tanhd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_tanhd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_fastsind2_u3500avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_fastsind2_u3500avx2128(__m128d); +IMPORT CONST __m128d Sleef_fastcosd2_u3500avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_fastcosd2_u3500avx2128(__m128d); +IMPORT CONST __m128d Sleef_fastpowd2_u3500avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_fastpowd2_u3500avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_asinhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_asinhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_acoshd2_u10avx2128(__m128d); +IMPORT CONST __m128d 
Sleef_finz_acoshd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_atanhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_atanhd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_exp2d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_exp2d2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_exp2d2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_exp10d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_exp10d2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_exp10d2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_expm1d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_expm1d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_log10d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_log10d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_log2d2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_log2d2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_log2d2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_log1pd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_log1pd2_u10avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u05avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_finz_sincospid2_u05avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_sincospid2_u35avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_finz_sincospid2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_sinpid2_u05avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sinpid2_u05avx2128(__m128d); +IMPORT CONST __m128d Sleef_cospid2_u05avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_cospid2_u05avx2128(__m128d); +IMPORT CONST __m128d Sleef_ldexpd2_avx2128(__m128d, __m128i); +IMPORT CONST __m128d Sleef_finz_ldexpd2_avx2128(__m128d, __m128i); +IMPORT CONST __m128i Sleef_ilogbd2_avx2128(__m128d); +IMPORT CONST __m128i Sleef_finz_ilogbd2_avx2128(__m128d); +IMPORT CONST 
__m128d Sleef_fmad2_avx2128(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_fmad2_avx2128(__m128d, __m128d, __m128d); +IMPORT CONST __m128d Sleef_sqrtd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sqrtd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u05avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sqrtd2_u05avx2128(__m128d); +IMPORT CONST __m128d Sleef_sqrtd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_sqrtd2_u35avx2128(__m128d); +IMPORT CONST __m128d Sleef_hypotd2_u05avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_hypotd2_u05avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_hypotd2_u35avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_hypotd2_u35avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fabsd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_fabsd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_copysignd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_copysignd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmaxd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_fmaxd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fmind2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_fmind2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_fdimd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_fdimd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_truncd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_truncd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_floord2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_floord2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_ceild2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_ceild2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_roundd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_roundd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_rintd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_rintd2_avx2128(__m128d); +IMPORT CONST __m128d 
Sleef_nextafterd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_nextafterd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_frfrexpd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_frfrexpd2_avx2128(__m128d); +IMPORT CONST __m128i Sleef_expfrexpd2_avx2128(__m128d); +IMPORT CONST __m128i Sleef_finz_expfrexpd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_fmodd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_fmodd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_remainderd2_avx2128(__m128d, __m128d); +IMPORT CONST __m128d Sleef_finz_remainderd2_avx2128(__m128d, __m128d); +IMPORT CONST Sleef___m128d_2 Sleef_modfd2_avx2128(__m128d); +IMPORT CONST Sleef___m128d_2 Sleef_finz_modfd2_avx2128(__m128d); +IMPORT CONST __m128d Sleef_lgammad2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_lgammad2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_tgammad2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_tgammad2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_erfd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_erfd2_u10avx2128(__m128d); +IMPORT CONST __m128d Sleef_erfcd2_u15avx2128(__m128d); +IMPORT CONST __m128d Sleef_finz_erfcd2_u15avx2128(__m128d); +IMPORT CONST int Sleef_getIntd2_avx2128(int); +IMPORT CONST void *Sleef_getPtrd2_avx2128(int); + +#ifndef Sleef___m128_2_DEFINED +typedef struct { + __m128 x, y; +} Sleef___m128_2; +#define Sleef___m128_2_DEFINED +#endif + +IMPORT CONST __m128 Sleef_sinf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sinf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_cosf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_cosf4_u35avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u35avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_finz_sincosf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_tanf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_tanf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_asinf4_u35avx2128(__m128); +IMPORT CONST __m128 
Sleef_finz_asinf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_acosf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_acosf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_atanf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_atanf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u35avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_atan2f4_u35avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_logf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_cbrtf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_sinf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sinf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_cosf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_cosf4_u10avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincosf4_u10avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_finz_sincosf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_tanf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_tanf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_asinf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_asinf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_acosf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_acosf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_atanf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_atanf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_atan2f4_u10avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_atan2f4_u10avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_logf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_logf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_cbrtf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_cbrtf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_expf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_expf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_powf4_u10avx2128(__m128, __m128); +IMPORT CONST __m128 
Sleef_finz_powf4_u10avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_sinhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sinhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_coshf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_coshf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_tanhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_sinhf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sinhf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_coshf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_coshf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_tanhf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_tanhf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_fastsinf4_u3500avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_fastsinf4_u3500avx2128(__m128); +IMPORT CONST __m128 Sleef_fastcosf4_u3500avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_fastcosf4_u3500avx2128(__m128); +IMPORT CONST __m128 Sleef_fastpowf4_u3500avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_fastpowf4_u3500avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_asinhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_asinhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_acoshf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_acoshf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_atanhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_atanhf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_exp2f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_exp2f4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_exp2f4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_exp10f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_exp10f4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_exp10f4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_expm1f4_u10avx2128(__m128); +IMPORT CONST __m128 
Sleef_finz_expm1f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_log10f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_log10f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_log2f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_log2f4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_log2f4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_log2f4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_log1pf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_log1pf4_u10avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u05avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_finz_sincospif4_u05avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_sincospif4_u35avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_finz_sincospif4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_sinpif4_u05avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sinpif4_u05avx2128(__m128); +IMPORT CONST __m128 Sleef_cospif4_u05avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_cospif4_u05avx2128(__m128); +IMPORT CONST __m128 Sleef_fmaf4_avx2128(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_finz_fmaf4_avx2128(__m128, __m128, __m128); +IMPORT CONST __m128 Sleef_sqrtf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sqrtf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u05avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sqrtf4_u05avx2128(__m128); +IMPORT CONST __m128 Sleef_sqrtf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_sqrtf4_u35avx2128(__m128); +IMPORT CONST __m128 Sleef_hypotf4_u05avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_hypotf4_u05avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_hypotf4_u35avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_hypotf4_u35avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_fabsf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_fabsf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_copysignf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_copysignf4_avx2128(__m128, __m128); +IMPORT CONST __m128 
Sleef_fmaxf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_fmaxf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_fminf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_fminf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_fdimf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_fdimf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_truncf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_truncf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_floorf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_floorf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_ceilf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_ceilf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_roundf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_roundf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_rintf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_rintf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_nextafterf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_nextafterf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_frfrexpf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_frfrexpf4_avx2128(__m128); +IMPORT CONST __m128 Sleef_fmodf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_fmodf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_remainderf4_avx2128(__m128, __m128); +IMPORT CONST __m128 Sleef_finz_remainderf4_avx2128(__m128, __m128); +IMPORT CONST Sleef___m128_2 Sleef_modff4_avx2128(__m128); +IMPORT CONST Sleef___m128_2 Sleef_finz_modff4_avx2128(__m128); +IMPORT CONST __m128 Sleef_lgammaf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_lgammaf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_tgammaf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_tgammaf4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_erff4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_erff4_u10avx2128(__m128); +IMPORT CONST __m128 Sleef_erfcf4_u15avx2128(__m128); +IMPORT CONST __m128 Sleef_finz_erfcf4_u15avx2128(__m128); +IMPORT CONST int 
Sleef_getIntf4_avx2128(int); +IMPORT CONST int Sleef_finz_getIntf4_avx2128(int); +IMPORT CONST void *Sleef_getPtrf4_avx2128(int); +IMPORT CONST void *Sleef_finz_getPtrf4_avx2128(int); +#endif +#ifdef __AVX512F__ + +#ifndef Sleef___m512d_2_DEFINED +typedef struct { + __m512d x, y; +} Sleef___m512d_2; +#define Sleef___m512d_2_DEFINED +#endif + +IMPORT CONST __m512d Sleef_sind8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_sind8_u35(__m512d); +IMPORT CONST __m512d Sleef_cosd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_cosd8_u35(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincosd8_u35(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincosd8_u35(__m512d); +IMPORT CONST __m512d Sleef_tand8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_tand8_u35(__m512d); +IMPORT CONST __m512d Sleef_asind8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_asind8_u35(__m512d); +IMPORT CONST __m512d Sleef_acosd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_acosd8_u35(__m512d); +IMPORT CONST __m512d Sleef_atand8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_atand8_u35(__m512d); +IMPORT CONST __m512d Sleef_atan2d8_u35(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_atan2d8_u35(__m512d, __m512d); +IMPORT CONST __m512d Sleef_logd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_logd8_u35(__m512d); +IMPORT CONST __m512d Sleef_cbrtd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_cbrtd8_u35(__m512d); +IMPORT CONST __m512d Sleef_sind8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_sind8_u10(__m512d); +IMPORT CONST __m512d Sleef_cosd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_cosd8_u10(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincosd8_u10(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincosd8_u10(__m512d); +IMPORT CONST __m512d Sleef_tand8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_tand8_u10(__m512d); +IMPORT CONST __m512d Sleef_asind8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_asind8_u10(__m512d); +IMPORT CONST __m512d Sleef_acosd8_u10(__m512d); +IMPORT CONST 
__m512d Sleef_finz_acosd8_u10(__m512d); +IMPORT CONST __m512d Sleef_atand8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_atand8_u10(__m512d); +IMPORT CONST __m512d Sleef_atan2d8_u10(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_atan2d8_u10(__m512d, __m512d); +IMPORT CONST __m512d Sleef_logd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_logd8_u10(__m512d); +IMPORT CONST __m512d Sleef_cbrtd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_cbrtd8_u10(__m512d); +IMPORT CONST __m512d Sleef_expd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_expd8_u10(__m512d); +IMPORT CONST __m512d Sleef_powd8_u10(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_powd8_u10(__m512d, __m512d); +IMPORT CONST __m512d Sleef_sinhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_sinhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_coshd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_coshd8_u10(__m512d); +IMPORT CONST __m512d Sleef_tanhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_tanhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_sinhd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_sinhd8_u35(__m512d); +IMPORT CONST __m512d Sleef_coshd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_coshd8_u35(__m512d); +IMPORT CONST __m512d Sleef_tanhd8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_tanhd8_u35(__m512d); +IMPORT CONST __m512d Sleef_fastsind8_u3500(__m512d); +IMPORT CONST __m512d Sleef_finz_fastsind8_u3500(__m512d); +IMPORT CONST __m512d Sleef_fastcosd8_u3500(__m512d); +IMPORT CONST __m512d Sleef_finz_fastcosd8_u3500(__m512d); +IMPORT CONST __m512d Sleef_fastpowd8_u3500(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fastpowd8_u3500(__m512d, __m512d); +IMPORT CONST __m512d Sleef_asinhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_asinhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_acoshd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_acoshd8_u10(__m512d); +IMPORT CONST __m512d Sleef_atanhd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_atanhd8_u10(__m512d); +IMPORT CONST 
__m512d Sleef_exp2d8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_exp2d8_u10(__m512d); +IMPORT CONST __m512d Sleef_exp2d8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_exp2d8_u35(__m512d); +IMPORT CONST __m512d Sleef_exp10d8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_exp10d8_u10(__m512d); +IMPORT CONST __m512d Sleef_exp10d8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_exp10d8_u35(__m512d); +IMPORT CONST __m512d Sleef_expm1d8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_expm1d8_u10(__m512d); +IMPORT CONST __m512d Sleef_log10d8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_log10d8_u10(__m512d); +IMPORT CONST __m512d Sleef_log2d8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_log2d8_u10(__m512d); +IMPORT CONST __m512d Sleef_log2d8_u35(__m512d); +IMPORT CONST __m512d Sleef_finz_log2d8_u35(__m512d); +IMPORT CONST __m512d Sleef_log1pd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_log1pd8_u10(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincospid8_u05(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincospid8_u05(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincospid8_u35(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincospid8_u35(__m512d); +IMPORT CONST __m512d Sleef_sinpid8_u05(__m512d); +IMPORT CONST __m512d Sleef_finz_sinpid8_u05(__m512d); +IMPORT CONST __m512d Sleef_cospid8_u05(__m512d); +IMPORT CONST __m512d Sleef_finz_cospid8_u05(__m512d); +IMPORT CONST __m512d Sleef_ldexpd8(__m512d, __m256i); +IMPORT CONST __m512d Sleef_finz_ldexpd8(__m512d, __m256i); +IMPORT CONST __m256i Sleef_ilogbd8(__m512d); +IMPORT CONST __m256i Sleef_finz_ilogbd8(__m512d); +IMPORT CONST __m512d Sleef_fmad8(__m512d, __m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmad8(__m512d, __m512d, __m512d); +IMPORT CONST __m512d Sleef_sqrtd8(__m512d); +IMPORT CONST __m512d Sleef_finz_sqrtd8(__m512d); +IMPORT CONST __m512d Sleef_sqrtd8_u05(__m512d); +IMPORT CONST __m512d Sleef_finz_sqrtd8_u05(__m512d); +IMPORT CONST __m512d Sleef_sqrtd8_u35(__m512d); +IMPORT CONST __m512d 
Sleef_finz_sqrtd8_u35(__m512d); +IMPORT CONST __m512d Sleef_hypotd8_u05(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_hypotd8_u05(__m512d, __m512d); +IMPORT CONST __m512d Sleef_hypotd8_u35(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_hypotd8_u35(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fabsd8(__m512d); +IMPORT CONST __m512d Sleef_finz_fabsd8(__m512d); +IMPORT CONST __m512d Sleef_copysignd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_copysignd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fmaxd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmaxd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fmind8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmind8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fdimd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fdimd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_truncd8(__m512d); +IMPORT CONST __m512d Sleef_finz_truncd8(__m512d); +IMPORT CONST __m512d Sleef_floord8(__m512d); +IMPORT CONST __m512d Sleef_finz_floord8(__m512d); +IMPORT CONST __m512d Sleef_ceild8(__m512d); +IMPORT CONST __m512d Sleef_finz_ceild8(__m512d); +IMPORT CONST __m512d Sleef_roundd8(__m512d); +IMPORT CONST __m512d Sleef_finz_roundd8(__m512d); +IMPORT CONST __m512d Sleef_rintd8(__m512d); +IMPORT CONST __m512d Sleef_finz_rintd8(__m512d); +IMPORT CONST __m512d Sleef_nextafterd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_nextafterd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_frfrexpd8(__m512d); +IMPORT CONST __m512d Sleef_finz_frfrexpd8(__m512d); +IMPORT CONST __m256i Sleef_expfrexpd8(__m512d); +IMPORT CONST __m256i Sleef_finz_expfrexpd8(__m512d); +IMPORT CONST __m512d Sleef_fmodd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmodd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_remainderd8(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_remainderd8(__m512d, __m512d); +IMPORT CONST Sleef___m512d_2 Sleef_modfd8(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_modfd8(__m512d); +IMPORT 
CONST __m512d Sleef_lgammad8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_lgammad8_u10(__m512d); +IMPORT CONST __m512d Sleef_tgammad8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_tgammad8_u10(__m512d); +IMPORT CONST __m512d Sleef_erfd8_u10(__m512d); +IMPORT CONST __m512d Sleef_finz_erfd8_u10(__m512d); +IMPORT CONST __m512d Sleef_erfcd8_u15(__m512d); +IMPORT CONST __m512d Sleef_finz_erfcd8_u15(__m512d); +IMPORT CONST int Sleef_getIntd8(int); +IMPORT CONST void *Sleef_getPtrd8(int); + +#ifndef Sleef___m512_2_DEFINED +typedef struct { + __m512 x, y; +} Sleef___m512_2; +#define Sleef___m512_2_DEFINED +#endif + +IMPORT CONST __m512 Sleef_sinf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_sinf16_u35(__m512); +IMPORT CONST __m512 Sleef_cosf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_cosf16_u35(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincosf16_u35(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincosf16_u35(__m512); +IMPORT CONST __m512 Sleef_tanf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_tanf16_u35(__m512); +IMPORT CONST __m512 Sleef_asinf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_asinf16_u35(__m512); +IMPORT CONST __m512 Sleef_acosf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_acosf16_u35(__m512); +IMPORT CONST __m512 Sleef_atanf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_atanf16_u35(__m512); +IMPORT CONST __m512 Sleef_atan2f16_u35(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_atan2f16_u35(__m512, __m512); +IMPORT CONST __m512 Sleef_logf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_logf16_u35(__m512); +IMPORT CONST __m512 Sleef_cbrtf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_cbrtf16_u35(__m512); +IMPORT CONST __m512 Sleef_sinf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_sinf16_u10(__m512); +IMPORT CONST __m512 Sleef_cosf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_cosf16_u10(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincosf16_u10(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincosf16_u10(__m512); +IMPORT CONST __m512 
Sleef_tanf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_tanf16_u10(__m512); +IMPORT CONST __m512 Sleef_asinf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_asinf16_u10(__m512); +IMPORT CONST __m512 Sleef_acosf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_acosf16_u10(__m512); +IMPORT CONST __m512 Sleef_atanf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_atanf16_u10(__m512); +IMPORT CONST __m512 Sleef_atan2f16_u10(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_atan2f16_u10(__m512, __m512); +IMPORT CONST __m512 Sleef_logf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_logf16_u10(__m512); +IMPORT CONST __m512 Sleef_cbrtf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_cbrtf16_u10(__m512); +IMPORT CONST __m512 Sleef_expf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_expf16_u10(__m512); +IMPORT CONST __m512 Sleef_powf16_u10(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_powf16_u10(__m512, __m512); +IMPORT CONST __m512 Sleef_sinhf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_sinhf16_u10(__m512); +IMPORT CONST __m512 Sleef_coshf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_coshf16_u10(__m512); +IMPORT CONST __m512 Sleef_tanhf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_tanhf16_u10(__m512); +IMPORT CONST __m512 Sleef_sinhf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_sinhf16_u35(__m512); +IMPORT CONST __m512 Sleef_coshf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_coshf16_u35(__m512); +IMPORT CONST __m512 Sleef_tanhf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_tanhf16_u35(__m512); +IMPORT CONST __m512 Sleef_fastsinf16_u3500(__m512); +IMPORT CONST __m512 Sleef_finz_fastsinf16_u3500(__m512); +IMPORT CONST __m512 Sleef_fastcosf16_u3500(__m512); +IMPORT CONST __m512 Sleef_finz_fastcosf16_u3500(__m512); +IMPORT CONST __m512 Sleef_fastpowf16_u3500(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fastpowf16_u3500(__m512, __m512); +IMPORT CONST __m512 Sleef_asinhf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_asinhf16_u10(__m512); +IMPORT CONST __m512 
Sleef_acoshf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_acoshf16_u10(__m512); +IMPORT CONST __m512 Sleef_atanhf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_atanhf16_u10(__m512); +IMPORT CONST __m512 Sleef_exp2f16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_exp2f16_u10(__m512); +IMPORT CONST __m512 Sleef_exp2f16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_exp2f16_u35(__m512); +IMPORT CONST __m512 Sleef_exp10f16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_exp10f16_u10(__m512); +IMPORT CONST __m512 Sleef_exp10f16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_exp10f16_u35(__m512); +IMPORT CONST __m512 Sleef_expm1f16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_expm1f16_u10(__m512); +IMPORT CONST __m512 Sleef_log10f16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_log10f16_u10(__m512); +IMPORT CONST __m512 Sleef_log2f16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_log2f16_u10(__m512); +IMPORT CONST __m512 Sleef_log2f16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_log2f16_u35(__m512); +IMPORT CONST __m512 Sleef_log1pf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_log1pf16_u10(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincospif16_u05(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincospif16_u05(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincospif16_u35(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincospif16_u35(__m512); +IMPORT CONST __m512 Sleef_sinpif16_u05(__m512); +IMPORT CONST __m512 Sleef_finz_sinpif16_u05(__m512); +IMPORT CONST __m512 Sleef_cospif16_u05(__m512); +IMPORT CONST __m512 Sleef_finz_cospif16_u05(__m512); +IMPORT CONST __m512 Sleef_fmaf16(__m512, __m512, __m512); +IMPORT CONST __m512 Sleef_finz_fmaf16(__m512, __m512, __m512); +IMPORT CONST __m512 Sleef_sqrtf16(__m512); +IMPORT CONST __m512 Sleef_finz_sqrtf16(__m512); +IMPORT CONST __m512 Sleef_sqrtf16_u05(__m512); +IMPORT CONST __m512 Sleef_finz_sqrtf16_u05(__m512); +IMPORT CONST __m512 Sleef_sqrtf16_u35(__m512); +IMPORT CONST __m512 Sleef_finz_sqrtf16_u35(__m512); +IMPORT CONST __m512 
Sleef_hypotf16_u05(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_hypotf16_u05(__m512, __m512); +IMPORT CONST __m512 Sleef_hypotf16_u35(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_hypotf16_u35(__m512, __m512); +IMPORT CONST __m512 Sleef_fabsf16(__m512); +IMPORT CONST __m512 Sleef_finz_fabsf16(__m512); +IMPORT CONST __m512 Sleef_copysignf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_copysignf16(__m512, __m512); +IMPORT CONST __m512 Sleef_fmaxf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fmaxf16(__m512, __m512); +IMPORT CONST __m512 Sleef_fminf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fminf16(__m512, __m512); +IMPORT CONST __m512 Sleef_fdimf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fdimf16(__m512, __m512); +IMPORT CONST __m512 Sleef_truncf16(__m512); +IMPORT CONST __m512 Sleef_finz_truncf16(__m512); +IMPORT CONST __m512 Sleef_floorf16(__m512); +IMPORT CONST __m512 Sleef_finz_floorf16(__m512); +IMPORT CONST __m512 Sleef_ceilf16(__m512); +IMPORT CONST __m512 Sleef_finz_ceilf16(__m512); +IMPORT CONST __m512 Sleef_roundf16(__m512); +IMPORT CONST __m512 Sleef_finz_roundf16(__m512); +IMPORT CONST __m512 Sleef_rintf16(__m512); +IMPORT CONST __m512 Sleef_finz_rintf16(__m512); +IMPORT CONST __m512 Sleef_nextafterf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_nextafterf16(__m512, __m512); +IMPORT CONST __m512 Sleef_frfrexpf16(__m512); +IMPORT CONST __m512 Sleef_finz_frfrexpf16(__m512); +IMPORT CONST __m512 Sleef_fmodf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fmodf16(__m512, __m512); +IMPORT CONST __m512 Sleef_remainderf16(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_remainderf16(__m512, __m512); +IMPORT CONST Sleef___m512_2 Sleef_modff16(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_modff16(__m512); +IMPORT CONST __m512 Sleef_lgammaf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_lgammaf16_u10(__m512); +IMPORT CONST __m512 Sleef_tgammaf16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_tgammaf16_u10(__m512); +IMPORT 
CONST __m512 Sleef_erff16_u10(__m512); +IMPORT CONST __m512 Sleef_finz_erff16_u10(__m512); +IMPORT CONST __m512 Sleef_erfcf16_u15(__m512); +IMPORT CONST __m512 Sleef_finz_erfcf16_u15(__m512); +IMPORT CONST int Sleef_getIntf16(int); +IMPORT CONST int Sleef_finz_getIntf16(int); +IMPORT CONST void *Sleef_getPtrf16(int); +IMPORT CONST void *Sleef_finz_getPtrf16(int); +#endif +#ifdef __AVX512F__ + +#ifndef Sleef___m512d_2_DEFINED +typedef struct { + __m512d x, y; +} Sleef___m512d_2; +#define Sleef___m512d_2_DEFINED +#endif + +IMPORT CONST __m512d Sleef_sind8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sind8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_cosd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_cosd8_u35avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincosd8_u35avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincosd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_tand8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_tand8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_asind8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_asind8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_acosd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_acosd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_atand8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_atand8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_atan2d8_u35avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_atan2d8_u35avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_logd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_logd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_cbrtd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_cbrtd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_sind8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sind8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_cosd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_cosd8_u10avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 
Sleef_sincosd8_u10avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincosd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_tand8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_tand8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_asind8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_asind8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_acosd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_acosd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_atand8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_atand8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_atan2d8_u10avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_atan2d8_u10avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_logd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_logd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_cbrtd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_cbrtd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_expd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_expd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_powd8_u10avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_powd8_u10avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_sinhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sinhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_coshd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_coshd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_tanhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_tanhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_sinhd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sinhd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_coshd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_coshd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_tanhd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_tanhd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_fastsind8_u3500avx512f(__m512d); +IMPORT CONST __m512d 
Sleef_finz_fastsind8_u3500avx512f(__m512d); +IMPORT CONST __m512d Sleef_fastcosd8_u3500avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_fastcosd8_u3500avx512f(__m512d); +IMPORT CONST __m512d Sleef_fastpowd8_u3500avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fastpowd8_u3500avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_asinhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_asinhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_acoshd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_acoshd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_atanhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_atanhd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_exp2d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_exp2d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_exp2d8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_exp2d8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_exp10d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_exp10d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_exp10d8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_exp10d8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_expm1d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_expm1d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_log10d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_log10d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_log2d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_log2d8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_log2d8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_log2d8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_log1pd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_log1pd8_u10avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincospid8_u05avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_sincospid8_u05avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincospid8_u35avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 
Sleef_finz_sincospid8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_sinpid8_u05avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sinpid8_u05avx512f(__m512d); +IMPORT CONST __m512d Sleef_cospid8_u05avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_cospid8_u05avx512f(__m512d); +IMPORT CONST __m512d Sleef_ldexpd8_avx512f(__m512d, __m256i); +IMPORT CONST __m512d Sleef_finz_ldexpd8_avx512f(__m512d, __m256i); +IMPORT CONST __m256i Sleef_ilogbd8_avx512f(__m512d); +IMPORT CONST __m256i Sleef_finz_ilogbd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_fmad8_avx512f(__m512d, __m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmad8_avx512f(__m512d, __m512d, __m512d); +IMPORT CONST __m512d Sleef_sqrtd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sqrtd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_sqrtd8_u05avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sqrtd8_u05avx512f(__m512d); +IMPORT CONST __m512d Sleef_sqrtd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_sqrtd8_u35avx512f(__m512d); +IMPORT CONST __m512d Sleef_hypotd8_u05avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_hypotd8_u05avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_hypotd8_u35avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_hypotd8_u35avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fabsd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_fabsd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_copysignd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_copysignd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fmaxd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmaxd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fmind8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmind8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fdimd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fdimd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_truncd8_avx512f(__m512d); +IMPORT CONST __m512d 
Sleef_finz_truncd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_floord8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_floord8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_ceild8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_ceild8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_roundd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_roundd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_rintd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_rintd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_nextafterd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_nextafterd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_frfrexpd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_frfrexpd8_avx512f(__m512d); +IMPORT CONST __m256i Sleef_expfrexpd8_avx512f(__m512d); +IMPORT CONST __m256i Sleef_finz_expfrexpd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_fmodd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_fmodd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_remainderd8_avx512f(__m512d, __m512d); +IMPORT CONST __m512d Sleef_finz_remainderd8_avx512f(__m512d, __m512d); +IMPORT CONST Sleef___m512d_2 Sleef_modfd8_avx512f(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_finz_modfd8_avx512f(__m512d); +IMPORT CONST __m512d Sleef_lgammad8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_lgammad8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_tgammad8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_tgammad8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_erfd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_erfd8_u10avx512f(__m512d); +IMPORT CONST __m512d Sleef_erfcd8_u15avx512f(__m512d); +IMPORT CONST __m512d Sleef_finz_erfcd8_u15avx512f(__m512d); +IMPORT CONST int Sleef_getIntd8_avx512f(int); +IMPORT CONST void *Sleef_getPtrd8_avx512f(int); + +#ifndef Sleef___m512_2_DEFINED +typedef struct { + __m512 x, y; +} Sleef___m512_2; +#define Sleef___m512_2_DEFINED +#endif + +IMPORT CONST __m512 
Sleef_sinf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sinf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_cosf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_cosf16_u35avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincosf16_u35avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincosf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_tanf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_tanf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_asinf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_asinf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_acosf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_acosf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_atanf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_atanf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_atan2f16_u35avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_atan2f16_u35avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_logf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_logf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_cbrtf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_cbrtf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_sinf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sinf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_cosf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_cosf16_u10avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincosf16_u10avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincosf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_tanf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_tanf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_asinf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_asinf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_acosf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_acosf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_atanf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_atanf16_u10avx512f(__m512); +IMPORT CONST __m512 
Sleef_atan2f16_u10avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_atan2f16_u10avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_logf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_logf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_cbrtf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_cbrtf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_expf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_expf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_powf16_u10avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_powf16_u10avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_sinhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sinhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_coshf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_coshf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_tanhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_tanhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_sinhf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sinhf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_coshf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_coshf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_tanhf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_tanhf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_fastsinf16_u3500avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_fastsinf16_u3500avx512f(__m512); +IMPORT CONST __m512 Sleef_fastcosf16_u3500avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_fastcosf16_u3500avx512f(__m512); +IMPORT CONST __m512 Sleef_fastpowf16_u3500avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fastpowf16_u3500avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_asinhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_asinhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_acoshf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_acoshf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_atanhf16_u10avx512f(__m512); +IMPORT CONST __m512 
Sleef_finz_atanhf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_exp2f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_exp2f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_exp2f16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_exp2f16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_exp10f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_exp10f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_exp10f16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_exp10f16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_expm1f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_expm1f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_log10f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_log10f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_log2f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_log2f16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_log2f16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_log2f16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_log1pf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_log1pf16_u10avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincospif16_u05avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincospif16_u05avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincospif16_u35avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_sincospif16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_sinpif16_u05avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sinpif16_u05avx512f(__m512); +IMPORT CONST __m512 Sleef_cospif16_u05avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_cospif16_u05avx512f(__m512); +IMPORT CONST __m512 Sleef_fmaf16_avx512f(__m512, __m512, __m512); +IMPORT CONST __m512 Sleef_finz_fmaf16_avx512f(__m512, __m512, __m512); +IMPORT CONST __m512 Sleef_sqrtf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sqrtf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_sqrtf16_u05avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sqrtf16_u05avx512f(__m512); +IMPORT CONST __m512 
Sleef_sqrtf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_sqrtf16_u35avx512f(__m512); +IMPORT CONST __m512 Sleef_hypotf16_u05avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_hypotf16_u05avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_hypotf16_u35avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_hypotf16_u35avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_fabsf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_fabsf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_copysignf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_copysignf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_fmaxf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fmaxf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_fminf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fminf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_fdimf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fdimf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_truncf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_truncf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_floorf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_floorf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_ceilf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_ceilf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_roundf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_roundf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_rintf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_rintf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_nextafterf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_nextafterf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_frfrexpf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_frfrexpf16_avx512f(__m512); +IMPORT CONST __m512 Sleef_fmodf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_finz_fmodf16_avx512f(__m512, __m512); +IMPORT CONST __m512 Sleef_remainderf16_avx512f(__m512, __m512); +IMPORT CONST __m512 
Sleef_finz_remainderf16_avx512f(__m512, __m512); +IMPORT CONST Sleef___m512_2 Sleef_modff16_avx512f(__m512); +IMPORT CONST Sleef___m512_2 Sleef_finz_modff16_avx512f(__m512); +IMPORT CONST __m512 Sleef_lgammaf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_lgammaf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_tgammaf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_tgammaf16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_erff16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_erff16_u10avx512f(__m512); +IMPORT CONST __m512 Sleef_erfcf16_u15avx512f(__m512); +IMPORT CONST __m512 Sleef_finz_erfcf16_u15avx512f(__m512); +IMPORT CONST int Sleef_getIntf16_avx512f(int); +IMPORT CONST int Sleef_finz_getIntf16_avx512f(int); +IMPORT CONST void *Sleef_getPtrf16_avx512f(int); +IMPORT CONST void *Sleef_finz_getPtrf16_avx512f(int); +#endif +#ifdef __AVX512F__ + +#ifndef Sleef___m512d_2_DEFINED +typedef struct { + __m512d x, y; +} Sleef___m512d_2; +#define Sleef___m512d_2_DEFINED +#endif + +IMPORT CONST __m512d Sleef_sind8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_sind8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cosd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_cosd8_u35avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincosd8_u35avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_cinz_sincosd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_tand8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_tand8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_asind8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_asind8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_acosd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_acosd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_atand8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_atand8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_atan2d8_u35avx512fnofma(__m512d, __m512d); +IMPORT CONST 
__m512d Sleef_cinz_atan2d8_u35avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_logd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_logd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cbrtd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_cbrtd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_sind8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_sind8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cosd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_cosd8_u10avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincosd8_u10avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_cinz_sincosd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_tand8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_tand8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_asind8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_asind8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_acosd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_acosd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_atand8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_atand8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_atan2d8_u10avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_atan2d8_u10avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_logd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_logd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cbrtd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_cbrtd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_expd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_expd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_powd8_u10avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_powd8_u10avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_sinhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d 
Sleef_cinz_sinhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_coshd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_coshd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_tanhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_tanhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_sinhd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_sinhd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_coshd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_coshd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_tanhd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_tanhd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_fastsind8_u3500avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_fastsind8_u3500avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_fastcosd8_u3500avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_fastcosd8_u3500avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_fastpowd8_u3500avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_fastpowd8_u3500avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_asinhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_asinhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_acoshd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_acoshd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_atanhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_atanhd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_exp2d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_exp2d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_exp2d8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_exp2d8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_exp10d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_exp10d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_exp10d8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d 
Sleef_cinz_exp10d8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_expm1d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_expm1d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_log10d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_log10d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_log2d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_log2d8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_log2d8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_log2d8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_log1pd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_log1pd8_u10avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincospid8_u05avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_cinz_sincospid8_u05avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_sincospid8_u35avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_cinz_sincospid8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_sinpid8_u05avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_sinpid8_u05avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cospid8_u05avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_cospid8_u05avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_ldexpd8_avx512fnofma(__m512d, __m256i); +IMPORT CONST __m512d Sleef_cinz_ldexpd8_avx512fnofma(__m512d, __m256i); +IMPORT CONST __m256i Sleef_ilogbd8_avx512fnofma(__m512d); +IMPORT CONST __m256i Sleef_cinz_ilogbd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_fmad8_avx512fnofma(__m512d, __m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_fmad8_avx512fnofma(__m512d, __m512d, __m512d); +IMPORT CONST __m512d Sleef_sqrtd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_sqrtd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_sqrtd8_u05avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_sqrtd8_u05avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_sqrtd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d 
Sleef_cinz_sqrtd8_u35avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_hypotd8_u05avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_hypotd8_u05avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_hypotd8_u35avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_hypotd8_u35avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fabsd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_fabsd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_copysignd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_copysignd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fmaxd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_fmaxd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fmind8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_fmind8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_fdimd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_fdimd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_truncd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_truncd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_floord8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_floord8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_ceild8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_ceild8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_roundd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_roundd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_rintd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_rintd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_nextafterd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_nextafterd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_frfrexpd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_frfrexpd8_avx512fnofma(__m512d); +IMPORT CONST __m256i Sleef_expfrexpd8_avx512fnofma(__m512d); +IMPORT CONST __m256i 
Sleef_cinz_expfrexpd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_fmodd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_fmodd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_remainderd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST __m512d Sleef_cinz_remainderd8_avx512fnofma(__m512d, __m512d); +IMPORT CONST Sleef___m512d_2 Sleef_modfd8_avx512fnofma(__m512d); +IMPORT CONST Sleef___m512d_2 Sleef_cinz_modfd8_avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_lgammad8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_lgammad8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_tgammad8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_tgammad8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_erfd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_erfd8_u10avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_erfcd8_u15avx512fnofma(__m512d); +IMPORT CONST __m512d Sleef_cinz_erfcd8_u15avx512fnofma(__m512d); +IMPORT CONST int Sleef_getIntd8_avx512fnofma(int); +IMPORT CONST void *Sleef_getPtrd8_avx512fnofma(int); + +#ifndef Sleef___m512_2_DEFINED +typedef struct { + __m512 x, y; +} Sleef___m512_2; +#define Sleef___m512_2_DEFINED +#endif + +IMPORT CONST __m512 Sleef_sinf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sinf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cosf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_cosf16_u35avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincosf16_u35avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_cinz_sincosf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_tanf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_tanf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_asinf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_asinf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_acosf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_acosf16_u35avx512fnofma(__m512); +IMPORT CONST 
__m512 Sleef_atanf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_atanf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_atan2f16_u35avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_atan2f16_u35avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_logf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_logf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cbrtf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_cbrtf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_sinf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sinf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cosf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_cosf16_u10avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincosf16_u10avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_cinz_sincosf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_tanf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_tanf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_asinf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_asinf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_acosf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_acosf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_atanf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_atanf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_atan2f16_u10avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_atan2f16_u10avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_logf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_logf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cbrtf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_cbrtf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_expf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_expf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_powf16_u10avx512fnofma(__m512, __m512); +IMPORT CONST __m512 
Sleef_cinz_powf16_u10avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_sinhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sinhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_coshf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_coshf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_tanhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_tanhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_sinhf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sinhf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_coshf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_coshf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_tanhf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_tanhf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_fastsinf16_u3500avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_fastsinf16_u3500avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_fastcosf16_u3500avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_fastcosf16_u3500avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_fastpowf16_u3500avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_fastpowf16_u3500avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_asinhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_asinhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_acoshf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_acoshf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_atanhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_atanhf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_exp2f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_exp2f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_exp2f16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_exp2f16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_exp10f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_exp10f16_u10avx512fnofma(__m512); +IMPORT CONST 
__m512 Sleef_exp10f16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_exp10f16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_expm1f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_expm1f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_log10f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_log10f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_log2f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_log2f16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_log2f16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_log2f16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_log1pf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_log1pf16_u10avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincospif16_u05avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_cinz_sincospif16_u05avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_sincospif16_u35avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_cinz_sincospif16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_sinpif16_u05avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sinpif16_u05avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cospif16_u05avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_cospif16_u05avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_fmaf16_avx512fnofma(__m512, __m512, __m512); +IMPORT CONST __m512 Sleef_cinz_fmaf16_avx512fnofma(__m512, __m512, __m512); +IMPORT CONST __m512 Sleef_sqrtf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sqrtf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_sqrtf16_u05avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sqrtf16_u05avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_sqrtf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_sqrtf16_u35avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_hypotf16_u05avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_hypotf16_u05avx512fnofma(__m512, __m512); +IMPORT CONST __m512 
Sleef_hypotf16_u35avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_hypotf16_u35avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_fabsf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_fabsf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_copysignf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_copysignf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_fmaxf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_fmaxf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_fminf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_fminf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_fdimf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_fdimf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_truncf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_truncf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_floorf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_floorf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_ceilf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_ceilf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_roundf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_roundf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_rintf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_rintf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_nextafterf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_nextafterf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_frfrexpf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_frfrexpf16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_fmodf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_fmodf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_remainderf16_avx512fnofma(__m512, __m512); +IMPORT CONST __m512 Sleef_cinz_remainderf16_avx512fnofma(__m512, __m512); +IMPORT CONST Sleef___m512_2 
Sleef_modff16_avx512fnofma(__m512); +IMPORT CONST Sleef___m512_2 Sleef_cinz_modff16_avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_lgammaf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_lgammaf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_tgammaf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_tgammaf16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_erff16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_erff16_u10avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_erfcf16_u15avx512fnofma(__m512); +IMPORT CONST __m512 Sleef_cinz_erfcf16_u15avx512fnofma(__m512); +IMPORT CONST int Sleef_getIntf16_avx512fnofma(int); +IMPORT CONST int Sleef_cinz_getIntf16_avx512fnofma(int); +IMPORT CONST void *Sleef_getPtrf16_avx512fnofma(int); +IMPORT CONST void *Sleef_cinz_getPtrf16_avx512fnofma(int); +#endif +#ifdef __STDC__ + +#ifndef Sleef_double_2_DEFINED +typedef struct { + double x, y; +} Sleef_double_2; +#define Sleef_double_2_DEFINED +#endif + +IMPORT CONST double Sleef_sind1_u35purec(double); +IMPORT CONST double Sleef_cinz_sind1_u35purec(double); +IMPORT CONST double Sleef_cosd1_u35purec(double); +IMPORT CONST double Sleef_cinz_cosd1_u35purec(double); +IMPORT CONST Sleef_double_2 Sleef_sincosd1_u35purec(double); +IMPORT CONST Sleef_double_2 Sleef_cinz_sincosd1_u35purec(double); +IMPORT CONST double Sleef_tand1_u35purec(double); +IMPORT CONST double Sleef_cinz_tand1_u35purec(double); +IMPORT CONST double Sleef_asind1_u35purec(double); +IMPORT CONST double Sleef_cinz_asind1_u35purec(double); +IMPORT CONST double Sleef_acosd1_u35purec(double); +IMPORT CONST double Sleef_cinz_acosd1_u35purec(double); +IMPORT CONST double Sleef_atand1_u35purec(double); +IMPORT CONST double Sleef_cinz_atand1_u35purec(double); +IMPORT CONST double Sleef_atan2d1_u35purec(double, double); +IMPORT CONST double Sleef_cinz_atan2d1_u35purec(double, double); +IMPORT CONST double Sleef_logd1_u35purec(double); +IMPORT CONST double 
Sleef_cinz_logd1_u35purec(double); +IMPORT CONST double Sleef_cbrtd1_u35purec(double); +IMPORT CONST double Sleef_cinz_cbrtd1_u35purec(double); +IMPORT CONST double Sleef_sind1_u10purec(double); +IMPORT CONST double Sleef_cinz_sind1_u10purec(double); +IMPORT CONST double Sleef_cosd1_u10purec(double); +IMPORT CONST double Sleef_cinz_cosd1_u10purec(double); +IMPORT CONST Sleef_double_2 Sleef_sincosd1_u10purec(double); +IMPORT CONST Sleef_double_2 Sleef_cinz_sincosd1_u10purec(double); +IMPORT CONST double Sleef_tand1_u10purec(double); +IMPORT CONST double Sleef_cinz_tand1_u10purec(double); +IMPORT CONST double Sleef_asind1_u10purec(double); +IMPORT CONST double Sleef_cinz_asind1_u10purec(double); +IMPORT CONST double Sleef_acosd1_u10purec(double); +IMPORT CONST double Sleef_cinz_acosd1_u10purec(double); +IMPORT CONST double Sleef_atand1_u10purec(double); +IMPORT CONST double Sleef_cinz_atand1_u10purec(double); +IMPORT CONST double Sleef_atan2d1_u10purec(double, double); +IMPORT CONST double Sleef_cinz_atan2d1_u10purec(double, double); +IMPORT CONST double Sleef_logd1_u10purec(double); +IMPORT CONST double Sleef_cinz_logd1_u10purec(double); +IMPORT CONST double Sleef_cbrtd1_u10purec(double); +IMPORT CONST double Sleef_cinz_cbrtd1_u10purec(double); +IMPORT CONST double Sleef_expd1_u10purec(double); +IMPORT CONST double Sleef_cinz_expd1_u10purec(double); +IMPORT CONST double Sleef_powd1_u10purec(double, double); +IMPORT CONST double Sleef_cinz_powd1_u10purec(double, double); +IMPORT CONST double Sleef_sinhd1_u10purec(double); +IMPORT CONST double Sleef_cinz_sinhd1_u10purec(double); +IMPORT CONST double Sleef_coshd1_u10purec(double); +IMPORT CONST double Sleef_cinz_coshd1_u10purec(double); +IMPORT CONST double Sleef_tanhd1_u10purec(double); +IMPORT CONST double Sleef_cinz_tanhd1_u10purec(double); +IMPORT CONST double Sleef_sinhd1_u35purec(double); +IMPORT CONST double Sleef_cinz_sinhd1_u35purec(double); +IMPORT CONST double Sleef_coshd1_u35purec(double); +IMPORT CONST 
double Sleef_cinz_coshd1_u35purec(double); +IMPORT CONST double Sleef_tanhd1_u35purec(double); +IMPORT CONST double Sleef_cinz_tanhd1_u35purec(double); +IMPORT CONST double Sleef_fastsind1_u3500purec(double); +IMPORT CONST double Sleef_cinz_fastsind1_u3500purec(double); +IMPORT CONST double Sleef_fastcosd1_u3500purec(double); +IMPORT CONST double Sleef_cinz_fastcosd1_u3500purec(double); +IMPORT CONST double Sleef_fastpowd1_u3500purec(double, double); +IMPORT CONST double Sleef_cinz_fastpowd1_u3500purec(double, double); +IMPORT CONST double Sleef_asinhd1_u10purec(double); +IMPORT CONST double Sleef_cinz_asinhd1_u10purec(double); +IMPORT CONST double Sleef_acoshd1_u10purec(double); +IMPORT CONST double Sleef_cinz_acoshd1_u10purec(double); +IMPORT CONST double Sleef_atanhd1_u10purec(double); +IMPORT CONST double Sleef_cinz_atanhd1_u10purec(double); +IMPORT CONST double Sleef_exp2d1_u10purec(double); +IMPORT CONST double Sleef_cinz_exp2d1_u10purec(double); +IMPORT CONST double Sleef_exp2d1_u35purec(double); +IMPORT CONST double Sleef_cinz_exp2d1_u35purec(double); +IMPORT CONST double Sleef_exp10d1_u10purec(double); +IMPORT CONST double Sleef_cinz_exp10d1_u10purec(double); +IMPORT CONST double Sleef_exp10d1_u35purec(double); +IMPORT CONST double Sleef_cinz_exp10d1_u35purec(double); +IMPORT CONST double Sleef_expm1d1_u10purec(double); +IMPORT CONST double Sleef_cinz_expm1d1_u10purec(double); +IMPORT CONST double Sleef_log10d1_u10purec(double); +IMPORT CONST double Sleef_cinz_log10d1_u10purec(double); +IMPORT CONST double Sleef_log2d1_u10purec(double); +IMPORT CONST double Sleef_cinz_log2d1_u10purec(double); +IMPORT CONST double Sleef_log2d1_u35purec(double); +IMPORT CONST double Sleef_cinz_log2d1_u35purec(double); +IMPORT CONST double Sleef_log1pd1_u10purec(double); +IMPORT CONST double Sleef_cinz_log1pd1_u10purec(double); +IMPORT CONST Sleef_double_2 Sleef_sincospid1_u05purec(double); +IMPORT CONST Sleef_double_2 Sleef_cinz_sincospid1_u05purec(double); +IMPORT CONST 
Sleef_double_2 Sleef_sincospid1_u35purec(double); +IMPORT CONST Sleef_double_2 Sleef_cinz_sincospid1_u35purec(double); +IMPORT CONST double Sleef_sinpid1_u05purec(double); +IMPORT CONST double Sleef_cinz_sinpid1_u05purec(double); +IMPORT CONST double Sleef_cospid1_u05purec(double); +IMPORT CONST double Sleef_cinz_cospid1_u05purec(double); +IMPORT CONST double Sleef_ldexpd1_purec(double, int32_t); +IMPORT CONST double Sleef_cinz_ldexpd1_purec(double, int32_t); +IMPORT CONST int32_t Sleef_ilogbd1_purec(double); +IMPORT CONST int32_t Sleef_cinz_ilogbd1_purec(double); +IMPORT CONST double Sleef_fmad1_purec(double, double, double); +IMPORT CONST double Sleef_cinz_fmad1_purec(double, double, double); +IMPORT CONST double Sleef_sqrtd1_purec(double); +IMPORT CONST double Sleef_cinz_sqrtd1_purec(double); +IMPORT CONST double Sleef_sqrtd1_u05purec(double); +IMPORT CONST double Sleef_cinz_sqrtd1_u05purec(double); +IMPORT CONST double Sleef_sqrtd1_u35purec(double); +IMPORT CONST double Sleef_cinz_sqrtd1_u35purec(double); +IMPORT CONST double Sleef_hypotd1_u05purec(double, double); +IMPORT CONST double Sleef_cinz_hypotd1_u05purec(double, double); +IMPORT CONST double Sleef_hypotd1_u35purec(double, double); +IMPORT CONST double Sleef_cinz_hypotd1_u35purec(double, double); +IMPORT CONST double Sleef_fabsd1_purec(double); +IMPORT CONST double Sleef_cinz_fabsd1_purec(double); +IMPORT CONST double Sleef_copysignd1_purec(double, double); +IMPORT CONST double Sleef_cinz_copysignd1_purec(double, double); +IMPORT CONST double Sleef_fmaxd1_purec(double, double); +IMPORT CONST double Sleef_cinz_fmaxd1_purec(double, double); +IMPORT CONST double Sleef_fmind1_purec(double, double); +IMPORT CONST double Sleef_cinz_fmind1_purec(double, double); +IMPORT CONST double Sleef_fdimd1_purec(double, double); +IMPORT CONST double Sleef_cinz_fdimd1_purec(double, double); +IMPORT CONST double Sleef_truncd1_purec(double); +IMPORT CONST double Sleef_cinz_truncd1_purec(double); +IMPORT CONST double 
Sleef_floord1_purec(double); +IMPORT CONST double Sleef_cinz_floord1_purec(double); +IMPORT CONST double Sleef_ceild1_purec(double); +IMPORT CONST double Sleef_cinz_ceild1_purec(double); +IMPORT CONST double Sleef_roundd1_purec(double); +IMPORT CONST double Sleef_cinz_roundd1_purec(double); +IMPORT CONST double Sleef_rintd1_purec(double); +IMPORT CONST double Sleef_cinz_rintd1_purec(double); +IMPORT CONST double Sleef_nextafterd1_purec(double, double); +IMPORT CONST double Sleef_cinz_nextafterd1_purec(double, double); +IMPORT CONST double Sleef_frfrexpd1_purec(double); +IMPORT CONST double Sleef_cinz_frfrexpd1_purec(double); +IMPORT CONST int32_t Sleef_expfrexpd1_purec(double); +IMPORT CONST int32_t Sleef_cinz_expfrexpd1_purec(double); +IMPORT CONST double Sleef_fmodd1_purec(double, double); +IMPORT CONST double Sleef_cinz_fmodd1_purec(double, double); +IMPORT CONST double Sleef_remainderd1_purec(double, double); +IMPORT CONST double Sleef_cinz_remainderd1_purec(double, double); +IMPORT CONST Sleef_double_2 Sleef_modfd1_purec(double); +IMPORT CONST Sleef_double_2 Sleef_cinz_modfd1_purec(double); +IMPORT CONST double Sleef_lgammad1_u10purec(double); +IMPORT CONST double Sleef_cinz_lgammad1_u10purec(double); +IMPORT CONST double Sleef_tgammad1_u10purec(double); +IMPORT CONST double Sleef_cinz_tgammad1_u10purec(double); +IMPORT CONST double Sleef_erfd1_u10purec(double); +IMPORT CONST double Sleef_cinz_erfd1_u10purec(double); +IMPORT CONST double Sleef_erfcd1_u15purec(double); +IMPORT CONST double Sleef_cinz_erfcd1_u15purec(double); +IMPORT CONST int Sleef_getIntd1_purec(int); +IMPORT CONST void *Sleef_getPtrd1_purec(int); + +#ifndef Sleef_float_2_DEFINED +typedef struct { + float x, y; +} Sleef_float_2; +#define Sleef_float_2_DEFINED +#endif + +IMPORT CONST float Sleef_sinf1_u35purec(float); +IMPORT CONST float Sleef_cinz_sinf1_u35purec(float); +IMPORT CONST float Sleef_cosf1_u35purec(float); +IMPORT CONST float Sleef_cinz_cosf1_u35purec(float); +IMPORT CONST 
Sleef_float_2 Sleef_sincosf1_u35purec(float); +IMPORT CONST Sleef_float_2 Sleef_cinz_sincosf1_u35purec(float); +IMPORT CONST float Sleef_tanf1_u35purec(float); +IMPORT CONST float Sleef_cinz_tanf1_u35purec(float); +IMPORT CONST float Sleef_asinf1_u35purec(float); +IMPORT CONST float Sleef_cinz_asinf1_u35purec(float); +IMPORT CONST float Sleef_acosf1_u35purec(float); +IMPORT CONST float Sleef_cinz_acosf1_u35purec(float); +IMPORT CONST float Sleef_atanf1_u35purec(float); +IMPORT CONST float Sleef_cinz_atanf1_u35purec(float); +IMPORT CONST float Sleef_atan2f1_u35purec(float, float); +IMPORT CONST float Sleef_cinz_atan2f1_u35purec(float, float); +IMPORT CONST float Sleef_logf1_u35purec(float); +IMPORT CONST float Sleef_cinz_logf1_u35purec(float); +IMPORT CONST float Sleef_cbrtf1_u35purec(float); +IMPORT CONST float Sleef_cinz_cbrtf1_u35purec(float); +IMPORT CONST float Sleef_sinf1_u10purec(float); +IMPORT CONST float Sleef_cinz_sinf1_u10purec(float); +IMPORT CONST float Sleef_cosf1_u10purec(float); +IMPORT CONST float Sleef_cinz_cosf1_u10purec(float); +IMPORT CONST Sleef_float_2 Sleef_sincosf1_u10purec(float); +IMPORT CONST Sleef_float_2 Sleef_cinz_sincosf1_u10purec(float); +IMPORT CONST float Sleef_tanf1_u10purec(float); +IMPORT CONST float Sleef_cinz_tanf1_u10purec(float); +IMPORT CONST float Sleef_asinf1_u10purec(float); +IMPORT CONST float Sleef_cinz_asinf1_u10purec(float); +IMPORT CONST float Sleef_acosf1_u10purec(float); +IMPORT CONST float Sleef_cinz_acosf1_u10purec(float); +IMPORT CONST float Sleef_atanf1_u10purec(float); +IMPORT CONST float Sleef_cinz_atanf1_u10purec(float); +IMPORT CONST float Sleef_atan2f1_u10purec(float, float); +IMPORT CONST float Sleef_cinz_atan2f1_u10purec(float, float); +IMPORT CONST float Sleef_logf1_u10purec(float); +IMPORT CONST float Sleef_cinz_logf1_u10purec(float); +IMPORT CONST float Sleef_cbrtf1_u10purec(float); +IMPORT CONST float Sleef_cinz_cbrtf1_u10purec(float); +IMPORT CONST float Sleef_expf1_u10purec(float); +IMPORT CONST 
float Sleef_cinz_expf1_u10purec(float); +IMPORT CONST float Sleef_powf1_u10purec(float, float); +IMPORT CONST float Sleef_cinz_powf1_u10purec(float, float); +IMPORT CONST float Sleef_sinhf1_u10purec(float); +IMPORT CONST float Sleef_cinz_sinhf1_u10purec(float); +IMPORT CONST float Sleef_coshf1_u10purec(float); +IMPORT CONST float Sleef_cinz_coshf1_u10purec(float); +IMPORT CONST float Sleef_tanhf1_u10purec(float); +IMPORT CONST float Sleef_cinz_tanhf1_u10purec(float); +IMPORT CONST float Sleef_sinhf1_u35purec(float); +IMPORT CONST float Sleef_cinz_sinhf1_u35purec(float); +IMPORT CONST float Sleef_coshf1_u35purec(float); +IMPORT CONST float Sleef_cinz_coshf1_u35purec(float); +IMPORT CONST float Sleef_tanhf1_u35purec(float); +IMPORT CONST float Sleef_cinz_tanhf1_u35purec(float); +IMPORT CONST float Sleef_fastsinf1_u3500purec(float); +IMPORT CONST float Sleef_cinz_fastsinf1_u3500purec(float); +IMPORT CONST float Sleef_fastcosf1_u3500purec(float); +IMPORT CONST float Sleef_cinz_fastcosf1_u3500purec(float); +IMPORT CONST float Sleef_fastpowf1_u3500purec(float, float); +IMPORT CONST float Sleef_cinz_fastpowf1_u3500purec(float, float); +IMPORT CONST float Sleef_asinhf1_u10purec(float); +IMPORT CONST float Sleef_cinz_asinhf1_u10purec(float); +IMPORT CONST float Sleef_acoshf1_u10purec(float); +IMPORT CONST float Sleef_cinz_acoshf1_u10purec(float); +IMPORT CONST float Sleef_atanhf1_u10purec(float); +IMPORT CONST float Sleef_cinz_atanhf1_u10purec(float); +IMPORT CONST float Sleef_exp2f1_u10purec(float); +IMPORT CONST float Sleef_cinz_exp2f1_u10purec(float); +IMPORT CONST float Sleef_exp2f1_u35purec(float); +IMPORT CONST float Sleef_cinz_exp2f1_u35purec(float); +IMPORT CONST float Sleef_exp10f1_u10purec(float); +IMPORT CONST float Sleef_cinz_exp10f1_u10purec(float); +IMPORT CONST float Sleef_exp10f1_u35purec(float); +IMPORT CONST float Sleef_cinz_exp10f1_u35purec(float); +IMPORT CONST float Sleef_expm1f1_u10purec(float); +IMPORT CONST float Sleef_cinz_expm1f1_u10purec(float); 
+IMPORT CONST float Sleef_log10f1_u10purec(float); +IMPORT CONST float Sleef_cinz_log10f1_u10purec(float); +IMPORT CONST float Sleef_log2f1_u10purec(float); +IMPORT CONST float Sleef_cinz_log2f1_u10purec(float); +IMPORT CONST float Sleef_log2f1_u35purec(float); +IMPORT CONST float Sleef_cinz_log2f1_u35purec(float); +IMPORT CONST float Sleef_log1pf1_u10purec(float); +IMPORT CONST float Sleef_cinz_log1pf1_u10purec(float); +IMPORT CONST Sleef_float_2 Sleef_sincospif1_u05purec(float); +IMPORT CONST Sleef_float_2 Sleef_cinz_sincospif1_u05purec(float); +IMPORT CONST Sleef_float_2 Sleef_sincospif1_u35purec(float); +IMPORT CONST Sleef_float_2 Sleef_cinz_sincospif1_u35purec(float); +IMPORT CONST float Sleef_sinpif1_u05purec(float); +IMPORT CONST float Sleef_cinz_sinpif1_u05purec(float); +IMPORT CONST float Sleef_cospif1_u05purec(float); +IMPORT CONST float Sleef_cinz_cospif1_u05purec(float); +IMPORT CONST float Sleef_fmaf1_purec(float, float, float); +IMPORT CONST float Sleef_cinz_fmaf1_purec(float, float, float); +IMPORT CONST float Sleef_sqrtf1_purec(float); +IMPORT CONST float Sleef_cinz_sqrtf1_purec(float); +IMPORT CONST float Sleef_sqrtf1_u05purec(float); +IMPORT CONST float Sleef_cinz_sqrtf1_u05purec(float); +IMPORT CONST float Sleef_sqrtf1_u35purec(float); +IMPORT CONST float Sleef_cinz_sqrtf1_u35purec(float); +IMPORT CONST float Sleef_hypotf1_u05purec(float, float); +IMPORT CONST float Sleef_cinz_hypotf1_u05purec(float, float); +IMPORT CONST float Sleef_hypotf1_u35purec(float, float); +IMPORT CONST float Sleef_cinz_hypotf1_u35purec(float, float); +IMPORT CONST float Sleef_fabsf1_purec(float); +IMPORT CONST float Sleef_cinz_fabsf1_purec(float); +IMPORT CONST float Sleef_copysignf1_purec(float, float); +IMPORT CONST float Sleef_cinz_copysignf1_purec(float, float); +IMPORT CONST float Sleef_fmaxf1_purec(float, float); +IMPORT CONST float Sleef_cinz_fmaxf1_purec(float, float); +IMPORT CONST float Sleef_fminf1_purec(float, float); +IMPORT CONST float 
Sleef_cinz_fminf1_purec(float, float); +IMPORT CONST float Sleef_fdimf1_purec(float, float); +IMPORT CONST float Sleef_cinz_fdimf1_purec(float, float); +IMPORT CONST float Sleef_truncf1_purec(float); +IMPORT CONST float Sleef_cinz_truncf1_purec(float); +IMPORT CONST float Sleef_floorf1_purec(float); +IMPORT CONST float Sleef_cinz_floorf1_purec(float); +IMPORT CONST float Sleef_ceilf1_purec(float); +IMPORT CONST float Sleef_cinz_ceilf1_purec(float); +IMPORT CONST float Sleef_roundf1_purec(float); +IMPORT CONST float Sleef_cinz_roundf1_purec(float); +IMPORT CONST float Sleef_rintf1_purec(float); +IMPORT CONST float Sleef_cinz_rintf1_purec(float); +IMPORT CONST float Sleef_nextafterf1_purec(float, float); +IMPORT CONST float Sleef_cinz_nextafterf1_purec(float, float); +IMPORT CONST float Sleef_frfrexpf1_purec(float); +IMPORT CONST float Sleef_cinz_frfrexpf1_purec(float); +IMPORT CONST float Sleef_fmodf1_purec(float, float); +IMPORT CONST float Sleef_cinz_fmodf1_purec(float, float); +IMPORT CONST float Sleef_remainderf1_purec(float, float); +IMPORT CONST float Sleef_cinz_remainderf1_purec(float, float); +IMPORT CONST Sleef_float_2 Sleef_modff1_purec(float); +IMPORT CONST Sleef_float_2 Sleef_cinz_modff1_purec(float); +IMPORT CONST float Sleef_lgammaf1_u10purec(float); +IMPORT CONST float Sleef_cinz_lgammaf1_u10purec(float); +IMPORT CONST float Sleef_tgammaf1_u10purec(float); +IMPORT CONST float Sleef_cinz_tgammaf1_u10purec(float); +IMPORT CONST float Sleef_erff1_u10purec(float); +IMPORT CONST float Sleef_cinz_erff1_u10purec(float); +IMPORT CONST float Sleef_erfcf1_u15purec(float); +IMPORT CONST float Sleef_cinz_erfcf1_u15purec(float); +IMPORT CONST int Sleef_getIntf1_purec(int); +IMPORT CONST int Sleef_cinz_getIntf1_purec(int); +IMPORT CONST void *Sleef_getPtrf1_purec(int); +IMPORT CONST void *Sleef_cinz_getPtrf1_purec(int); +#endif +#ifdef FP_FAST_FMA + +#ifndef Sleef_double_2_DEFINED +typedef struct { + double x, y; +} Sleef_double_2; +#define Sleef_double_2_DEFINED 
+#endif + +IMPORT CONST double Sleef_sind1_u35purecfma(double); +IMPORT CONST double Sleef_finz_sind1_u35purecfma(double); +IMPORT CONST double Sleef_cosd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_cosd1_u35purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_sincosd1_u35purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_finz_sincosd1_u35purecfma(double); +IMPORT CONST double Sleef_tand1_u35purecfma(double); +IMPORT CONST double Sleef_finz_tand1_u35purecfma(double); +IMPORT CONST double Sleef_asind1_u35purecfma(double); +IMPORT CONST double Sleef_finz_asind1_u35purecfma(double); +IMPORT CONST double Sleef_acosd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_acosd1_u35purecfma(double); +IMPORT CONST double Sleef_atand1_u35purecfma(double); +IMPORT CONST double Sleef_finz_atand1_u35purecfma(double); +IMPORT CONST double Sleef_atan2d1_u35purecfma(double, double); +IMPORT CONST double Sleef_finz_atan2d1_u35purecfma(double, double); +IMPORT CONST double Sleef_logd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_logd1_u35purecfma(double); +IMPORT CONST double Sleef_cbrtd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_cbrtd1_u35purecfma(double); +IMPORT CONST double Sleef_sind1_u10purecfma(double); +IMPORT CONST double Sleef_finz_sind1_u10purecfma(double); +IMPORT CONST double Sleef_cosd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_cosd1_u10purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_sincosd1_u10purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_finz_sincosd1_u10purecfma(double); +IMPORT CONST double Sleef_tand1_u10purecfma(double); +IMPORT CONST double Sleef_finz_tand1_u10purecfma(double); +IMPORT CONST double Sleef_asind1_u10purecfma(double); +IMPORT CONST double Sleef_finz_asind1_u10purecfma(double); +IMPORT CONST double Sleef_acosd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_acosd1_u10purecfma(double); +IMPORT CONST double Sleef_atand1_u10purecfma(double); +IMPORT CONST double 
Sleef_finz_atand1_u10purecfma(double); +IMPORT CONST double Sleef_atan2d1_u10purecfma(double, double); +IMPORT CONST double Sleef_finz_atan2d1_u10purecfma(double, double); +IMPORT CONST double Sleef_logd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_logd1_u10purecfma(double); +IMPORT CONST double Sleef_cbrtd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_cbrtd1_u10purecfma(double); +IMPORT CONST double Sleef_expd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_expd1_u10purecfma(double); +IMPORT CONST double Sleef_powd1_u10purecfma(double, double); +IMPORT CONST double Sleef_finz_powd1_u10purecfma(double, double); +IMPORT CONST double Sleef_sinhd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_sinhd1_u10purecfma(double); +IMPORT CONST double Sleef_coshd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_coshd1_u10purecfma(double); +IMPORT CONST double Sleef_tanhd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_tanhd1_u10purecfma(double); +IMPORT CONST double Sleef_sinhd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_sinhd1_u35purecfma(double); +IMPORT CONST double Sleef_coshd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_coshd1_u35purecfma(double); +IMPORT CONST double Sleef_tanhd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_tanhd1_u35purecfma(double); +IMPORT CONST double Sleef_fastsind1_u3500purecfma(double); +IMPORT CONST double Sleef_finz_fastsind1_u3500purecfma(double); +IMPORT CONST double Sleef_fastcosd1_u3500purecfma(double); +IMPORT CONST double Sleef_finz_fastcosd1_u3500purecfma(double); +IMPORT CONST double Sleef_fastpowd1_u3500purecfma(double, double); +IMPORT CONST double Sleef_finz_fastpowd1_u3500purecfma(double, double); +IMPORT CONST double Sleef_asinhd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_asinhd1_u10purecfma(double); +IMPORT CONST double Sleef_acoshd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_acoshd1_u10purecfma(double); +IMPORT CONST double 
Sleef_atanhd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_atanhd1_u10purecfma(double); +IMPORT CONST double Sleef_exp2d1_u10purecfma(double); +IMPORT CONST double Sleef_finz_exp2d1_u10purecfma(double); +IMPORT CONST double Sleef_exp2d1_u35purecfma(double); +IMPORT CONST double Sleef_finz_exp2d1_u35purecfma(double); +IMPORT CONST double Sleef_exp10d1_u10purecfma(double); +IMPORT CONST double Sleef_finz_exp10d1_u10purecfma(double); +IMPORT CONST double Sleef_exp10d1_u35purecfma(double); +IMPORT CONST double Sleef_finz_exp10d1_u35purecfma(double); +IMPORT CONST double Sleef_expm1d1_u10purecfma(double); +IMPORT CONST double Sleef_finz_expm1d1_u10purecfma(double); +IMPORT CONST double Sleef_log10d1_u10purecfma(double); +IMPORT CONST double Sleef_finz_log10d1_u10purecfma(double); +IMPORT CONST double Sleef_log2d1_u10purecfma(double); +IMPORT CONST double Sleef_finz_log2d1_u10purecfma(double); +IMPORT CONST double Sleef_log2d1_u35purecfma(double); +IMPORT CONST double Sleef_finz_log2d1_u35purecfma(double); +IMPORT CONST double Sleef_log1pd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_log1pd1_u10purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_sincospid1_u05purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_finz_sincospid1_u05purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_sincospid1_u35purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_finz_sincospid1_u35purecfma(double); +IMPORT CONST double Sleef_sinpid1_u05purecfma(double); +IMPORT CONST double Sleef_finz_sinpid1_u05purecfma(double); +IMPORT CONST double Sleef_cospid1_u05purecfma(double); +IMPORT CONST double Sleef_finz_cospid1_u05purecfma(double); +IMPORT CONST double Sleef_ldexpd1_purecfma(double, int32_t); +IMPORT CONST double Sleef_finz_ldexpd1_purecfma(double, int32_t); +IMPORT CONST int32_t Sleef_ilogbd1_purecfma(double); +IMPORT CONST int32_t Sleef_finz_ilogbd1_purecfma(double); +IMPORT CONST double Sleef_fmad1_purecfma(double, double, double); +IMPORT CONST double 
Sleef_finz_fmad1_purecfma(double, double, double); +IMPORT CONST double Sleef_sqrtd1_purecfma(double); +IMPORT CONST double Sleef_finz_sqrtd1_purecfma(double); +IMPORT CONST double Sleef_sqrtd1_u05purecfma(double); +IMPORT CONST double Sleef_finz_sqrtd1_u05purecfma(double); +IMPORT CONST double Sleef_sqrtd1_u35purecfma(double); +IMPORT CONST double Sleef_finz_sqrtd1_u35purecfma(double); +IMPORT CONST double Sleef_hypotd1_u05purecfma(double, double); +IMPORT CONST double Sleef_finz_hypotd1_u05purecfma(double, double); +IMPORT CONST double Sleef_hypotd1_u35purecfma(double, double); +IMPORT CONST double Sleef_finz_hypotd1_u35purecfma(double, double); +IMPORT CONST double Sleef_fabsd1_purecfma(double); +IMPORT CONST double Sleef_finz_fabsd1_purecfma(double); +IMPORT CONST double Sleef_copysignd1_purecfma(double, double); +IMPORT CONST double Sleef_finz_copysignd1_purecfma(double, double); +IMPORT CONST double Sleef_fmaxd1_purecfma(double, double); +IMPORT CONST double Sleef_finz_fmaxd1_purecfma(double, double); +IMPORT CONST double Sleef_fmind1_purecfma(double, double); +IMPORT CONST double Sleef_finz_fmind1_purecfma(double, double); +IMPORT CONST double Sleef_fdimd1_purecfma(double, double); +IMPORT CONST double Sleef_finz_fdimd1_purecfma(double, double); +IMPORT CONST double Sleef_truncd1_purecfma(double); +IMPORT CONST double Sleef_finz_truncd1_purecfma(double); +IMPORT CONST double Sleef_floord1_purecfma(double); +IMPORT CONST double Sleef_finz_floord1_purecfma(double); +IMPORT CONST double Sleef_ceild1_purecfma(double); +IMPORT CONST double Sleef_finz_ceild1_purecfma(double); +IMPORT CONST double Sleef_roundd1_purecfma(double); +IMPORT CONST double Sleef_finz_roundd1_purecfma(double); +IMPORT CONST double Sleef_rintd1_purecfma(double); +IMPORT CONST double Sleef_finz_rintd1_purecfma(double); +IMPORT CONST double Sleef_nextafterd1_purecfma(double, double); +IMPORT CONST double Sleef_finz_nextafterd1_purecfma(double, double); +IMPORT CONST double 
Sleef_frfrexpd1_purecfma(double); +IMPORT CONST double Sleef_finz_frfrexpd1_purecfma(double); +IMPORT CONST int32_t Sleef_expfrexpd1_purecfma(double); +IMPORT CONST int32_t Sleef_finz_expfrexpd1_purecfma(double); +IMPORT CONST double Sleef_fmodd1_purecfma(double, double); +IMPORT CONST double Sleef_finz_fmodd1_purecfma(double, double); +IMPORT CONST double Sleef_remainderd1_purecfma(double, double); +IMPORT CONST double Sleef_finz_remainderd1_purecfma(double, double); +IMPORT CONST Sleef_double_2 Sleef_modfd1_purecfma(double); +IMPORT CONST Sleef_double_2 Sleef_finz_modfd1_purecfma(double); +IMPORT CONST double Sleef_lgammad1_u10purecfma(double); +IMPORT CONST double Sleef_finz_lgammad1_u10purecfma(double); +IMPORT CONST double Sleef_tgammad1_u10purecfma(double); +IMPORT CONST double Sleef_finz_tgammad1_u10purecfma(double); +IMPORT CONST double Sleef_erfd1_u10purecfma(double); +IMPORT CONST double Sleef_finz_erfd1_u10purecfma(double); +IMPORT CONST double Sleef_erfcd1_u15purecfma(double); +IMPORT CONST double Sleef_finz_erfcd1_u15purecfma(double); +IMPORT CONST int Sleef_getIntd1_purecfma(int); +IMPORT CONST void *Sleef_getPtrd1_purecfma(int); + +#ifndef Sleef_float_2_DEFINED +typedef struct { + float x, y; +} Sleef_float_2; +#define Sleef_float_2_DEFINED +#endif + +IMPORT CONST float Sleef_sinf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_sinf1_u35purecfma(float); +IMPORT CONST float Sleef_cosf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_cosf1_u35purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_sincosf1_u35purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_finz_sincosf1_u35purecfma(float); +IMPORT CONST float Sleef_tanf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_tanf1_u35purecfma(float); +IMPORT CONST float Sleef_asinf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_asinf1_u35purecfma(float); +IMPORT CONST float Sleef_acosf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_acosf1_u35purecfma(float); +IMPORT CONST float 
Sleef_atanf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_atanf1_u35purecfma(float); +IMPORT CONST float Sleef_atan2f1_u35purecfma(float, float); +IMPORT CONST float Sleef_finz_atan2f1_u35purecfma(float, float); +IMPORT CONST float Sleef_logf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_logf1_u35purecfma(float); +IMPORT CONST float Sleef_cbrtf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_cbrtf1_u35purecfma(float); +IMPORT CONST float Sleef_sinf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_sinf1_u10purecfma(float); +IMPORT CONST float Sleef_cosf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_cosf1_u10purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_sincosf1_u10purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_finz_sincosf1_u10purecfma(float); +IMPORT CONST float Sleef_tanf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_tanf1_u10purecfma(float); +IMPORT CONST float Sleef_asinf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_asinf1_u10purecfma(float); +IMPORT CONST float Sleef_acosf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_acosf1_u10purecfma(float); +IMPORT CONST float Sleef_atanf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_atanf1_u10purecfma(float); +IMPORT CONST float Sleef_atan2f1_u10purecfma(float, float); +IMPORT CONST float Sleef_finz_atan2f1_u10purecfma(float, float); +IMPORT CONST float Sleef_logf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_logf1_u10purecfma(float); +IMPORT CONST float Sleef_cbrtf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_cbrtf1_u10purecfma(float); +IMPORT CONST float Sleef_expf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_expf1_u10purecfma(float); +IMPORT CONST float Sleef_powf1_u10purecfma(float, float); +IMPORT CONST float Sleef_finz_powf1_u10purecfma(float, float); +IMPORT CONST float Sleef_sinhf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_sinhf1_u10purecfma(float); +IMPORT CONST float Sleef_coshf1_u10purecfma(float); +IMPORT CONST float 
Sleef_finz_coshf1_u10purecfma(float); +IMPORT CONST float Sleef_tanhf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_tanhf1_u10purecfma(float); +IMPORT CONST float Sleef_sinhf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_sinhf1_u35purecfma(float); +IMPORT CONST float Sleef_coshf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_coshf1_u35purecfma(float); +IMPORT CONST float Sleef_tanhf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_tanhf1_u35purecfma(float); +IMPORT CONST float Sleef_fastsinf1_u3500purecfma(float); +IMPORT CONST float Sleef_finz_fastsinf1_u3500purecfma(float); +IMPORT CONST float Sleef_fastcosf1_u3500purecfma(float); +IMPORT CONST float Sleef_finz_fastcosf1_u3500purecfma(float); +IMPORT CONST float Sleef_fastpowf1_u3500purecfma(float, float); +IMPORT CONST float Sleef_finz_fastpowf1_u3500purecfma(float, float); +IMPORT CONST float Sleef_asinhf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_asinhf1_u10purecfma(float); +IMPORT CONST float Sleef_acoshf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_acoshf1_u10purecfma(float); +IMPORT CONST float Sleef_atanhf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_atanhf1_u10purecfma(float); +IMPORT CONST float Sleef_exp2f1_u10purecfma(float); +IMPORT CONST float Sleef_finz_exp2f1_u10purecfma(float); +IMPORT CONST float Sleef_exp2f1_u35purecfma(float); +IMPORT CONST float Sleef_finz_exp2f1_u35purecfma(float); +IMPORT CONST float Sleef_exp10f1_u10purecfma(float); +IMPORT CONST float Sleef_finz_exp10f1_u10purecfma(float); +IMPORT CONST float Sleef_exp10f1_u35purecfma(float); +IMPORT CONST float Sleef_finz_exp10f1_u35purecfma(float); +IMPORT CONST float Sleef_expm1f1_u10purecfma(float); +IMPORT CONST float Sleef_finz_expm1f1_u10purecfma(float); +IMPORT CONST float Sleef_log10f1_u10purecfma(float); +IMPORT CONST float Sleef_finz_log10f1_u10purecfma(float); +IMPORT CONST float Sleef_log2f1_u10purecfma(float); +IMPORT CONST float Sleef_finz_log2f1_u10purecfma(float); +IMPORT CONST 
float Sleef_log2f1_u35purecfma(float); +IMPORT CONST float Sleef_finz_log2f1_u35purecfma(float); +IMPORT CONST float Sleef_log1pf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_log1pf1_u10purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_sincospif1_u05purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_finz_sincospif1_u05purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_sincospif1_u35purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_finz_sincospif1_u35purecfma(float); +IMPORT CONST float Sleef_sinpif1_u05purecfma(float); +IMPORT CONST float Sleef_finz_sinpif1_u05purecfma(float); +IMPORT CONST float Sleef_cospif1_u05purecfma(float); +IMPORT CONST float Sleef_finz_cospif1_u05purecfma(float); +IMPORT CONST float Sleef_fmaf1_purecfma(float, float, float); +IMPORT CONST float Sleef_finz_fmaf1_purecfma(float, float, float); +IMPORT CONST float Sleef_sqrtf1_purecfma(float); +IMPORT CONST float Sleef_finz_sqrtf1_purecfma(float); +IMPORT CONST float Sleef_sqrtf1_u05purecfma(float); +IMPORT CONST float Sleef_finz_sqrtf1_u05purecfma(float); +IMPORT CONST float Sleef_sqrtf1_u35purecfma(float); +IMPORT CONST float Sleef_finz_sqrtf1_u35purecfma(float); +IMPORT CONST float Sleef_hypotf1_u05purecfma(float, float); +IMPORT CONST float Sleef_finz_hypotf1_u05purecfma(float, float); +IMPORT CONST float Sleef_hypotf1_u35purecfma(float, float); +IMPORT CONST float Sleef_finz_hypotf1_u35purecfma(float, float); +IMPORT CONST float Sleef_fabsf1_purecfma(float); +IMPORT CONST float Sleef_finz_fabsf1_purecfma(float); +IMPORT CONST float Sleef_copysignf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_copysignf1_purecfma(float, float); +IMPORT CONST float Sleef_fmaxf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_fmaxf1_purecfma(float, float); +IMPORT CONST float Sleef_fminf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_fminf1_purecfma(float, float); +IMPORT CONST float Sleef_fdimf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_fdimf1_purecfma(float, 
float); +IMPORT CONST float Sleef_truncf1_purecfma(float); +IMPORT CONST float Sleef_finz_truncf1_purecfma(float); +IMPORT CONST float Sleef_floorf1_purecfma(float); +IMPORT CONST float Sleef_finz_floorf1_purecfma(float); +IMPORT CONST float Sleef_ceilf1_purecfma(float); +IMPORT CONST float Sleef_finz_ceilf1_purecfma(float); +IMPORT CONST float Sleef_roundf1_purecfma(float); +IMPORT CONST float Sleef_finz_roundf1_purecfma(float); +IMPORT CONST float Sleef_rintf1_purecfma(float); +IMPORT CONST float Sleef_finz_rintf1_purecfma(float); +IMPORT CONST float Sleef_nextafterf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_nextafterf1_purecfma(float, float); +IMPORT CONST float Sleef_frfrexpf1_purecfma(float); +IMPORT CONST float Sleef_finz_frfrexpf1_purecfma(float); +IMPORT CONST float Sleef_fmodf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_fmodf1_purecfma(float, float); +IMPORT CONST float Sleef_remainderf1_purecfma(float, float); +IMPORT CONST float Sleef_finz_remainderf1_purecfma(float, float); +IMPORT CONST Sleef_float_2 Sleef_modff1_purecfma(float); +IMPORT CONST Sleef_float_2 Sleef_finz_modff1_purecfma(float); +IMPORT CONST float Sleef_lgammaf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_lgammaf1_u10purecfma(float); +IMPORT CONST float Sleef_tgammaf1_u10purecfma(float); +IMPORT CONST float Sleef_finz_tgammaf1_u10purecfma(float); +IMPORT CONST float Sleef_erff1_u10purecfma(float); +IMPORT CONST float Sleef_finz_erff1_u10purecfma(float); +IMPORT CONST float Sleef_erfcf1_u15purecfma(float); +IMPORT CONST float Sleef_finz_erfcf1_u15purecfma(float); +IMPORT CONST int Sleef_getIntf1_purecfma(int); +IMPORT CONST int Sleef_finz_getIntf1_purecfma(int); +IMPORT CONST void *Sleef_getPtrf1_purecfma(int); +IMPORT CONST void *Sleef_finz_getPtrf1_purecfma(int); +#endif +#ifdef __cplusplus +} +#endif + +#undef IMPORT +#endif // #ifndef __SLEEF_H__